diff --git a/CMakeLists.txt b/CMakeLists.txt
index c649aafeddaf9f28c213d086236c3779d3137d92..23bbe829ac16180088bfa37df66e23f19b021ea3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -179,6 +179,7 @@ set(EXTERNAL_LIBS
 
 if(WITH_GPU)
     include(cuda)
+    include(tensorrt)
 endif(WITH_GPU)
 
 if(WITH_AMD_GPU)
diff --git a/Dockerfile b/Dockerfile
index fbec88c7966d6ea93495519843d6cda63f622661..870304a6acc99e715dffbfabd8058be000b6872c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,6 +1,6 @@
 # A image for building paddle binaries
 # Use cuda devel base image for both cpu and gpu environment
-FROM nvidia/cuda:8.0-cudnn5-devel-ubuntu16.04
+FROM nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04
 MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
 
 ARG UBUNTU_MIRROR
@@ -45,6 +45,13 @@ ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
 # install glide
 RUN curl -s -q https://glide.sh/get | sh
 
+# Install TensorRT
+# The unnecessary files has been removed to make the library small. It only contains include and lib now.
+RUN wget -qO- http://paddlepaddledeps.bj.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \
+    tar -xz -C /usr/local && \
+    cp -rf /usr/local/TensorRT/include /usr && \
+    cp -rf /usr/local/TensorRT/lib /usr
+
 # git credential to skip password typing
 RUN git config --global credential.helper store
 
@@ -57,7 +64,7 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8
 # specify sphinx version as 1.5.6 and remove -U option for [pip install -U
 # sphinx-rtd-theme] since -U option will cause sphinx being updated to newest
 # version(1.7.1 for now), which causes building documentation failed.
-RUN pip install --upgrade pip && \
+RUN pip install --upgrade pip==9.0.3 && \
     pip install -U wheel && \
     pip install -U docopt PyYAML sphinx==1.5.6 && \
     pip install sphinx-rtd-theme==0.1.9 recommonmark
diff --git a/Dockerfile.android b/Dockerfile.android
index cc022d596b4b74dd1e4f4d0901dd81c91a7decd1..848a7eba6f1421432addae8acff407b611adb4ae 100644
--- a/Dockerfile.android
+++ b/Dockerfile.android
@@ -27,7 +27,7 @@ RUN git config --global credential.helper store
 # Fix locales to en_US.UTF-8
 RUN localedef -i en_US -f UTF-8 en_US.UTF-8
 
-RUN pip install --upgrade pip && \
+RUN pip install --upgrade pip==9.0.3 && \
     pip install -U 'protobuf==3.1.0' && \
     pip install -U wheel sphinx && \
     pip install pre-commit
diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake
index 52a22c1fbf4779fa3c0ca687cab664bd3ca0410a..e3b9d94215a858c5c9a34e1b7e97540f1876801d 100644
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -78,7 +78,7 @@ if(NOT CMAKE_CROSSCOMPILING)
     /usr/lib/reference/
   )
 else()
-  # Diable the finding of reference cblas under host's system path
+  # Disable the finding of reference cblas under host's system path
   set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/include)
   set(REFERENCE_CBLAS_LIB_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/lib)
 endif()
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index f726405c4773994f6ca6509e5218750805b03995..e490397cc0624c310949a4b571bd00cac6e8953b 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -80,6 +80,16 @@ if(WITH_GPU)
     # Include cuda and cudnn
     include_directories(${CUDNN_INCLUDE_DIR})
     include_directories(${CUDA_TOOLKIT_INCLUDE})
+
+    if(TENSORRT_FOUND)
+        if(${CUDA_VERSION_MAJOR} VERSION_LESS 8)
+            message(FATAL_ERROR "TensorRT needs CUDA >= 8.0 to compile")
+        endif()
+        if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
+            message(FATAL_ERROR "TensorRT needs CUDNN >= 7.0 to compile")
+        endif()
+        include_directories(${TENSORRT_INCLUDE_DIR})
+    endif()
 elseif(WITH_AMD_GPU)
     add_definitions(-DPADDLE_WITH_HIP)
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__")
diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake
index aa249159470773241e0f6da2e8e086264634dd4a..e90948782bb5e333bbdb47ef9d61c1e37e3cf9e4 100644
--- a/cmake/external/grpc.cmake
+++ b/cmake/external/grpc.cmake
@@ -33,7 +33,7 @@ ExternalProject_Add(
     extern_grpc
     DEPENDS protobuf zlib
     GIT_REPOSITORY "https://github.com/grpc/grpc.git"
-    GIT_TAG "v1.11.x"
+    GIT_TAG "v1.10.x"
     PREFIX          ${GRPC_SOURCES_DIR}
     UPDATE_COMMAND  ""
     CONFIGURE_COMMAND ""
diff --git a/cmake/tensorrt.cmake b/cmake/tensorrt.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..0c07d36bed65400164853b99f18ec0335341cd94
--- /dev/null
+++ b/cmake/tensorrt.cmake
@@ -0,0 +1,33 @@
+if(NOT WITH_GPU)
+    return()
+endif()
+
+set(TENSORRT_ROOT "/usr" CACHE PATH "TENSORRT ROOT")
+find_path(TENSORRT_INCLUDE_DIR NvInfer.h
+    PATHS ${TENSORRT_ROOT} ${TENSORRT_ROOT}/include
+    $ENV{TENSORRT_ROOT} $ENV{TENSORRT_ROOT}/include
+    NO_DEFAULT_PATH
+)
+
+find_library(TENSORRT_LIBRARY NAMES libnvinfer.so libnvinfer.a
+    PATHS ${TENSORRT_ROOT} ${TENSORRT_ROOT}/lib
+    $ENV{TENSORRT_ROOT} $ENV{TENSORRT_ROOT}/lib
+    NO_DEFAULT_PATH
+    DOC "Path to TensorRT library.")
+
+if(TENSORRT_INCLUDE_DIR AND TENSORRT_LIBRARY)
+    set(TENSORRT_FOUND ON)
+else()
+    set(TENSORRT_FOUND OFF)
+endif()
+
+if(TENSORRT_FOUND)
+    file(READ ${TENSORRT_INCLUDE_DIR}/NvInfer.h TENSORRT_VERSION_FILE_CONTENTS)
+    string(REGEX MATCH "define NV_TENSORRT_MAJOR +([0-9]+)" TENSORRT_MAJOR_VERSION
+        "${TENSORRT_VERSION_FILE_CONTENTS}")
+    string(REGEX REPLACE "define NV_TENSORRT_MAJOR +([0-9]+)" "\\1"
+        TENSORRT_MAJOR_VERSION "${TENSORRT_MAJOR_VERSION}")
+
+    message(STATUS "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. "
+        "Current TensorRT version is v${TENSORRT_MAJOR_VERSION}. ")
+endif()
diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt
index 7066637a7cb27b83724cb4030c29a1019981f52b..0f9521616952a2857222feab8c38fb480761ee2d 100644
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@@ -3,7 +3,9 @@ add_custom_target(paddle_apis ALL
 
 add_custom_target(paddle_docs ALL
                   DEPENDS paddle_v2_docs paddle_v2_docs_cn
-                  paddle_fluid_docs paddle_fluid_docs_cn)
+                  paddle_fluid_docs paddle_fluid_docs_cn
+                  paddle_mobile_docs paddle_mobile_docs_cn)
 
 add_subdirectory(v2)
 add_subdirectory(fluid)
+add_subdirectory(mobile)
diff --git a/doc/fluid/api/layers.rst b/doc/fluid/api/layers.rst
index 22e6fb13d7320986a60bc1ef5530187e0970c767..5c02886efd7d11e9520910526fb90ec01e123bae 100644
--- a/doc/fluid/api/layers.rst
+++ b/doc/fluid/api/layers.rst
@@ -473,6 +473,12 @@ multiplex
 ..  autofunction:: paddle.fluid.layers.multiplex
     :noindex:
 
+label_smooth
+------------
+
+..  autofunction:: paddle.fluid.layers.label_smooth
+    :noindex:
+
 ops
 ===
 
diff --git a/doc/fluid/design/concepts/parallel_executor.md b/doc/fluid/design/concepts/parallel_executor.md
index 9aed3b059a1595ba3971d7d5acfc0d16a731584b..4f88e27bed722e9f2f535e368926fe49b4e72e56 100644
--- a/doc/fluid/design/concepts/parallel_executor.md
+++ b/doc/fluid/design/concepts/parallel_executor.md
@@ -84,7 +84,7 @@ Running an operator can be asynchronized. There is a thread pool to execute an `
 
 ## Synchronize GPU Kernels
 
-The GPU is a non-blocking device. The different streams need be synchronized when switing streams. In current implementation, the synchronization based on the following algorithm:
+The GPU is a non-blocking device. The different streams need be synchronized when switching streams. In current implementation, the synchronization based on the following algorithm:
 
 1. `OpHandle` will record `DeviceContext` that it is used.
 2. In `OpHandle::Run`, if the `DeviceContext` of current operator is different from `DeviceContext` of any input variable, just wait the generate operator of this input variable.
diff --git a/doc/fluid/design/dist_train/README.md b/doc/fluid/design/dist_train/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..2dd652d8bdcb8f3b6e759347bd55b217be909386
--- /dev/null
+++ b/doc/fluid/design/dist_train/README.md
@@ -0,0 +1,57 @@
+## Distributed training overview doc
+
+Currently Paddle Fluid use parameter server architecture to support distributed training.
+
+For synchronous and asynchronous training, the differences are mostly in the logic of parameter server. Now we have already support synchronous training.
+
+### Synchronous training
+
+The training process of synchronous training is:
+
+![synchronous distributed training](./src/sync_distributed_training.png)
+
+1. Pserver
+	1. set `barrier_condition_` to 0 and waits for trainers to send gradient.
+1. Trainer
+	1. Trainer read minibatch of data, run forward-backward with local parameter copy and get the gradients for parameters.
+	1. Trainer use split op to split all the gradient into blocks. The split method is determined at compile time.
+	1. Trainer use send_op to send all the split gradients to corresponding parameter server.
+	1. After trainer send all the gradients, it will send a `BATCH_BARRIER_MESSAGE` to all pservers.
+	1. Trainer call GetVariable to pserver and wait for `barrier_condition_` on pserver to be 1.
+1. Pserver
+   1. Pserver will count the number of `BATCH_BARRIER_MESSAGE`.
+	1. When the count of `BATCH_BARRIER_MESSAGE` is equal to the number of Trainer. Pserver thinks it received all gradient from all trainers.
+	1. Pserver will run the optimization block to optimize the parameters.
+	1. After optimization, pserver set `barrier_condition_` to 1.
+	1. Pserver wait for `FETCH_BARRIER_MESSAGE`.
+1. Trainer.
+	1. The trainer uses GetVariable to get all the parameters from pserver.
+	1. Trainer sends a `FETCH_BARRIER_MESSAGE` to each pserver.
+1. Pserver.
+	1. when the number of `FETCH_BARRIER_MESSAGE` reach the number of all trainers. Pserver think all the parameters have been got. it will go back to 1. to set `barrier_condition_` to 0.
+
+### Asynchronous training
+In the above process. There are two barriers for all trainers to synchronize with each other. In asynchronous training, these two barriers are not needed. The trainer can just send gradients to pserver and then get parameters back.
+
+The training process of asynchronous training can be:
+
+![asynchronous distributed training](./src/async_distributed_training.png)
+
+1. Pserver:
+	1. Each parameter has a queue to receive its gradient from trainers.
+	1. Each parameter has a thread to read data from the queue and run optimize block, using the gradient to optimize the parameter.
+	1. Using an independent thread to handle RPC call `GetVariable` for trainers to get parameters back.(Maybe here we should use a thread pool to speed up fetching the parameters.)
+
+1. Trainer:
+	1. Trainer read a batch of data. Run forward and backward with local parameter copy and get the gradients for parameters.
+	1. Trainer split all gradients to blocks and then send these gradient blocks to pservers(pserver will put them into the queue).
+	2. Trainer gets all parameters back from pserver.
+
+### Note:
+There are also some conditions that need to consider. For exmaple:
+
+1. If trainer needs to wait for the pserver to apply it's gradient and then get back the parameters back.
+1. If we need a lock between parameter update and parameter fetch.
+1. If one parameter must be on one server, or it can also be split and send to multiple parameter servers.
+
+The above architecture of asynchronous training can support different mode, we can have a detailed test in the future for these problems.
diff --git a/doc/fluid/design/dist_train/async_update.md b/doc/fluid/design/dist_train/async_update.md
new file mode 100644
index 0000000000000000000000000000000000000000..6a0835b761b69030ba30697e6e8863928efbf57f
--- /dev/null
+++ b/doc/fluid/design/dist_train/async_update.md
@@ -0,0 +1,58 @@
+# Design Doc: Asynchronous Update With Distributed Training
+
+## Background
+
+For the typical synchronous distributed training, some significant steps are as follows:
+
+1. A Trainer will compute the gradients and SEND them to the Parameter Server(PServer) nodes.
+1. After the PServer node received gradients came from all the Trainers, It will aggregate the
+gradient variables for the same parameter into one gradient variable and then apply the aggregated
+gradient to the respective parameter, finally using an optimize algorithms(SGD, Monument...)
+to update the parameters.
+1. The Trainer would wait for the PServers finished the optimize stage, and GET the parameters from PServer,
+so all the Trainers would get the same parameters.
+
+In the synchronously distributed training, there should be a `Barrier` to synchronise the
+parameters after the optimizing stage. The performance of a distributed training job would
+depend on the slowest node if there were hundreds or thousands of training nodes in a
+Job, the performance of synchronously distributed training might be very poor because of
+the slow node. So this design doc would introduce an approach to implement
+*asynchronously* distributed training in PaddlePaddle Fluid.
+
+## Design
+
+<img src="./src/async_update.png" width="600"/>
+
+As the figure above, we describe a global view of asynchronously update process and use
+the parameter `w1` as an example to introduce the steps:
+1. For each gradient variables, they may distribute on different GPU card and aggregate
+them while they are all calculated.
+1. Split the gradient variable into multiple blocks according to the number of PServer
+instances and then send them.
+1. PServer would run an `Optimize Block` using a specified optimize algorithm to update
+the specified parameter.
+1. The trainer will fetch latest parameter from PServer before running forward Op which depends
+on the specified parameter.
+1. Broadcast the received variable into multiple GPU cards and continue to run the next
+mini-batch.
+
+### Trainer
+
+- For the multiple devices distributed training, we need to aggregate the gradient
+variables which placed on different devices firstly and then schedule a `SendVars` Operator to
+send the gradient variables to the multiple PServer instances.
+- Schedule `FetchVars` operator to fetch the latest parameter from PServer before running
+the forward ops.
+- There could be a large number of gradient variables to be sent, so we need to use another
+thread pool(IO Threadpool) whose a number of the schedulable threads is larger than the
+computing thread pool to avoid competitive the thread resources with computing.
+
+### Parameter Server
+
+<img src="./src/async_pserver.png" width="750"/>
+
+- There should be multiple trainer instances want to optimize the same parameter at
+the same time, to avoid the racing, we need one `BlockingQueue` for each gradient
+variable to process them one by one.
+- We need a `Map` structure to map a gradient variable name to the `OptimizeBlock` which
+can optimize the respective parameter.
diff --git a/doc/fluid/design/dist_train/src/async_distributed_training.png b/doc/fluid/design/dist_train/src/async_distributed_training.png
new file mode 100644
index 0000000000000000000000000000000000000000..3b53ab59c0cd7b44b2956f16f1adc47fe85909d3
Binary files /dev/null and b/doc/fluid/design/dist_train/src/async_distributed_training.png differ
diff --git a/doc/fluid/design/dist_train/src/async_pserver.graffle b/doc/fluid/design/dist_train/src/async_pserver.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..d2301611774fcb3866473e3e6470568d1e1312cf
Binary files /dev/null and b/doc/fluid/design/dist_train/src/async_pserver.graffle differ
diff --git a/doc/fluid/design/dist_train/src/async_pserver.png b/doc/fluid/design/dist_train/src/async_pserver.png
new file mode 100644
index 0000000000000000000000000000000000000000..7d900b0c0eb291c67537b9cf93227c671bafdc73
Binary files /dev/null and b/doc/fluid/design/dist_train/src/async_pserver.png differ
diff --git a/doc/fluid/design/dist_train/src/async_update.graffle b/doc/fluid/design/dist_train/src/async_update.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..3a631888688a0d564a873fcb16d943958c91223e
Binary files /dev/null and b/doc/fluid/design/dist_train/src/async_update.graffle differ
diff --git a/doc/fluid/design/dist_train/src/async_update.png b/doc/fluid/design/dist_train/src/async_update.png
new file mode 100644
index 0000000000000000000000000000000000000000..3e8db973f45d6d9ac8dcce1dc7878067e79e6dcc
Binary files /dev/null and b/doc/fluid/design/dist_train/src/async_update.png differ
diff --git a/doc/fluid/design/dist_train/src/distributed_training.graffle b/doc/fluid/design/dist_train/src/distributed_training.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..1168801bc1fadfce310a74cb3110695bd1629f6b
Binary files /dev/null and b/doc/fluid/design/dist_train/src/distributed_training.graffle differ
diff --git a/doc/fluid/design/dist_train/src/sync_distributed_training.png b/doc/fluid/design/dist_train/src/sync_distributed_training.png
new file mode 100644
index 0000000000000000000000000000000000000000..e4f9a221fea4b7238e8a1d84e609c0371f6ef7a2
Binary files /dev/null and b/doc/fluid/design/dist_train/src/sync_distributed_training.png differ
diff --git a/doc/fluid/dev/index_cn.rst b/doc/fluid/dev/index_cn.rst
index b123b756e2251c38f319e1aefa2cb04fd7a36b03..ad798003f560e7fb0e6db6083fdd152fd3417584 100644
--- a/doc/fluid/dev/index_cn.rst
+++ b/doc/fluid/dev/index_cn.rst
@@ -4,6 +4,7 @@
 .. toctree::
   :maxdepth: 1
 
+  api_doc_std_cn.md
   new_op_cn.md
   new_op_kernel.md
   use_eigen_cn.md
diff --git a/doc/fluid/dev/index_en.rst b/doc/fluid/dev/index_en.rst
index 98988fc22dcedecdbcd67fb3bf761377bf046337..80c899a82fa452c5cd8f38dad89c15d3041b09e3 100644
--- a/doc/fluid/dev/index_en.rst
+++ b/doc/fluid/dev/index_en.rst
@@ -4,6 +4,7 @@ Development
 .. toctree::
   :maxdepth: 1
 
+  api_doc_std_en.md
   new_op_en.md
   new_op_kernel.md
   use_eigen_en.md
diff --git a/doc/mobile/CMakeLists.txt b/doc/mobile/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b104a6318d474d6531670b8ac3569448774850c7
--- /dev/null
+++ b/doc/mobile/CMakeLists.txt
@@ -0,0 +1,53 @@
+if(NOT DEFINED SPHINX_THEME)
+    set(SPHINX_THEME default)
+endif()
+
+if(NOT DEFINED SPHINX_THEME_DIR)
+    set(SPHINX_THEME_DIR)
+endif()
+
+# configured documentation tools and intermediate build results
+set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build")
+
+# Sphinx cache with pickled ReST documents
+set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
+
+# HTML output director
+set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
+
+configure_file(
+        "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in"
+        "${BINARY_BUILD_DIR_EN}/conf.py"
+        @ONLY)
+
+sphinx_add_target(paddle_mobile_docs
+        html
+        ${BINARY_BUILD_DIR_EN}
+        ${SPHINX_CACHE_DIR_EN}
+        ${CMAKE_CURRENT_SOURCE_DIR}
+        ${SPHINX_HTML_DIR_EN})
+
+add_dependencies(paddle_mobile_docs gen_proto_py paddle_python)
+
+# configured documentation tools and intermediate build results
+set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
+
+# Sphinx cache with pickled ReST documents
+set(SPHINX_CACHE_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_doctrees")
+
+# HTML output director
+set(SPHINX_HTML_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/html")
+
+configure_file(
+        "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.cn.in"
+        "${BINARY_BUILD_DIR_CN}/conf.py"
+        @ONLY)
+
+sphinx_add_target(paddle_mobile_docs_cn
+        html
+        ${BINARY_BUILD_DIR_CN}
+        ${SPHINX_CACHE_DIR_CN}
+        ${CMAKE_CURRENT_SOURCE_DIR}
+        ${SPHINX_HTML_DIR_CN})
+
+add_dependencies(paddle_mobile_docs_cn gen_proto_py paddle_python)
diff --git a/doc/mobile/index_cn.rst b/doc/mobile/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..8297316e8fbb2b8f41954030293feadbcd81295e
--- /dev/null
+++ b/doc/mobile/index_cn.rst
@@ -0,0 +1,9 @@
+移动端
+=====
+
+..  toctree::
+  :maxdepth: 1
+
+  cross_compiling_for_android_cn.md
+  cross_compiling_for_ios_cn.md
+  cross_compiling_for_raspberry_cn.md
\ No newline at end of file
diff --git a/doc/mobile/index_en.rst b/doc/mobile/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e0acdff0284e3bc84b2cc4a34a142ee01754f940
--- /dev/null
+++ b/doc/mobile/index_en.rst
@@ -0,0 +1,9 @@
+Mobile
+======
+
+..  toctree::
+  :maxdepth: 1
+
+  cross_compiling_for_android_en.md
+  cross_compiling_for_ios_en.md
+  cross_compiling_for_raspberry_en.md
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index 85b649b2937f6a281b9ee1fe7bae8101169f6102..897e41f79f4e3bb9cecbe7b42fc6c4fd3401d839 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -1,5 +1,5 @@
 cc_library(var_handle SRCS var_handle.cc DEPS place)
-cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context)
+cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context lod_tensor)
 cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
 cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
 nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
@@ -20,3 +20,11 @@ cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS
 cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto)
 cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
         simple_threadpool device_context)
+
+cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory)
+cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory)
+
+cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
+        device_context broadcast_op_handle)
+cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
+        device_context gather_op_handle)
diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7d29012380e1b1710704d71a28d21dcc3097eb51
--- /dev/null
+++ b/paddle/fluid/framework/details/broadcast_op_handle.cc
@@ -0,0 +1,111 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/broadcast_op_handle.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+Tensor *GetTensorFromVar(Variable *in_var) {
+  if (in_var->IsType<LoDTensor>()) {
+    return in_var->GetMutable<LoDTensor>();
+  } else if (in_var->IsType<SelectedRows>()) {
+    return in_var->GetMutable<SelectedRows>()->mutable_value();
+  } else {
+    PADDLE_THROW("Var should be LoDTensor or SelectedRows");
+  }
+  return nullptr;
+}
+
+BroadcastOpHandle::BroadcastOpHandle(const std::vector<Scope *> &local_scopes,
+                                     const std::vector<platform::Place> &places)
+    : local_scopes_(local_scopes), places_(places) {}
+
+void BroadcastOpHandle::RunImpl() {
+  // the input may have dummy var.
+  std::vector<VarHandle *> in_var_handle;
+  for (auto *in : inputs_) {
+    auto *out_handle = dynamic_cast<VarHandle *>(in);
+    if (out_handle) {
+      in_var_handle.push_back(out_handle);
+    }
+  }
+  PADDLE_ENFORCE_EQ(in_var_handle.size(), 1,
+                    "The number of input should be one.");
+
+  // the output may have dummy var.
+  std::vector<VarHandle *> out_var_handles;
+  for (auto *out : outputs_) {
+    auto *out_handle = dynamic_cast<VarHandle *>(out);
+    if (out_handle) {
+      out_var_handles.push_back(out_handle);
+    }
+  }
+
+  PADDLE_ENFORCE_EQ(
+      out_var_handles.size(), places_.size(),
+      "The number of output should equal to the number of places.");
+
+  // Wait input done, this Wait is asynchronous operation
+  auto &in_place = in_var_handle[0]->place_;
+  if (in_var_handle[0]->generated_op_) {
+    for (auto *out : out_var_handles) {
+      auto &out_p = out->place_;
+      in_var_handle[0]->generated_op_->Wait(dev_ctxes_[out_p]);
+    }
+  }
+
+  //
+  auto in_scope_idx = in_var_handle[0]->scope_idx_;
+  auto in_var =
+      local_scopes_.at(in_scope_idx)->FindVar(in_var_handle[0]->name_);
+  Tensor *in_tensor = GetTensorFromVar(in_var);
+
+  for (auto *out : out_var_handles) {
+    auto &out_p = out->place_;
+    auto out_var = local_scopes_.at(out->scope_idx_)->FindVar(out->name_);
+
+    PADDLE_ENFORCE_EQ(out_p.which(), in_place.which(),
+                      "Places must be all on CPU or all on CUDA.");
+
+    if (in_var->IsType<framework::SelectedRows>()) {
+      auto &in_sr = in_var->Get<framework::SelectedRows>();
+      auto out_sr = out_var->GetMutable<framework::SelectedRows>();
+      if (&in_sr == out_sr) continue;
+      out_sr->set_height(in_sr.height());
+      out_sr->set_rows(in_sr.rows());
+      out_sr->mutable_value()->Resize(in_sr.value().dims());
+      out_sr->mutable_value()->mutable_data(out_p, in_sr.value().type());
+    } else if (in_var->IsType<framework::LoDTensor>()) {
+      auto in_lod = in_var->Get<framework::LoDTensor>();
+      auto out_lod = out_var->GetMutable<framework::LoDTensor>();
+      if (&in_lod == out_lod) continue;
+      out_lod->set_lod(in_lod.lod());
+      out_lod->Resize(in_lod.dims());
+      out_lod->mutable_data(out_p, in_lod.type());
+    } else {
+      PADDLE_THROW("Var should be LoDTensor or SelectedRows.");
+    }
+
+    Tensor *out_tensor = GetTensorFromVar(out_var);
+    paddle::framework::TensorCopy(*in_tensor, out_p, *(dev_ctxes_[in_place]),
+                                  out_tensor);
+  }
+}
+
+std::string BroadcastOpHandle::Name() const { return "broadcast"; }
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/broadcast_op_handle.h b/paddle/fluid/framework/details/broadcast_op_handle.h
new file mode 100644
index 0000000000000000000000000000000000000000..b3292422522b64a38a50f39f04e6f0d2e15492dd
--- /dev/null
+++ b/paddle/fluid/framework/details/broadcast_op_handle.h
@@ -0,0 +1,48 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+struct BroadcastOpHandle : public OpHandleBase {
+  const std::vector<Scope *> &local_scopes_;
+  const std::vector<platform::Place> &places_;
+
+  BroadcastOpHandle(const std::vector<Scope *> &local_scopes,
+                    const std::vector<platform::Place> &places);
+
+  std::string Name() const override;
+
+  bool IsMultiDeviceTransfer() override { return false; };
+
+ protected:
+  void RunImpl() override;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.cc b/paddle/fluid/framework/details/broadcast_op_handle_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..dfc52b012f8b6bf5cf1a3feab90dc1ec7842ad6c
--- /dev/null
+++ b/paddle/fluid/framework/details/broadcast_op_handle_test.cc
@@ -0,0 +1,231 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/broadcast_op_handle.h"
+#include "gtest/gtest.h"
+
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+namespace f = paddle::framework;
+namespace p = paddle::platform;
+
+// test data amount
+const f::DDim kDims = {20, 20};
+
+struct TestBroadcastOpHandle {
+  std::vector<std::unique_ptr<p::DeviceContext>> ctxs_;
+  std::vector<Scope*> local_scopes_;
+  Scope g_scope_;
+  std::unique_ptr<OpHandleBase> op_handle_;
+  std::vector<std::unique_ptr<VarHandleBase>> vars_;
+  std::vector<p::Place> gpu_list_;
+
+  void WaitAll() {
+    for (size_t j = 0; j < ctxs_.size(); ++j) {
+      ctxs_[j]->Wait();
+    }
+  }
+
+  void InitCtxOnGpu(bool use_gpu) {
+    if (use_gpu) {
+#ifdef PADDLE_WITH_CUDA
+      int count = p::GetCUDADeviceCount();
+      if (count <= 1) {
+        LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
+                        "device count is "
+                     << count;
+        exit(0);
+      }
+      for (int i = 0; i < count; ++i) {
+        auto p = p::CUDAPlace(i);
+        gpu_list_.push_back(p);
+        ctxs_.emplace_back(new p::CUDADeviceContext(p));
+      }
+#else
+      PADDLE_THROW("CUDA is not support.");
+#endif
+    } else {
+      int count = 8;
+      for (int i = 0; i < count; ++i) {
+        auto p = p::CPUPlace();
+        gpu_list_.push_back(p);
+        ctxs_.emplace_back(new p::CPUDeviceContext(p));
+      }
+    }
+  }
+
+  void InitBroadcastOp(size_t input_scope_idx) {
+    for (size_t j = 0; j < gpu_list_.size(); ++j) {
+      local_scopes_.push_back(&(g_scope_.NewScope()));
+      local_scopes_[j]->Var("out");
+    }
+    local_scopes_[input_scope_idx]->Var("input");
+
+    op_handle_.reset(new BroadcastOpHandle(local_scopes_, gpu_list_));
+
+    vars_.emplace_back(new VarHandle());
+    VarHandle* in_var_handle = static_cast<VarHandle*>(vars_.back().get());
+    in_var_handle->place_ = gpu_list_[input_scope_idx];
+    in_var_handle->name_ = "input";
+    in_var_handle->version_ = 1;
+    in_var_handle->scope_idx_ = input_scope_idx;
+    in_var_handle->generated_op_ = nullptr;
+    op_handle_->AddInput(in_var_handle);
+
+    // add dummy var
+    vars_.emplace_back(new DummyVarHandle());
+    DummyVarHandle* dummy_var_handle =
+        static_cast<DummyVarHandle*>(vars_.back().get());
+    dummy_var_handle->generated_op_ = nullptr;
+    op_handle_->AddInput(dummy_var_handle);
+
+    for (size_t j = 0; j < gpu_list_.size(); ++j) {
+      op_handle_->dev_ctxes_[gpu_list_[j]] = ctxs_[j].get();
+      vars_.emplace_back(new VarHandle());
+      VarHandle* out_var_handle = static_cast<VarHandle*>(vars_.back().get());
+      out_var_handle->place_ = gpu_list_[j];
+      out_var_handle->name_ = "out";
+      out_var_handle->version_ = 2;
+      out_var_handle->scope_idx_ = j;
+      op_handle_->AddOutput(out_var_handle);
+    }
+
+    // add dummy var
+    vars_.emplace_back(new DummyVarHandle());
+    DummyVarHandle* out_dummy_var_handle =
+        static_cast<DummyVarHandle*>(vars_.back().get());
+    out_dummy_var_handle->generated_op_ = nullptr;
+    op_handle_->AddOutput(out_dummy_var_handle);
+  }
+
+  void TestBroadcastLodTensor(size_t input_scope_idx) {
+    auto in_var = local_scopes_[input_scope_idx]->Var("input");
+    auto in_lod_tensor = in_var->GetMutable<f::LoDTensor>();
+    in_lod_tensor->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
+
+    std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
+    for (size_t k = 0; k < send_vector.size(); ++k) {
+      send_vector[k] = k;
+    }
+    f::LoD lod{{0, 10, 20}};
+    paddle::framework::TensorFromVector<float>(
+        send_vector, *(ctxs_[input_scope_idx]), in_lod_tensor);
+    in_lod_tensor->set_lod(lod);
+
+    op_handle_->Run(false);
+
+    WaitAll();
+
+    p::CPUPlace cpu_place;
+    for (size_t j = 0; j < gpu_list_.size(); ++j) {
+      auto out_var = local_scopes_[j]->Var("out");
+      auto out_tensor = out_var->Get<f::LoDTensor>();
+      PADDLE_ENFORCE_EQ(out_tensor.lod(), lod, "lod is not equal.");
+
+      f::Tensor result_tensor;
+      f::TensorCopy(out_tensor, cpu_place, *(ctxs_[j]), &result_tensor);
+      float* ct = result_tensor.mutable_data<float>(cpu_place);
+
+      for (int64_t i = 0; i < f::product(kDims); ++i) {
+        ASSERT_NEAR(ct[i], send_vector[i], 1e-5);
+      }
+    }
+  }
+
+  void TestBroadcastSelectedRows(size_t input_scope_idx) {
+    auto in_var = local_scopes_[input_scope_idx]->Var("input");
+    auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
+    auto value = in_selected_rows->mutable_value();
+    value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
+    int height = static_cast<int>(kDims[0]) * 2;
+    std::vector<int64_t> rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1,
+                              2, 4, 6, 3, 1, 1, 1,  1, 3, 7};
+    in_selected_rows->set_height(height);
+    in_selected_rows->set_rows(rows);
+
+    std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
+    for (size_t k = 0; k < send_vector.size(); ++k) {
+      send_vector[k] = k;
+    }
+    paddle::framework::TensorFromVector<float>(
+        send_vector, *(ctxs_[input_scope_idx]), value);
+
+    op_handle_->Run(false);
+
+    WaitAll();
+
+    p::CPUPlace cpu_place;
+    for (size_t j = 0; j < gpu_list_.size(); ++j) {
+      auto out_var = local_scopes_[j]->Var("out");
+      auto& out_select_rows = out_var->Get<f::SelectedRows>();
+      auto rt = out_select_rows.value();
+
+      PADDLE_ENFORCE_EQ(out_select_rows.height(), height,
+                        "height is not equal.");
+      for (size_t k = 0; k < out_select_rows.rows().size(); ++k) {
+        PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k]);
+      }
+
+      f::Tensor result_tensor;
+      f::TensorCopy(rt, cpu_place, *(ctxs_[j]), &result_tensor);
+      float* ct = result_tensor.data<float>();
+
+      for (int64_t i = 0; i < f::product(kDims); ++i) {
+        ASSERT_NEAR(ct[i], send_vector[i], 1e-5);
+      }
+    }
+  }
+};
+
+TEST(BroadcastTester, TestCPUBroadcastTestLodTensor) {
+  TestBroadcastOpHandle test_op;
+  size_t input_scope_idx = 0;
+  test_op.InitCtxOnGpu(false);
+  test_op.InitBroadcastOp(input_scope_idx);
+  test_op.TestBroadcastLodTensor(input_scope_idx);
+}
+
+TEST(BroadcastTester, TestCPUBroadcastTestSelectedRows) {
+  TestBroadcastOpHandle test_op;
+  size_t input_scope_idx = 0;
+  test_op.InitCtxOnGpu(false);
+  test_op.InitBroadcastOp(input_scope_idx);
+  test_op.TestBroadcastSelectedRows(input_scope_idx);
+}
+
+#ifdef PADDLE_WITH_CUDA
+TEST(BroadcastTester, TestGPUBroadcastTestLodTensor) {
+  TestBroadcastOpHandle test_op;
+  size_t input_scope_idx = 0;
+  test_op.InitCtxOnGpu(true);
+  test_op.InitBroadcastOp(input_scope_idx);
+  test_op.TestBroadcastLodTensor(input_scope_idx);
+}
+
+TEST(BroadcastTester, TestGPUBroadcastTestSelectedRows) {
+  TestBroadcastOpHandle test_op;
+  size_t input_scope_idx = 0;
+  test_op.InitCtxOnGpu(true);
+  test_op.InitBroadcastOp(input_scope_idx);
+  test_op.TestBroadcastSelectedRows(input_scope_idx);
+}
+#endif
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc
index e3f8bbb72f2a1b75b6041d41496cef0efc81874f..ff6d91c1dafb0ab4cabb1646cc333e19a89eb812 100644
--- a/paddle/fluid/framework/details/computation_op_handle.cc
+++ b/paddle/fluid/framework/details/computation_op_handle.cc
@@ -35,7 +35,9 @@ void ComputationOpHandle::RunImpl() {
     }
   }
 
-  op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get<Scope *>(), place_);
+  this->RunAndRecordEvent([this] {
+    op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get<Scope *>(), place_);
+  });
 }
 
 std::string ComputationOpHandle::Name() const { return op_->Type(); }
diff --git a/paddle/fluid/framework/details/gather_op_handle.cc b/paddle/fluid/framework/details/gather_op_handle.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8dd85be567d33991ac003707fec939a61a2d0962
--- /dev/null
+++ b/paddle/fluid/framework/details/gather_op_handle.cc
@@ -0,0 +1,126 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/gather_op_handle.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+GatherOpHandle::GatherOpHandle(const std::vector<Scope *> &local_scopes,
+                               const std::vector<platform::Place> &places)
+    : local_scopes_(local_scopes), places_(places) {}
+
+void GatherOpHandle::RunImpl() {
+  // the input may have dummy var.
+  std::vector<VarHandle *> in_var_handles;
+  for (auto *in : inputs_) {
+    auto *in_handle = dynamic_cast<VarHandle *>(in);
+    if (in_handle) {
+      in_var_handles.push_back(in_handle);
+    }
+  }
+  PADDLE_ENFORCE_EQ(
+      in_var_handles.size(), places_.size(),
+      "The number of output should equal to the number of places.");
+
+  // the output may have dummy var.
+  std::vector<VarHandle *> out_var_handles;
+  for (auto *out : outputs_) {
+    auto *out_handle = dynamic_cast<VarHandle *>(out);
+    if (out_handle) {
+      out_var_handles.push_back(out_handle);
+    }
+  }
+  PADDLE_ENFORCE_EQ(out_var_handles.size(), 1,
+                    "The number of output should be one.");
+
+  auto in_0_handle = static_cast<VarHandle *>(in_var_handles[0]);
+  auto pre_in_var =
+      local_scopes_[in_0_handle->scope_idx_]->FindVar(in_0_handle->name_);
+  auto pre_place = in_0_handle->place_;
+
+  PADDLE_ENFORCE(pre_in_var->IsType<framework::SelectedRows>(),
+                 "Currently, gather_op only can gather SelectedRows.");
+
+  PADDLE_ENFORCE_EQ(out_var_handles[0]->place_.which(), pre_place.which(),
+                    "The place of input and output should be the same.");
+
+  // Wait input done, this Wait is asynchronous operation
+  for (auto *in : in_var_handles) {
+    if (in->generated_op_) {
+      in->generated_op_->Wait(dev_ctxes_[in->place_]);
+    }
+  }
+
+  std::vector<int64_t> out_rows;
+  std::vector<Tensor> in_tensors;
+  std::vector<platform::Place> in_places;
+
+  auto &pre_in = pre_in_var->Get<framework::SelectedRows>();
+  // gather the inputs
+  for (auto *in : in_var_handles) {
+    auto in_handle = static_cast<VarHandle *>(in);
+    auto in_p = in_handle->place_;
+    in_places.push_back(in_p);
+    PADDLE_ENFORCE_EQ(in_p.which(), pre_place.which(),
+                      "Places must be all on CPU or all on CUDA.");
+    auto in_var =
+        local_scopes_.at(in_handle->scope_idx_)->FindVar(in_handle->name_);
+    auto &in_sr = in_var->Get<framework::SelectedRows>();
+
+    PADDLE_ENFORCE_EQ(in_sr.value().type(), pre_in.value().type(),
+                      "The type of input is not consistent.");
+    PADDLE_ENFORCE_EQ(pre_in.height(), in_sr.height(),
+                      "The height of inputs is not consistent.");
+    PADDLE_ENFORCE_EQ(pre_in.GetCompleteDims(), in_sr.GetCompleteDims(), ,
+                      "The dims of inputs is not consistent.");
+
+    auto in_sr_rows = in_sr.rows();
+    out_rows.insert(out_rows.end(), in_sr_rows.begin(), in_sr_rows.end());
+
+    in_tensors.emplace_back(in_sr.value());
+  }
+
+  // write the output
+  auto &out_place = out_var_handles[0]->place_;
+  auto out_scope_idx = out_var_handles[0]->scope_idx_;
+  auto out_var =
+      local_scopes_[out_scope_idx]->FindVar(out_var_handles[0]->name_);
+
+  auto out = out_var->GetMutable<framework::SelectedRows>();
+  out->set_height(pre_in.height());
+  out->set_rows(out_rows);
+  size_t rows = out_rows.size();
+  DDim out_dim = pre_in.GetCompleteDims();
+  out_dim[0] = static_cast<int64_t>(rows);
+  out->mutable_value()->Resize(out_dim);
+  out->mutable_value()->mutable_data(out_place, pre_in.value().type());
+  Tensor *out_tensor = out->mutable_value();
+
+  // copy
+  int s = 0, e = 0;
+  for (size_t j = 0; j < in_tensors.size(); ++j) {
+    e += in_tensors[j].dims()[0];
+    auto sub_out = out_tensor->Slice(s, e);
+    paddle::framework::TensorCopy(in_tensors[j], out_place,
+                                  *(dev_ctxes_[in_places[j]]), &sub_out);
+    s = e;
+  }
+}
+
+std::string GatherOpHandle::Name() const { return "gather"; }
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/gather_op_handle.h b/paddle/fluid/framework/details/gather_op_handle.h
new file mode 100644
index 0000000000000000000000000000000000000000..6c0231f642c05e6b558b7e2518a15e08c816fe4b
--- /dev/null
+++ b/paddle/fluid/framework/details/gather_op_handle.h
@@ -0,0 +1,48 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+struct GatherOpHandle : public OpHandleBase {
+  const std::vector<Scope *> &local_scopes_;
+  const std::vector<platform::Place> &places_;
+
+  GatherOpHandle(const std::vector<Scope *> &local_scopes,
+                 const std::vector<platform::Place> &places);
+
+  std::string Name() const override;
+
+  bool IsMultiDeviceTransfer() override { return false; };
+
+ protected:
+  void RunImpl() override;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/gather_op_handle_test.cc b/paddle/fluid/framework/details/gather_op_handle_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..10839f239d59e97946575297a6d125968a1458f4
--- /dev/null
+++ b/paddle/fluid/framework/details/gather_op_handle_test.cc
@@ -0,0 +1,192 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/gather_op_handle.h"
+#include "gtest/gtest.h"
+
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+namespace f = paddle::framework;
+namespace p = paddle::platform;
+
+// test data amount
+const f::DDim kDims = {20, 20};
+
+struct TestGatherOpHandle {
+  std::vector<std::unique_ptr<p::DeviceContext>> ctxs_;
+  std::vector<Scope*> local_scopes_;
+  Scope g_scope_;
+  std::unique_ptr<OpHandleBase> op_handle_;
+  std::vector<std::unique_ptr<VarHandleBase>> vars_;
+  std::vector<p::Place> gpu_list_;
+
+  void WaitAll() {
+    for (size_t j = 0; j < ctxs_.size(); ++j) {
+      ctxs_[j]->Wait();
+    }
+  }
+
+  void InitCtxOnGpu(bool use_gpu) {
+    if (use_gpu) {
+#ifdef PADDLE_WITH_CUDA
+      int count = p::GetCUDADeviceCount();
+      if (count <= 1) {
+        LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
+                        "device count is "
+                     << count;
+        exit(0);
+      }
+      for (int i = 0; i < count; ++i) {
+        auto p = p::CUDAPlace(i);
+        gpu_list_.push_back(p);
+        ctxs_.emplace_back(new p::CUDADeviceContext(p));
+      }
+#else
+      PADDLE_THROW("CUDA is not support.");
+#endif
+    } else {
+      int count = 8;
+      for (int i = 0; i < count; ++i) {
+        auto p = p::CPUPlace();
+        gpu_list_.push_back(p);
+        ctxs_.emplace_back(new p::CPUDeviceContext(p));
+      }
+    }
+  }
+
+  void InitGatherOp(size_t input_scope_idx) {
+    for (size_t j = 0; j < gpu_list_.size(); ++j) {
+      local_scopes_.push_back(&(g_scope_.NewScope()));
+      local_scopes_[j]->Var("out");
+    }
+    local_scopes_[input_scope_idx]->Var("input");
+
+    op_handle_.reset(new GatherOpHandle(local_scopes_, gpu_list_));
+    // add input
+    for (size_t j = 0; j < gpu_list_.size(); ++j) {
+      op_handle_->dev_ctxes_[gpu_list_[j]] = ctxs_[j].get();
+      vars_.emplace_back(new VarHandle());
+      VarHandle* in_var_handle = static_cast<VarHandle*>(vars_.back().get());
+      in_var_handle->place_ = gpu_list_[j];
+      in_var_handle->name_ = "input";
+      in_var_handle->version_ = 1;
+      in_var_handle->scope_idx_ = j;
+      in_var_handle->generated_op_ = nullptr;
+      op_handle_->AddInput(in_var_handle);
+    }
+
+    // add dummy var
+    vars_.emplace_back(new DummyVarHandle());
+    DummyVarHandle* in_dummy_var_handle =
+        static_cast<DummyVarHandle*>(vars_.back().get());
+    in_dummy_var_handle->generated_op_ = nullptr;
+    op_handle_->AddInput(in_dummy_var_handle);
+
+    // add output
+    vars_.emplace_back(new VarHandle());
+    VarHandle* out_var_handle = static_cast<VarHandle*>(vars_.back().get());
+    out_var_handle->place_ = gpu_list_[input_scope_idx];
+    out_var_handle->name_ = "out";
+    out_var_handle->version_ = 2;
+    out_var_handle->scope_idx_ = input_scope_idx;
+    op_handle_->AddOutput(out_var_handle);
+
+    // add dummy var
+    vars_.emplace_back(new DummyVarHandle());
+    DummyVarHandle* dummy_var_handle =
+        static_cast<DummyVarHandle*>(vars_.back().get());
+    op_handle_->AddOutput(dummy_var_handle);
+  }
+
+  void TestGatherSelectedRows(size_t output_scope_idx) {
+    int height = kDims[0] * 2;
+    std::vector<int64_t> rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1,
+                              2, 4, 6, 3, 1, 1, 1,  1, 3, 7};
+    std::vector<float> send_vector(f::product(kDims));
+    for (size_t k = 0; k < send_vector.size(); ++k) {
+      send_vector[k] = k;
+    }
+
+    for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size();
+         ++input_scope_idx) {
+      auto in_var = local_scopes_[input_scope_idx]->Var("input");
+      auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
+      auto value = in_selected_rows->mutable_value();
+      value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
+
+      in_selected_rows->set_height(height);
+      in_selected_rows->set_rows(rows);
+
+      paddle::framework::TensorFromVector<float>(
+          send_vector, *(ctxs_[input_scope_idx]), value);
+      value->Resize(kDims);
+    }
+
+    auto out_var = local_scopes_[output_scope_idx]->Var("out");
+    auto out_selected_rows = out_var->GetMutable<f::SelectedRows>();
+
+    auto in_var = local_scopes_[output_scope_idx]->Var("input");
+    auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
+
+    out_selected_rows->mutable_value()->ShareDataWith(
+        in_selected_rows->value());
+
+    op_handle_->Run(false);
+
+    WaitAll();
+
+    p::CPUPlace cpu_place;
+
+    auto& out_select_rows = out_var->Get<f::SelectedRows>();
+    auto rt = out_select_rows.value();
+
+    PADDLE_ENFORCE_EQ(out_select_rows.height(), height, "height is not equal.");
+    for (size_t k = 0; k < out_select_rows.rows().size(); ++k) {
+      PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k % rows.size()]);
+    }
+
+    f::Tensor result_tensor;
+    f::TensorCopy(rt, cpu_place, *(ctxs_[output_scope_idx]), &result_tensor);
+    float* ct = result_tensor.data<float>();
+
+    for (int64_t j = 0; j < f::product(kDims); ++j) {
+      ASSERT_NEAR(ct[j], send_vector[j % send_vector.size()], 1e-5);
+    }
+  }
+};
+
+TEST(GatherTester, TestCPUGatherTestSelectedRows) {
+  TestGatherOpHandle test_op;
+  size_t input_scope_idx = 0;
+  test_op.InitCtxOnGpu(false);
+  test_op.InitGatherOp(input_scope_idx);
+  test_op.TestGatherSelectedRows(input_scope_idx);
+}
+
+#ifdef PADDLE_WITH_CUDA
+
+TEST(GatherTester, TestGPUGatherTestSelectedRows) {
+  TestGatherOpHandle test_op;
+  size_t input_scope_idx = 0;
+  test_op.InitCtxOnGpu(false);
+  test_op.InitGatherOp(input_scope_idx);
+  test_op.TestGatherSelectedRows(input_scope_idx);
+}
+#endif
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
index e0dd9e6068174a4b0348d503f4082bee6ff68dac..5a95cbc53625888bac539f91af391ff0babec17b 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -55,21 +55,21 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
   }
 }
 
-void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result, OpDesc *op,
+void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result,
+                                                const OpDesc &op,
                                                 const platform::Place &p,
                                                 const size_t &i) const {
   auto *op_handle = result->ops_.back().get();
-  op_handle->dev_ctxes_[p] = const_cast<platform::DeviceContext *>(
-      platform::DeviceContextPool::Instance().Get(p));
+  op_handle->dev_ctxes_[p] = platform::DeviceContextPool::Instance().Get(p);
 
-  auto var_names = op->InputArgumentNames();
+  auto var_names = op.InputArgumentNames();
 
   for (auto &each_var_name : var_names) {
     VarHandle *var = CreateOrGetLatestVarHandle(result, each_var_name, p, i);
     op_handle->AddInput(var);
   }
 
-  var_names = op->OutputArgumentNames();
+  var_names = op.OutputArgumentNames();
 
   for (auto &each_var_name : var_names) {
     CreateOpOutput(result, op_handle, each_var_name, p, i);
@@ -107,7 +107,7 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
       result.ops_.emplace_back(new SendOpHandle(*op, s, p));
       // Create inputs for output on original place and no ssa output
       // is created for send op.
-      CreateOpHandleIOs(&result, op, p, 0);
+      CreateOpHandleIOs(&result, *op, p, 0);
       continue;
     }
 
@@ -117,7 +117,7 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
 
       result.ops_.emplace_back(new ComputationOpHandle(*op, s, p));
       auto *op_handle = result.ops_.back().get();
-      CreateOpHandleIOs(&result, op, p, i);
+      CreateOpHandleIOs(&result, *op, p, i);
 
       auto var_names = op->OutputArgumentNames();
 
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h
index de34caab1be85eecb741a5003f026eb982e178ea..f1518d75b421006db6311c3b0f602e47000ab381 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h
@@ -45,8 +45,8 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
   std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const override;
 
  private:
-  void CreateOpHandleIOs(SSAGraph *result, OpDesc *op, const platform::Place &p,
-                         const size_t &i) const;
+  void CreateOpHandleIOs(SSAGraph *result, const OpDesc &op,
+                         const platform::Place &p, const size_t &i) const;
 
  private:
   std::string loss_var_name_;
diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
index 55b5f113589e090386d287e228349f22fb94a7ab..1e48f75958a3ada4d1cd5c8d0f920da4fed2157e 100644
--- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc
@@ -14,6 +14,8 @@
 
 #include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
 
+#include <algorithm>
+
 namespace paddle {
 namespace framework {
 namespace details {
@@ -27,6 +29,32 @@ NCCLAllReduceOpHandle::NCCLAllReduceOpHandle(
   }
 }
 
+struct ReduceLoDTensor {
+  const std::vector<LoDTensor> &src_tensors_;
+  LoDTensor &dst_tensor_;
+
+  ReduceLoDTensor(const std::vector<LoDTensor> &src, LoDTensor *dst)
+      : src_tensors_(src), dst_tensor_(*dst) {}
+
+  template <typename T>
+  void operator()() const {
+    PADDLE_ENFORCE(!src_tensors_.empty());
+    auto &t0 = src_tensors_[0];
+    PADDLE_ENFORCE_NE(t0.numel(), 0);
+    dst_tensor_.Resize(t0.dims());
+    T *dst = dst_tensor_.mutable_data<T>(platform::CPUPlace());
+    std::copy(t0.data<T>(), t0.data<T>() + t0.numel(), dst);
+
+    for (size_t i = 1; i < src_tensors_.size(); ++i) {
+      auto &t = src_tensors_[i];
+      PADDLE_ENFORCE_EQ(t.dims(), t0.dims());
+      PADDLE_ENFORCE_EQ(t.type(), t0.type());
+      std::transform(t.data<T>(), t.data<T>() + t.numel(), dst, dst,
+                     [](T a, T b) -> T { return a + b; });
+    }
+  }
+};
+
 void NCCLAllReduceOpHandle::RunImpl() {
   if (inputs_.size() == 1) {
     return;  // No need to all reduce when GPU count = 1;
@@ -41,37 +69,66 @@ void NCCLAllReduceOpHandle::RunImpl() {
     int dtype = -1;
     size_t numel = 0;
 
-    std::vector<std::function<void()>> all_reduce_calls;
+    std::vector<LoDTensor> lod_tensors;
 
     for (size_t i = 0; i < local_scopes_.size(); ++i) {
-      auto &p = places_[i];
       auto *s = local_scopes_[i];
-      int dev_id = boost::get<platform::CUDAPlace>(p).device;
 
       auto &lod_tensor = s->FindVar(var_name)->Get<LoDTensor>();
-      void *buffer = const_cast<void *>(lod_tensor.data<void>());
+      lod_tensors.emplace_back(lod_tensor);
+    }
 
-      if (dtype == -1) {
-        dtype = platform::ToNCCLDataType(lod_tensor.type());
-      }
+    if (platform::is_gpu_place(lod_tensors[0].place())) {
+      std::vector<std::function<void()>> all_reduce_calls;
+      for (size_t i = 0; i < local_scopes_.size(); ++i) {
+        auto &p = places_[i];
+        auto &lod_tensor = lod_tensors[i];
+        void *buffer = const_cast<void *>(lod_tensor.data<void>());
 
-      if (numel == 0) {
-        numel = static_cast<size_t>(lod_tensor.numel());
-      }
+        if (dtype == -1) {
+          dtype = platform::ToNCCLDataType(lod_tensor.type());
+        }
 
-      auto &nccl_ctx = nccl_ctxs_.at(dev_id);
-      auto stream = nccl_ctx.stream();
-      auto comm = nccl_ctx.comm_;
-      all_reduce_calls.emplace_back([=] {
-        PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
-            buffer, buffer, numel, static_cast<ncclDataType_t>(dtype), ncclSum,
-            comm, stream));
+        if (numel == 0) {
+          numel = static_cast<size_t>(lod_tensor.numel());
+        }
+
+        int dev_id = boost::get<platform::CUDAPlace>(p).device;
+        auto &nccl_ctx = nccl_ctxs_.at(dev_id);
+        auto stream = nccl_ctx.stream();
+        auto comm = nccl_ctx.comm_;
+        all_reduce_calls.emplace_back([=] {
+          PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
+              buffer, buffer, numel, static_cast<ncclDataType_t>(dtype),
+              ncclSum, comm, stream));
+        });
+      }
+      this->RunAndRecordEvent([&] {
+        platform::NCCLGroupGuard guard;
+        for (auto &call : all_reduce_calls) {
+          call();
+        }
       });
-    }
+    } else {  // Special handle CPU only Operator's gradient. Like CRF
+      auto &trg =
+          *this->local_scopes_[0]->Var()->GetMutable<framework::LoDTensor>();
+
+      // Reduce All Tensor to trg in CPU
+      ReduceLoDTensor func(lod_tensors, &trg);
+      VisitDataType(ToDataType(lod_tensors[0].type()), func);
 
-    platform::NCCLGroupGuard guard;
-    for (auto &call : all_reduce_calls) {
-      call();
+      for (size_t i = 0; i < local_scopes_.size(); ++i) {
+        auto &scope = local_scopes_[i];
+        auto &p = places_[i];
+        auto *var = scope->FindVar(var_name);
+        auto *dev_ctx = dev_ctxes_[p];
+
+        RunAndRecordEvent(p, [&trg, var, dev_ctx, p] {
+          auto &tensor_gpu = *var->GetMutable<framework::LoDTensor>();
+          auto &tensor_cpu = trg;
+          TensorCopy(tensor_cpu, p, *dev_ctx, &tensor_gpu);
+        });
+      }
     }
   }
 }
diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc
index e4194a7442f677ec8970dbc387bb01ebbbf579f1..534d77860f87be08c8834efd373d90eb199ed6a2 100644
--- a/paddle/fluid/framework/details/op_handle_base.cc
+++ b/paddle/fluid/framework/details/op_handle_base.cc
@@ -54,17 +54,6 @@ void OpHandleBase::Run(bool use_event) {
 #endif
 
   RunImpl();
-
-#ifdef PADDLE_WITH_CUDA
-  if (use_event) {
-    for (auto &p : dev_ctxes_) {
-      int dev_id = boost::get<platform::CUDAPlace>(p.first).device;
-      auto stream =
-          static_cast<platform::CUDADeviceContext *>(p.second)->stream();
-      PADDLE_ENFORCE(cudaEventRecord(events_.at(dev_id), stream));
-    }
-  }
-#endif
 }
 
 void OpHandleBase::Wait(platform::DeviceContext *waited_dev) {
@@ -97,6 +86,43 @@ void OpHandleBase::AddOutput(VarHandleBase *out) {
   out->generated_op_ = this;
 }
 
+void OpHandleBase::RunAndRecordEvent(const std::function<void()> &callback) {
+#ifdef PADDLE_WITH_CUDA
+  if (!events_.empty()) {  // Use event
+    std::function<void()> method = callback;
+
+    for (auto &p : dev_ctxes_) {
+      method = [method, p, this]() {
+        static_cast<platform::CUDADeviceContext *>(p.second)->RecordEvent(
+            events_.at(boost::get<platform::CUDAPlace>(p.first).device),
+            method);
+      };
+    }
+    method();
+  } else {
+#endif
+    callback();
+#ifdef PADDLE_WITH_CUDA
+  }
+#endif
+}
+
+void OpHandleBase::RunAndRecordEvent(platform::Place p,
+                                     const std::function<void()> &callback) {
+#ifdef PADDLE_WITH_CUDA
+  if (platform::is_cpu_place(p) || events_.empty()) {
+    callback();
+  } else {
+    auto *ctx = dev_ctxes_.at(p);
+    auto *cuda_ctx = static_cast<platform::CUDADeviceContext *>(ctx);
+    cuda_ctx->RecordEvent(events_.at(boost::get<platform::CUDAPlace>(p).device),
+                          callback);
+  }
+#else
+  callback();
+#endif
+}
+
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h
index fbdb54ba8d940c8dedd44a42a85825af5d2ec664..a9a6c8d39cf8741f7d9c91579a650ad742cec381 100644
--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -62,6 +62,11 @@ class OpHandleBase {
   virtual bool IsMultiDeviceTransfer() { return false; }
 
  protected:
+  void RunAndRecordEvent(const std::function<void()> &callback);
+
+  void RunAndRecordEvent(platform::Place p,
+                         const std::function<void()> &callback);
+
   virtual void RunImpl() = 0;
 };
 
diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
index 0a6f6129b812ca84db7573957b1ee0a32c1ef5c4..7fb9f99a8a1bc044e2f25f373265a5ec9f7d76d5 100644
--- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
+++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
@@ -14,6 +14,8 @@
 
 #include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
 
+#include <string>
+
 namespace paddle {
 namespace framework {
 namespace details {
@@ -37,11 +39,13 @@ void ScaleLossGradOpHandle::RunImpl() {
     *tmp = coeff_;
   } else {
 #ifdef PADDLE_WITH_CUDA
-    auto stream =
-        static_cast<platform::CUDADeviceContext *>(this->dev_ctxes_[place_])
-            ->stream();
-    memory::Copy(boost::get<platform::CUDAPlace>(place_), tmp,
-                 platform::CPUPlace(), &coeff_, sizeof(float), stream);
+    this->RunAndRecordEvent([&] {
+      auto stream =
+          static_cast<platform::CUDADeviceContext *>(this->dev_ctxes_[place_])
+              ->stream();
+      memory::Copy(boost::get<platform::CUDAPlace>(place_), tmp,
+                   platform::CPUPlace(), &coeff_, sizeof(float), stream);
+    });
 #endif
   }
 }
diff --git a/paddle/fluid/framework/details/send_op_handle.cc b/paddle/fluid/framework/details/send_op_handle.cc
index d181607e86372f4872c38bc35db786ac142ccc65..549b9d9abbe5bfd17df3509e0442bfa19b7ecd61 100644
--- a/paddle/fluid/framework/details/send_op_handle.cc
+++ b/paddle/fluid/framework/details/send_op_handle.cc
@@ -34,7 +34,7 @@ void SendOpHandle::RunImpl() {
     }
     in->generated_op_->Wait(dev_ctxes_[p]);
   }
-  op_->Run(*local_scope_, place_);
+  this->RunAndRecordEvent([&] { op_->Run(*local_scope_, place_); });
 }
 
 std::string SendOpHandle::Name() const { return "send"; }
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index 1ce69ab02b09fe7ec17f479bcef97c931e853dc4..3d2bd633afff1d453d00faeca3b3dcf77f8dd5d7 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -33,13 +33,6 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
       running_ops_(0),
       allow_op_delay_(allow_op_delay) {}
 
-void ThreadedSSAGraphExecutor::RunDelayedOps(
-    const std::unordered_set<OpHandleBase *> &delayed_ops) {
-  for (auto op : delayed_ops) {
-    op->Run(use_event_);
-  }
-}
-
 FeedFetchList ThreadedSSAGraphExecutor::Run(
     const std::vector<std::string> &fetch_tensors) {
   std::unordered_map<OpHandleBase *, size_t> pending_ops;
@@ -51,8 +44,6 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
   // together since we currently cannot overlap computation and memcpy streams.
   // Should revisit it if overlapping is available.
   std::unordered_set<OpHandleBase *> delayed_ops;
-  std::unordered_set<OpHandleBase *> blocked_by_delayed_ops;
-  std::unordered_set<VarHandleBase *> delayed_vars;
 
   auto InsertPendingVar = [&pending_vars, &ready_vars](VarHandleBase &var) {
     pending_vars.insert(&var);
@@ -122,24 +113,26 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
     InsertPendingOp(*op);
   }
 
-  auto run_all_ready_ops = [&] {
-    for (auto *op : ready_ops) {
-      if (op->IsMultiDeviceTransfer() && allow_op_delay_) {
-        delayed_ops.insert(op);
-        delayed_vars.insert(op->outputs_.begin(), op->outputs_.end());
-        ready_vars.Extend(op->outputs_);
-        continue;
-      }
+  auto run_all_ops = [&](std::unordered_set<OpHandleBase *> &set) {
+    for (auto *op : set) {
       running_ops_++;
       RunOp(&ready_vars, op);
     }
-    ready_ops.clear();
+    set.clear();
   };
 
   // Step 3. Execution
-  while (!pending_vars.empty() || !ready_ops.empty() || !delayed_ops.empty()) {
+  while (!pending_vars.empty()) {
     // 1. Run All Ready ops
-    run_all_ready_ops();
+    // Keep loop until all vars are ready.
+    //
+    // NOTE: DelayedOps have a lower priority. It will be scheduled after all
+    // ready_ops have been performed.
+    if (ready_ops.empty() && allow_op_delay_) {
+      run_all_ops(delayed_ops);
+    } else {
+      run_all_ops(ready_ops);
+    }
 
     // 2. Find ready variable
     bool timeout;
@@ -160,29 +153,16 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
         auto &deps = pending_ops[op];
         --deps;
         if (deps == 0) {
-          if (delayed_vars.find(ready_var) != delayed_vars.end()) {
-            blocked_by_delayed_ops.insert(op);
+          if (op->IsMultiDeviceTransfer() && allow_op_delay_) {
+            delayed_ops.insert(op);
           } else {
             ready_ops.insert(op);
           }
         }
       }
     }
-    // When there are no other ops to schedule, schedule buffered delayed
-    // ops and unblock other ops.
-    if (ready_ops.empty() && !delayed_ops.empty() && running_ops_ == 0) {
-      RunDelayedOps(delayed_ops);
-      delayed_ops.clear();
-      for (auto *op : blocked_by_delayed_ops) {
-        ready_ops.insert(op);
-      }
-      blocked_by_delayed_ops.clear();
-    }
-    // Keep loop until all vars are ready.
   }
   PADDLE_ENFORCE(ready_ops.empty());
-  PADDLE_ENFORCE(delayed_ops.empty());
-  PADDLE_ENFORCE(blocked_by_delayed_ops.empty());
 
   // Wait FetchOps.
   if (!fetch_ops.empty()) {
@@ -196,10 +176,12 @@ void ThreadedSSAGraphExecutor::RunOp(
     BlockingQueue<VarHandleBase *> *ready_var_q, details::OpHandleBase *op) {
   auto op_run = [ready_var_q, op, this] {
     try {
-      VLOG(10) << op->Name() << " : " << op->DebugString();
+      VLOG(10) << op << " " << op->Name() << " : " << op->DebugString();
       op->Run(use_event_);
+      VLOG(10) << op << " " << op->Name() << " Done ";
       running_ops_--;
       ready_var_q->Extend(op->outputs_);
+      VLOG(10) << op << " " << op->Name() << "Signal posted";
     } catch (platform::EnforceNotMet ex) {
       exception_.reset(new platform::EnforceNotMet(ex));
     } catch (...) {
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
index bb5e837b135c35b5aea403496b45aab1ccc288ff..d70bbd4ef0eb02d1b473bf88e526996819aec5f9 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -88,8 +88,6 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
   void RunOp(BlockingQueue<VarHandleBase *> *ready_var_q,
              details::OpHandleBase *op);
 
-  void RunDelayedOps(const std::unordered_set<OpHandleBase *> &delayed_ops);
-
  private:
   std::unique_ptr<::ThreadPool> pool_;
   std::vector<Scope *> local_scopes_;
diff --git a/paddle/fluid/framework/details/var_handle.h b/paddle/fluid/framework/details/var_handle.h
index 569dda17c6e91d5658c4f8b9ba0b8c8fbd966832..871e41343f53b801a22d3a450f0906f37fb372d1 100644
--- a/paddle/fluid/framework/details/var_handle.h
+++ b/paddle/fluid/framework/details/var_handle.h
@@ -50,6 +50,7 @@ struct VarHandle : public VarHandleBase {
   // version field currently is not used, however, just store the version to
   // debug easily.
   size_t version_;
+  size_t scope_idx_;
   std::string name_;
   platform::Place place_;
 };
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index c2ca1bbc78f3ebc6066df6b666720af0d1fbbf59..513e720fd099bcd898a6c73afd1a3a16f6f53aab 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -83,8 +83,8 @@ static void CheckTensorNANOrInf(const std::string& name,
   if (tensor.memory_size() == 0) {
     return;
   }
-  if (tensor.type().hash_code() != typeid(float).hash_code() &&
-      tensor.type().hash_code() != typeid(double).hash_code()) {
+  if (tensor.type().hash_code() != typeid(float).hash_code() &&   // NOLINT
+      tensor.type().hash_code() != typeid(double).hash_code()) {  // NOLINT
     return;
   }
   PADDLE_ENFORCE(!framework::TensorContainsInf(tensor),
@@ -145,12 +145,13 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
 // Return true if the block has feed operators and holder of matching info.
 static bool has_feed_operators(
     const BlockDesc& block,
-    std::map<std::string, const LoDTensor*>& feed_targets,
+    const std::map<std::string, const LoDTensor*>& feed_targets,
     const std::string& feed_holder_name) {
   size_t feed_count = 0;
   for (auto* op : block.AllOps()) {
     if (op->Type() == kFeedOpType) {
       feed_count++;
+      // The input variable's name of feed_op should be feed_holder_name.
       PADDLE_ENFORCE_EQ(op->Input("X")[0], feed_holder_name,
                         "Input to feed op should be '%s'", feed_holder_name);
       std::string feed_target_name = op->Output("Out")[0];
@@ -166,13 +167,15 @@ static bool has_feed_operators(
         feed_count, feed_targets.size(),
         "The number of feed operators should match 'feed_targets'");
 
-    // When feed operator are present, so should be feed_holder
-    auto var = block.FindVar(feed_holder_name);
-    PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable",
-                            feed_holder_name);
-    PADDLE_ENFORCE_EQ(var->GetType(), proto::VarType::FEED_MINIBATCH,
-                      "'%s' variable should be 'FEED_MINIBATCH' type",
-                      feed_holder_name);
+    if (!feed_holder_name.empty()) {
+      // When feed operator are present, so should be feed_holder.
+      auto var = block.FindVar(feed_holder_name);
+      PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable",
+                              feed_holder_name);
+      PADDLE_ENFORCE_EQ(var->GetType(), proto::VarType::FEED_MINIBATCH,
+                        "'%s' variable should be 'FEED_MINIBATCH' type",
+                        feed_holder_name);
+    }
   }
 
   return feed_count > 0;
@@ -185,12 +188,14 @@ static bool has_feed_operators(
 // and fetch_holder_name. Raise exception when any mismatch is found.
 // Return true if the block has fetch operators and holder of matching info.
 static bool has_fetch_operators(
-    const BlockDesc& block, std::map<std::string, LoDTensor*>& fetch_targets,
+    const BlockDesc& block,
+    const std::map<std::string, LoDTensor*>& fetch_targets,
     const std::string& fetch_holder_name) {
   size_t fetch_count = 0;
   for (auto* op : block.AllOps()) {
     if (op->Type() == kFetchOpType) {
       fetch_count++;
+      // The output variable's name of fetch_op should be fetch_holder_name.
       PADDLE_ENFORCE_EQ(op->Output("Out")[0], fetch_holder_name,
                         "Output of fetch op should be '%s'", fetch_holder_name);
       std::string fetch_target_name = op->Input("X")[0];
@@ -206,13 +211,15 @@ static bool has_fetch_operators(
         fetch_count, fetch_targets.size(),
         "The number of fetch operators should match 'fetch_targets'");
 
-    // When fetch operator are present, so should be fetch_holder
-    auto var = block.FindVar(fetch_holder_name);
-    PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable",
-                            fetch_holder_name);
-    PADDLE_ENFORCE_EQ(var->GetType(), proto::VarType::FETCH_LIST,
-                      "'%s' variable should be 'FETCH_LIST' type",
-                      fetch_holder_name);
+    if (!fetch_holder_name.empty()) {
+      // When fetch operator are present, so should be fetch_holder.
+      auto var = block.FindVar(fetch_holder_name);
+      PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable",
+                              fetch_holder_name);
+      PADDLE_ENFORCE_EQ(var->GetType(), proto::VarType::FETCH_LIST,
+                        "'%s' variable should be 'FETCH_LIST' type",
+                        fetch_holder_name);
+    }
   }
 
   return fetch_count > 0;
@@ -259,16 +266,6 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
     }
   }
 
-  // map the data of feed_targets to feed_holder
-  for (auto* op : global_block->AllOps()) {
-    if (op->Type() == kFeedOpType) {
-      std::string feed_target_name = op->Output("Out")[0];
-      int idx = boost::get<int>(op->GetAttr("col"));
-      SetFeedVariable(scope, *feed_targets[feed_target_name], feed_holder_name,
-                      idx);
-    }
-  }
-
   if (!has_fetch_ops) {
     // create fetch_holder variable
     auto* fetch_holder = global_block->Var(fetch_holder_name);
@@ -292,17 +289,9 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
     }
   }
 
-  Run(*copy_program, scope, 0, create_vars, create_vars);
-
-  // obtain the data of fetch_targets from fetch_holder
-  for (auto* op : global_block->AllOps()) {
-    if (op->Type() == kFetchOpType) {
-      std::string fetch_target_name = op->Input("X")[0];
-      int idx = boost::get<int>(op->GetAttr("col"));
-      *fetch_targets[fetch_target_name] =
-          GetFetchVariable(*scope, fetch_holder_name, idx);
-    }
-  }
+  auto ctx = Prepare(*copy_program, 0);
+  RunPreparedContext(ctx.get(), scope, feed_targets, fetch_targets, create_vars,
+                     feed_holder_name, fetch_holder_name);
 }
 
 std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
@@ -370,5 +359,42 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
   }
 }
 
+void Executor::RunPreparedContext(
+    ExecutorPrepareContext* ctx, Scope* scope,
+    std::map<std::string, const LoDTensor*>& feed_targets,
+    std::map<std::string, LoDTensor*>& fetch_targets, bool create_vars,
+    const std::string& feed_holder_name, const std::string& fetch_holder_name) {
+  auto& global_block = ctx->prog_.Block(ctx->block_id_);
+
+  PADDLE_ENFORCE(
+      has_feed_operators(global_block, feed_targets, feed_holder_name),
+      "Program in ExecutorPrepareContext should has feed_ops.");
+  PADDLE_ENFORCE(
+      has_fetch_operators(global_block, fetch_targets, fetch_holder_name),
+      "Program in the prepared context should has fetch_ops.");
+
+  // map the data of feed_targets to feed_holder
+  for (auto* op : global_block.AllOps()) {
+    if (op->Type() == kFeedOpType) {
+      std::string feed_target_name = op->Output("Out")[0];
+      int idx = boost::get<int>(op->GetAttr("col"));
+      SetFeedVariable(scope, *feed_targets[feed_target_name], feed_holder_name,
+                      idx);
+    }
+  }
+
+  RunPreparedContext(ctx, scope, create_vars, create_vars);
+
+  // obtain the data of fetch_targets from fetch_holder
+  for (auto* op : global_block.AllOps()) {
+    if (op->Type() == kFetchOpType) {
+      std::string fetch_target_name = op->Input("X")[0];
+      int idx = boost::get<int>(op->GetAttr("col"));
+      *fetch_targets[fetch_target_name] =
+          GetFetchVariable(*scope, fetch_holder_name, idx);
+    }
+  }
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h
index 75b29b2f4065ad75b62a134b890b8f9f6730fdc7..43defdacf2a1c2f59cf3af2461ae6cfc4c61f5be 100644
--- a/paddle/fluid/framework/executor.h
+++ b/paddle/fluid/framework/executor.h
@@ -14,6 +14,9 @@ limitations under the License. */
 
 #pragma once
 
+#include <map>
+#include <string>
+#include <vector>
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
@@ -70,6 +73,13 @@ class Executor {
                           bool create_local_scope = true,
                           bool create_vars = true);
 
+  void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
+                          std::map<std::string, const LoDTensor*>& feed_targets,
+                          std::map<std::string, LoDTensor*>& fetch_targets,
+                          bool create_vars = true,
+                          const std::string& feed_holder_name = "feed",
+                          const std::string& fetch_holder_name = "fetch");
+
  private:
   const platform::Place place_;
 };
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index c1486b527d2e06d2b3f7e0f89458bf9a22564586..0962f40c4a64f18f7105626c54a83f1c5b299c50 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -155,13 +155,9 @@ void ParallelExecutor::BCastParamsToGPUs(
 #endif
 }
 
-void ParallelExecutor::Run(
-    const std::vector<std::string> &fetch_tensors,
-    const std::string &fetched_var_name,
-    const std::unordered_map<std::string, LoDTensor> &feed_tensors) {
+void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
+                           const std::string &fetched_var_name) {
   platform::RecordBlock b(0);
-  SplitTensorToPlaces(feed_tensors);
-
   // Create local scopes.
   for (auto &scope : member_->local_scopes_) {
     Scope &local_scope = scope->NewScope();
@@ -195,14 +191,28 @@ void ParallelExecutor::Run(
     auto &local_scope =
         *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>();
     scope->DeleteScope(local_scope);
-    local_scope = nullptr;
   }
 }
 
-void ParallelExecutor::SplitTensorToPlaces(
-    const std::unordered_map<std::string, LoDTensor> &feed_tensors) {
-  for (auto it : feed_tensors) {
-    auto lod_tensors = it.second.SplitLoDTensor(member_->places_);
+void ParallelExecutor::FeedTensorsIntoLocalScopes(
+    const std::vector<std::unordered_map<std::string, LoDTensor>> &tensors) {
+  PADDLE_ENFORCE_EQ(member_->local_scopes_.size(), tensors.size());
+
+  for (size_t i = 0; i < tensors.size(); ++i) {
+    auto &map = tensors[i];
+    auto *scope = member_->local_scopes_[i];
+    for (auto &pair : map) {
+      auto *trg = scope->Var(pair.first)->GetMutable<LoDTensor>();
+      trg->ShareDataWith(pair.second);
+      trg->set_lod(pair.second.lod());
+    }
+  }
+}
+
+void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
+    const std::unordered_map<std::string, LoDTensor> &tensors) {
+  for (auto pair : tensors) {
+    auto lod_tensors = pair.second.SplitLoDTensor(member_->places_);
     PADDLE_ENFORCE_EQ(
         member_->places_.size(), lod_tensors.size(),
         "The number of samples of current batch is less than the count of "
@@ -211,7 +221,7 @@ void ParallelExecutor::SplitTensorToPlaces(
     for (size_t j = 0; j < member_->places_.size(); ++j) {
       // TODO(panxy0718): Do I need to delete this var?
       auto t =
-          member_->local_scopes_[j]->Var(it.first)->GetMutable<LoDTensor>();
+          member_->local_scopes_[j]->Var(pair.first)->GetMutable<LoDTensor>();
       t->ShareDataWith(lod_tensors[j]);
       t->set_lod(lod_tensors[j].lod());
     }
diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h
index b4f16dba858fb279ec23a8a04257dda6651148cc..303ac3bc55cfed57a03765b27d8aba581eabd1c8 100644
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -44,16 +44,22 @@ class ParallelExecutor {
 
   std::vector<Scope*>& GetLocalScopes();
 
+  /**
+   * Feed tensors to local scopes. The size of tensors should be equal to the
+   * size of local scopes.
+   */
+  void FeedTensorsIntoLocalScopes(
+      const std::vector<std::unordered_map<std::string, LoDTensor>>& tensors);
+
+  void FeedAndSplitTensorIntoLocalScopes(
+      const std::unordered_map<std::string, LoDTensor>& tensors);
+
   void Run(const std::vector<std::string>& fetch_tensors,
-           const std::string& fetched_var_name,
-           const std::unordered_map<std::string, LoDTensor>& feed_tensors);
+           const std::string& fetched_var_name);
 
   void BCastParamsToGPUs(const std::unordered_set<std::string>& vars) const;
 
  private:
-  void SplitTensorToPlaces(
-      const std::unordered_map<std::string, LoDTensor>& feed_tensors);
-
   ParallelExecutorPrivate* member_;
 };
 
diff --git a/paddle/fluid/framework/program_desc_test.cc b/paddle/fluid/framework/program_desc_test.cc
index 66618a291b59996836e822587af618927a4263c7..6c46e9aad5b7fbf67fdcc07a12e7932ac8b6412b 100644
--- a/paddle/fluid/framework/program_desc_test.cc
+++ b/paddle/fluid/framework/program_desc_test.cc
@@ -66,7 +66,7 @@ TEST(ProgramDesc, copy_ctor) {
 
   for (size_t i = 0; i < global_block->OpSize(); ++i) {
     auto op_origin = global_block->Op(i);
-    auto op_copy = global_block->Op(i);
+    auto op_copy = global_block_copy->Op(i);
 
     ASSERT_EQ(op_origin->Type(), op_copy->Type());
     ASSERT_EQ(op_origin->Inputs(), op_copy->Inputs());
@@ -131,7 +131,7 @@ TEST(ProgramDescBind, serialize_and_deserialize) {
 
   for (size_t i = 0; i < global_block->OpSize(); ++i) {
     auto op_origin = global_block->Op(i);
-    auto op_restored = global_block->Op(i);
+    auto op_restored = global_block_restored->Op(i);
 
     ASSERT_EQ(op_origin->Type(), op_restored->Type());
     ASSERT_EQ(op_origin->Inputs(), op_restored->Inputs());
diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index 1d864af011bced9df188147ec436b8de12947ba9..d1b01ae05b808b229309e9689165483a11530c84 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -11,8 +11,10 @@
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License. */
-
 #include "paddle/fluid/framework/tensor_util.h"
+#include <algorithm>
+#include <limits>
+#include <vector>
 
 namespace paddle {
 namespace framework {
@@ -65,8 +67,6 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
     auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
     auto ctx_place = ctx.GetPlace();
     PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
-    auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
-    PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
     memory::Copy(
         dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
         reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
diff --git a/paddle/fluid/framework/threadpool.cc b/paddle/fluid/framework/threadpool.cc
index 9854d618d2b29ed123833f55198179638c95d6db..f26f212d4d5793b88fd1e6d782cdf983bf341879 100644
--- a/paddle/fluid/framework/threadpool.cc
+++ b/paddle/fluid/framework/threadpool.cc
@@ -14,8 +14,12 @@
 
 #include "paddle/fluid/framework/threadpool.h"
 
+#include "gflags/gflags.h"
 #include "paddle/fluid/platform/enforce.h"
 
+DEFINE_int32(io_threadpool_size, 100,
+             "number of threads used for doing IO, default 100");
+
 namespace paddle {
 namespace framework {
 
@@ -91,5 +95,20 @@ void ThreadPool::TaskLoop() {
   }
 }
 
+std::unique_ptr<ThreadPool> ThreadPoolIO::io_threadpool_(nullptr);
+std::once_flag ThreadPoolIO::io_init_flag_;
+
+ThreadPool* ThreadPoolIO::GetInstanceIO() {
+  std::call_once(io_init_flag_, &ThreadPoolIO::InitIO);
+  return io_threadpool_.get();
+}
+
+void ThreadPoolIO::InitIO() {
+  if (io_threadpool_.get() == nullptr) {
+    // TODO(typhoonzero1986): make this configurable
+    io_threadpool_.reset(new ThreadPool(FLAGS_io_threadpool_size));
+  }
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/threadpool.h b/paddle/fluid/framework/threadpool.h
index f9dce7105e32ff0ba03d03f8faaac3a4ed1a3595..94111ee335b1a5df327b3e46d62069b4735c54f6 100644
--- a/paddle/fluid/framework/threadpool.h
+++ b/paddle/fluid/framework/threadpool.h
@@ -14,12 +14,12 @@ limitations under the License. */
 
 #pragma once
 
-#include <condition_variable>
+#include <condition_variable>  // NOLINT
 #include <functional>
-#include <future>
-#include <mutex>
+#include <future>  // NOLINT
+#include <mutex>   // NOLINT
 #include <queue>
-#include <thread>
+#include <thread>  // NOLINT
 #include <vector>
 #include "glog/logging.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -28,6 +28,22 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
+struct ExceptionHandler {
+  mutable std::future<std::unique_ptr<platform::EnforceNotMet>> future_;
+  explicit ExceptionHandler(
+      std::future<std::unique_ptr<platform::EnforceNotMet>>&& f)
+      : future_(std::move(f)) {}
+  void operator()() const {
+    auto ex = this->future_.get();
+    if (ex != nullptr) {
+      LOG(FATAL) << "The exception is thrown inside the thread pool. You "
+                    "should use RunAndGetException to handle the exception.\n"
+                    "The default exception handler is LOG(FATAL)."
+                 << ex->what();
+    }
+  }
+};
+
 // ThreadPool maintains a queue of tasks, and runs them using a fixed
 // number of threads.
 class ThreadPool {
@@ -87,22 +103,6 @@ class ThreadPool {
   void Wait();
 
  private:
-  struct ExceptionHandler {
-    mutable std::future<std::unique_ptr<platform::EnforceNotMet>> future_;
-    explicit ExceptionHandler(
-        std::future<std::unique_ptr<platform::EnforceNotMet>>&& f)
-        : future_(std::move(f)) {}
-    void operator()() const {
-      auto ex = this->future_.get();
-      if (ex != nullptr) {
-        LOG(FATAL) << "The exception is thrown inside the thread pool. You "
-                      "should use RunAndGetException to handle the exception.\n"
-                      "The default exception handler is LOG(FATAL)."
-                   << ex->what();
-      }
-    }
-  };
-
   DISABLE_COPY_AND_ASSIGN(ThreadPool);
 
   // If the task queue is empty and avaialbe is equal to the number of
@@ -135,6 +135,17 @@ class ThreadPool {
   std::condition_variable completed_;
 };
 
+class ThreadPoolIO : ThreadPool {
+ public:
+  static ThreadPool* GetInstanceIO();
+  static void InitIO();
+
+ private:
+  // NOTE: threadpool in base will be inhereted here.
+  static std::unique_ptr<ThreadPool> io_threadpool_;
+  static std::once_flag io_init_flag_;
+};
+
 // Run a function asynchronously.
 // NOTE: The function must return void. If the function need to return a value,
 // you can use lambda to capture a value pointer.
@@ -143,5 +154,10 @@ std::future<void> Async(Callback callback) {
   return ThreadPool::GetInstance()->Run(callback);
 }
 
+template <typename Callback>
+std::future<void> AsyncIO(Callback callback) {
+  return ThreadPoolIO::GetInstanceIO()->Run(callback);
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index e53bcf2384e54e21c7dd5638f3b7469a35b571bf..cc45bfe9b17d767be039cc0d8d83234b6994d6c1 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -21,4 +21,7 @@ endif()
 
 if(WITH_TESTING)
   add_subdirectory(tests/book)
+  if (TENSORRT_FOUND)
+    add_subdirectory(tensorrt)
+  endif()
 endif()
diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc
index a29d457b6fa9d0e8297252c8ff1117013d2055f8..3b58019db6e55fa8198d2f77731095c6cf356266 100644
--- a/paddle/fluid/inference/io.cc
+++ b/paddle/fluid/inference/io.cc
@@ -23,7 +23,7 @@ limitations under the License. */
 namespace paddle {
 namespace inference {
 
-// Temporarilly add this function for exposing framework::InitDevices() when
+// Temporarily add this function for exposing framework::InitDevices() when
 // linking the inference shared library.
 void Init(bool init_p2p) { framework::InitDevices(init_p2p); }
 
diff --git a/paddle/fluid/inference/tensorrt/CMakeLists.txt b/paddle/fluid/inference/tensorrt/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e39c0daac76e0993382868289f66351da3d16f8f
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt
@@ -0,0 +1 @@
+nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)
diff --git a/paddle/fluid/inference/tensorrt/test_tensorrt.cc b/paddle/fluid/inference/tensorrt/test_tensorrt.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a81a708e7a79225fd52c4b8e081afdcd8fe7e9ad
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/test_tensorrt.cc
@@ -0,0 +1,155 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+#include "NvInfer.h"
+#include "cuda.h"
+#include "cuda_runtime_api.h"
+#include "paddle/fluid/platform/dynload/tensorrt.h"
+
+namespace dy = paddle::platform::dynload;
+
+class Logger : public nvinfer1::ILogger {
+ public:
+  void log(nvinfer1::ILogger::Severity severity, const char* msg) override {
+    switch (severity) {
+      case Severity::kINFO:
+        LOG(INFO) << msg;
+        break;
+      case Severity::kWARNING:
+        LOG(WARNING) << msg;
+        break;
+      case Severity::kINTERNAL_ERROR:
+      case Severity::kERROR:
+        LOG(ERROR) << msg;
+        break;
+      default:
+        break;
+    }
+  }
+};
+
+class ScopedWeights {
+ public:
+  ScopedWeights(float value) : value_(value) {
+    w.type = nvinfer1::DataType::kFLOAT;
+    w.values = &value_;
+    w.count = 1;
+  }
+  const nvinfer1::Weights& get() { return w; }
+
+ private:
+  float value_;
+  nvinfer1::Weights w;
+};
+
+// The following two API are implemented in TensorRT's header file, cannot load
+// from the dynamic library. So create our own implementation and directly
+// trigger the method from the dynamic library.
+nvinfer1::IBuilder* createInferBuilder(nvinfer1::ILogger& logger) {
+  return static_cast<nvinfer1::IBuilder*>(
+      dy::createInferBuilder_INTERNAL(&logger, NV_TENSORRT_VERSION));
+}
+nvinfer1::IRuntime* createInferRuntime(nvinfer1::ILogger& logger) {
+  return static_cast<nvinfer1::IRuntime*>(
+      dy::createInferRuntime_INTERNAL(&logger, NV_TENSORRT_VERSION));
+}
+
+const char* kInputTensor = "input";
+const char* kOutputTensor = "output";
+
+// Creates a network to compute y = 2x + 3
+nvinfer1::IHostMemory* CreateNetwork() {
+  Logger logger;
+  // Create the engine.
+  nvinfer1::IBuilder* builder = createInferBuilder(logger);
+  ScopedWeights weights(2.);
+  ScopedWeights bias(3.);
+
+  nvinfer1::INetworkDefinition* network = builder->createNetwork();
+  // Add the input
+  auto input = network->addInput(kInputTensor, nvinfer1::DataType::kFLOAT,
+                                 nvinfer1::DimsCHW{1, 1, 1});
+  EXPECT_NE(input, nullptr);
+  // Add the hidden layer.
+  auto layer = network->addFullyConnected(*input, 1, weights.get(), bias.get());
+  EXPECT_NE(layer, nullptr);
+  // Mark the output.
+  auto output = layer->getOutput(0);
+  output->setName(kOutputTensor);
+  network->markOutput(*output);
+  // Build the engine.
+  builder->setMaxBatchSize(1);
+  builder->setMaxWorkspaceSize(1 << 10);
+  auto engine = builder->buildCudaEngine(*network);
+  EXPECT_NE(engine, nullptr);
+  // Serialize the engine to create a model, then close.
+  nvinfer1::IHostMemory* model = engine->serialize();
+  network->destroy();
+  engine->destroy();
+  builder->destroy();
+  return model;
+}
+
+void Execute(nvinfer1::IExecutionContext& context, const float* input,
+             float* output) {
+  const nvinfer1::ICudaEngine& engine = context.getEngine();
+  // Two binds, input and output
+  ASSERT_EQ(engine.getNbBindings(), 2);
+  const int input_index = engine.getBindingIndex(kInputTensor);
+  const int output_index = engine.getBindingIndex(kOutputTensor);
+  // Create GPU buffers and a stream
+  void* buffers[2];
+  ASSERT_EQ(0, cudaMalloc(&buffers[input_index], sizeof(float)));
+  ASSERT_EQ(0, cudaMalloc(&buffers[output_index], sizeof(float)));
+  cudaStream_t stream;
+  ASSERT_EQ(0, cudaStreamCreate(&stream));
+  // Copy the input to the GPU, execute the network, and copy the output back.
+  ASSERT_EQ(0, cudaMemcpyAsync(buffers[input_index], input, sizeof(float),
+                               cudaMemcpyHostToDevice, stream));
+  context.enqueue(1, buffers, stream, nullptr);
+  ASSERT_EQ(0, cudaMemcpyAsync(output, buffers[output_index], sizeof(float),
+                               cudaMemcpyDeviceToHost, stream));
+  cudaStreamSynchronize(stream);
+
+  // Release the stream and the buffers
+  cudaStreamDestroy(stream);
+  ASSERT_EQ(0, cudaFree(buffers[input_index]));
+  ASSERT_EQ(0, cudaFree(buffers[output_index]));
+}
+
+TEST(TensorrtTest, BasicFunction) {
+  // Create the network serialized model.
+  nvinfer1::IHostMemory* model = CreateNetwork();
+
+  // Use the model to create an engine and an execution context.
+  Logger logger;
+  nvinfer1::IRuntime* runtime = createInferRuntime(logger);
+  nvinfer1::ICudaEngine* engine =
+      runtime->deserializeCudaEngine(model->data(), model->size(), nullptr);
+  model->destroy();
+  nvinfer1::IExecutionContext* context = engine->createExecutionContext();
+
+  // Execute the network.
+  float input = 1234;
+  float output;
+  Execute(*context, &input, &output);
+  EXPECT_EQ(output, input * 2 + 3);
+
+  // Destroy the engine.
+  context->destroy();
+  engine->destroy();
+  runtime->destroy();
+}
diff --git a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc
index ca2077d07411d2cd6095e0dc2a874af0890145c5..1e6555bb02033a28dedd2a1d1962981dfcc97cc2 100644
--- a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc
@@ -46,8 +46,8 @@ TEST(inference, image_classification) {
 
   // Run inference on CPU
   LOG(INFO) << "--- CPU Runs: ---";
-  TestInference<paddle::platform::CPUPlace, false>(dirname, cpu_feeds,
-                                                   cpu_fetchs1, FLAGS_repeat);
+  TestInference<paddle::platform::CPUPlace, false, true>(
+      dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat);
   LOG(INFO) << output1.dims();
 
 #ifdef PADDLE_WITH_CUDA
@@ -57,8 +57,8 @@ TEST(inference, image_classification) {
 
   // Run inference on CUDA GPU
   LOG(INFO) << "--- GPU Runs: ---";
-  TestInference<paddle::platform::CUDAPlace, false>(dirname, cpu_feeds,
-                                                    cpu_fetchs2, FLAGS_repeat);
+  TestInference<paddle::platform::CUDAPlace, false, true>(
+      dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat);
   LOG(INFO) << output2.dims();
 
   CheckError<float>(output1, output2);
diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h
index 064e400f0c750872ab2142c5fc8e28dd3da85b1a..c3a8d0889c6a6dd9591837ccc523da56f8d13661 100644
--- a/paddle/fluid/inference/tests/test_helper.h
+++ b/paddle/fluid/inference/tests/test_helper.h
@@ -89,7 +89,7 @@ void CheckError(const paddle::framework::LoDTensor& output1,
   EXPECT_EQ(count, 0U) << "There are " << count << " different elements.";
 }
 
-template <typename Place, bool CreateVars = true>
+template <typename Place, bool CreateVars = true, bool PrepareContext = false>
 void TestInference(const std::string& dirname,
                    const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
                    const std::vector<paddle::framework::LoDTensor*>& cpu_fetchs,
@@ -175,8 +175,15 @@ void TestInference(const std::string& dirname,
     }
 
     // Ignore the profiling results of the first run
-    executor.Run(*inference_program, scope, feed_targets, fetch_targets,
-                 CreateVars);
+    std::unique_ptr<paddle::framework::ExecutorPrepareContext> ctx;
+    if (PrepareContext) {
+      ctx = executor.Prepare(*inference_program, 0);
+      executor.RunPreparedContext(ctx.get(), scope, feed_targets, fetch_targets,
+                                  CreateVars);
+    } else {
+      executor.Run(*inference_program, scope, feed_targets, fetch_targets,
+                   CreateVars);
+    }
 
     // Enable the profiler
     paddle::platform::EnableProfiler(state);
@@ -187,8 +194,15 @@ void TestInference(const std::string& dirname,
           "run_inference",
           paddle::platform::DeviceContextPool::Instance().Get(place));
 
-      executor.Run(*inference_program, scope, feed_targets, fetch_targets,
-                   CreateVars);
+      if (PrepareContext) {
+        // Note: if you change the inference_program, you need to call
+        // executor.Prepare() again to get a new ExecutorPrepareContext.
+        executor.RunPreparedContext(ctx.get(), scope, feed_targets,
+                                    fetch_targets, CreateVars);
+      } else {
+        executor.Run(*inference_program, scope, feed_targets, fetch_targets,
+                     CreateVars);
+      }
     }
 
     // Disable the profiler and print the timing information
diff --git a/paddle/fluid/operators/average_accumulates_op.cu b/paddle/fluid/operators/average_accumulates_op.cu
index 046f72b471fa7ffcc82d84262a668c90a7f577a8..104e24f6ee2e2503d98f3a3991a903d8dbc4bdfe 100644
--- a/paddle/fluid/operators/average_accumulates_op.cu
+++ b/paddle/fluid/operators/average_accumulates_op.cu
@@ -25,12 +25,14 @@ void GetAccumulators<paddle::platform::CUDADeviceContext>(
   auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates");
   auto* in_num_updates = ctx.Input<Tensor>("in_num_updates");
   auto stream = ctx.cuda_device_context().stream();
-  memory::Copy(platform::CPUPlace(), old_num_accumulates_,
-               platform::CUDAPlace(), in_old_num_accumulates->data<int64_t>(),
-               sizeof(int64_t), stream);
-  memory::Copy(platform::CPUPlace(), num_accumulates_, platform::CUDAPlace(),
+  auto cuda_place =
+      boost::get<platform::CUDAPlace>(in_old_num_accumulates->place());
+  memory::Copy(platform::CPUPlace(), old_num_accumulates_, cuda_place,
+               in_old_num_accumulates->data<int64_t>(), sizeof(int64_t),
+               stream);
+  memory::Copy(platform::CPUPlace(), num_accumulates_, cuda_place,
                in_num_accumulates->data<int64_t>(), sizeof(int64_t), stream);
-  memory::Copy(platform::CPUPlace(), num_updates_, platform::CUDAPlace(),
+  memory::Copy(platform::CPUPlace(), num_updates_, cuda_place,
                in_num_updates->data<int64_t>(), sizeof(int64_t), stream);
 }
 
@@ -42,14 +44,16 @@ void SetAccumulators<paddle::platform::CUDADeviceContext>(
   auto* out_old_num_accumulates = ctx.Output<Tensor>("out_old_num_accumulates");
   auto* out_num_accumulates = ctx.Output<Tensor>("out_num_accumulates");
   auto* out_num_updates = ctx.Output<Tensor>("out_num_updates");
+  auto cuda_place =
+      boost::get<platform::CUDAPlace>(out_old_num_accumulates->place());
 
-  memory::Copy(platform::CUDAPlace(), out_old_num_accumulates->data<int64_t>(),
+  memory::Copy(cuda_place, out_old_num_accumulates->data<int64_t>(),
                platform::CPUPlace(), &old_num_accumulates_, sizeof(int64_t),
                stream);
-  memory::Copy(platform::CUDAPlace(), out_num_accumulates->data<int64_t>(),
+  memory::Copy(cuda_place, out_num_accumulates->data<int64_t>(),
                platform::CPUPlace(), &num_accumulates_, sizeof(int64_t),
                stream);
-  memory::Copy(platform::CUDAPlace(), out_num_updates->data<int64_t>(),
+  memory::Copy(cuda_place, out_num_updates->data<int64_t>(),
                platform::CPUPlace(), &num_updates_, sizeof(int64_t), stream);
 }
 
diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc
index 45f88ec8697d9f3de2612f28889fefc36f7ddbf9..661dfa69fe1580ff3890f12defcd124225be0c06 100644
--- a/paddle/fluid/operators/detail/grpc_client.cc
+++ b/paddle/fluid/operators/detail/grpc_client.cc
@@ -35,7 +35,8 @@ bool RPCClient::AsyncSendVariable(const std::string& ep,
   const framework::Scope* p_scope = &scope;
   const auto ch = GetChannel(ep_val);
 
-  framework::Async([var_name_val, p_ctx, ep_val, p_scope, time_out, ch, this] {
+  framework::AsyncIO([var_name_val, p_ctx, ep_val, p_scope, time_out, ch,
+                      this] {
     auto* var = p_scope->FindVar(var_name_val);
 
     ::grpc::ByteBuffer req;
@@ -89,7 +90,8 @@ bool RPCClient::AsyncGetVariable(const std::string& ep,
   const framework::Scope* p_scope = &scope;
   const auto ch = GetChannel(ep_val);
 
-  framework::Async([var_name_val, ep_val, p_scope, p_ctx, time_out, ch, this] {
+  framework::AsyncIO([var_name_val, ep_val, p_scope, p_ctx, time_out, ch,
+                      this] {
     // prepare input
     sendrecv::VariableMessage req;
     req.set_varname(var_name_val);
@@ -132,8 +134,8 @@ bool RPCClient::AsyncPrefetchVariable(const std::string& ep,
   const framework::Scope* p_scope = &scope;
   const auto ch = GetChannel(ep_val);
 
-  framework::Async([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx,
-                    time_out, ch, this] {
+  framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx,
+                      time_out, ch, this] {
     auto* var = p_scope->FindVar(in_var_name_val);
 
     ::grpc::ByteBuffer req;
@@ -196,7 +198,7 @@ bool RPCClient::Wait() {
   std::vector<std::future<void>> waits(req_count_);
 
   for (int i = 0; i < req_count_; i++) {
-    waits[i] = framework::Async([i, &a, this] { a[i] = Proceed(); });
+    waits[i] = framework::AsyncIO([i, &a, this] { a[i] = Proceed(); });
   }
 
   for (int i = 0; i < req_count_; i++) {
diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc
index 0b582a08bc0bfbcfdc8f338a6add8edaa5e80818..119e146e078e476b2768a8495ea63e468f952fd2 100644
--- a/paddle/fluid/operators/detail/grpc_server.cc
+++ b/paddle/fluid/operators/detail/grpc_server.cc
@@ -217,10 +217,10 @@ void AsyncGRPCServer::RunSyncUpdate() {
   std::function<void()> prefetch_register =
       std::bind(&AsyncGRPCServer::TryToRegisterNewPrefetchOne, this);
 
+  // TODO(wuyi): Run these "HandleRequest" in thread pool
   t_send_.reset(
       new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this,
                                 cq_send_.get(), "cq_send", send_register)));
-
   t_get_.reset(
       new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this,
                                 cq_get_.get(), "cq_get", get_register)));
diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu
index c28047e6e915280eed6886f99cd6d55704e3f4ad..9badf26c9bb80acad029be3d1b63377cef63d929 100644
--- a/paddle/fluid/operators/math/math_function.cu
+++ b/paddle/fluid/operators/math/math_function.cu
@@ -268,6 +268,7 @@ void batched_gemm<platform::CUDADeviceContext, float16>(
     const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
     const float16 alpha, const float16* A, const float16* B, const float16 beta,
     float16* C, const int batchCount, const int strideA, const int strideB) {
+#if CUDA_VERSION >= 8000
   // Note that cublas follows fortran order, so the order is different from
   // the cblas convention.
   int lda = (transA == CblasNoTrans) ? K : M;
@@ -289,7 +290,6 @@ void batched_gemm<platform::CUDADeviceContext, float16>(
   PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53,
                     "cublas Hgemm requires GPU compute capability >= 53");
 
-#if CUDA_VERSION >= 8000
   PADDLE_ENFORCE(platform::dynload::cublasHgemmStridedBatched(
       context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb,
       strideB, h_A, lda, strideA, &h_beta, h_C, ldc, strideC, batchCount));
@@ -304,6 +304,7 @@ void batched_gemm<platform::CUDADeviceContext, float>(
     const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
     const float alpha, const float* A, const float* B, const float beta,
     float* C, const int batchCount, const int strideA, const int strideB) {
+#if CUDA_VERSION >= 8000
   // Note that cublas follows fortran order, so the order is different from
   // the cblas convention.
   int lda = (transA == CblasNoTrans) ? K : M;
@@ -315,7 +316,6 @@ void batched_gemm<platform::CUDADeviceContext, float>(
       (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
   const int strideC = M * N;
 
-#if CUDA_VERSION >= 8000
   PADDLE_ENFORCE(platform::dynload::cublasSgemmStridedBatched(
       context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb,
       strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount));
@@ -330,6 +330,7 @@ void batched_gemm<platform::CUDADeviceContext, double>(
     const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
     const double alpha, const double* A, const double* B, const double beta,
     double* C, const int batchCount, const int strideA, const int strideB) {
+#if CUDA_VERSION >= 8000
   // Note that cublas follows fortran order, so the order is different from
   // the cblas convention.
   int lda = (transA == CblasNoTrans) ? K : M;
@@ -341,7 +342,6 @@ void batched_gemm<platform::CUDADeviceContext, double>(
       (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
   const int strideC = M * N;
 
-#if CUDA_VERSION >= 8000
   PADDLE_ENFORCE(platform::dynload::cublasDgemmStridedBatched(
       context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb,
       strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount));
diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
index 33a50b5cebc1f65ccf9a00280f0eeadd00982555..0b7c1d6af714558d35918dac62d92d9e0f86c970 100644
--- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
@@ -33,28 +33,14 @@ static constexpr size_t kChannelSize = 0;  // kCacheSize - 2
 
 class DoubleBufferReader : public framework::DecoratedReader {
  public:
-  struct Item {
-    Item() : ctx_(nullptr) {}
-    Item(Item&& b) {
-      payloads_ = std::move(b.payloads_);
-      ctx_ = std::move(b.ctx_);
-    }
-    Item& operator=(Item&& b) {
-      payloads_ = std::move(b.payloads_);
-      ctx_ = std::move(b.ctx_);
-      return *this;
-    }
-
-    std::vector<framework::LoDTensor> payloads_;
-    platform::DeviceContext* ctx_;
-  };
-
   explicit DoubleBufferReader(
       ReaderBase* reader, platform::Place target_place = platform::CPUPlace())
       : DecoratedReader(reader), place_(target_place) {
+    cpu_tensor_cache_.resize(kCacheSize);
+    gpu_tensor_cache_.resize(kCacheSize);
 #ifdef PADDLE_WITH_CUDA
-    for (size_t i = 0; i < kCacheSize; ++i) {
-      if (platform::is_gpu_place(place_)) {
+    if (platform::is_gpu_place(place_)) {
+      for (size_t i = 0; i < kCacheSize; ++i) {
         ctxs_.emplace_back(new platform::CUDADeviceContext(
             boost::get<platform::CUDAPlace>(place_)));
       }
@@ -72,7 +58,7 @@ class DoubleBufferReader : public framework::DecoratedReader {
   bool HasNext() const;
 
   void StartPrefetcher() {
-    channel_ = framework::MakeChannel<Item>(kChannelSize);
+    channel_ = framework::MakeChannel<size_t>(kChannelSize);
     prefetcher_ = std::thread([this] { PrefetchThreadFunc(); });
   }
 
@@ -88,8 +74,10 @@ class DoubleBufferReader : public framework::DecoratedReader {
   void PrefetchThreadFunc();
 
   std::thread prefetcher_;
-  framework::Channel<Item>* channel_;
+  framework::Channel<size_t>* channel_;
   platform::Place place_;
+  std::vector<std::vector<framework::LoDTensor>> cpu_tensor_cache_;
+  std::vector<std::vector<framework::LoDTensor>> gpu_tensor_cache_;
   std::vector<std::unique_ptr<platform::DeviceContext>> ctxs_;
 };
 
@@ -153,11 +141,14 @@ class CreateDoubleBufferReaderOpMaker : public DecoratedReaderMakerBase {
 void DoubleBufferReader::ReadNext(std::vector<framework::LoDTensor>* out) {
   out->clear();
   if (HasNext()) {
-    Item batch;
-    channel_->Receive(&batch);
-    *out = batch.payloads_;
-    if (batch.ctx_) {
-      batch.ctx_->Wait();
+    size_t cached_tensor_id;
+    channel_->Receive(&cached_tensor_id);
+    if (platform::is_gpu_place(place_)) {
+      *out = gpu_tensor_cache_[cached_tensor_id];
+      ctxs_[cached_tensor_id]->Wait();
+    } else {
+      // CPU place
+      *out = cpu_tensor_cache_[cached_tensor_id];
     }
   }
 }
@@ -176,42 +167,33 @@ bool DoubleBufferReader::HasNext() const {
 
 void DoubleBufferReader::PrefetchThreadFunc() {
   VLOG(5) << "A new prefetch thread starts.";
-  std::vector<std::vector<framework::LoDTensor>> cpu_tensor_cache(kCacheSize);
-  std::vector<std::vector<framework::LoDTensor>> gpu_tensor_cache(kCacheSize);
   size_t cached_tensor_id = 0;
-
   while (true) {
-    Item batch;
-    auto& cpu_batch = cpu_tensor_cache[cached_tensor_id];
+    auto& cpu_batch = cpu_tensor_cache_[cached_tensor_id];
     reader_->ReadNext(&cpu_batch);
     if (cpu_batch.empty()) {
       // The underlying reader have no next data.
       break;
     }
     if (platform::is_gpu_place(place_)) {
-      auto& gpu_batch = gpu_tensor_cache[cached_tensor_id];
+      auto& gpu_batch = gpu_tensor_cache_[cached_tensor_id];
       auto* gpu_ctx = ctxs_[cached_tensor_id].get();
       gpu_batch.resize(cpu_batch.size());
       for (size_t i = 0; i < cpu_batch.size(); ++i) {
         framework::TensorCopy(cpu_batch[i], place_, *gpu_ctx, &gpu_batch[i]);
         gpu_batch[i].set_lod(cpu_batch[i].lod());
       }
-      batch.payloads_ = gpu_batch;
-      batch.ctx_ = gpu_ctx;
-    } else {
-      // CPUPlace
-      batch.payloads_ = cpu_batch;
     }
-    ++cached_tensor_id;
-    cached_tensor_id %= kCacheSize;
-
     try {
-      channel_->Send(&batch);
+      size_t tmp = cached_tensor_id;
+      channel_->Send(&tmp);
     } catch (paddle::platform::EnforceNotMet e) {
       VLOG(5) << "WARNING: The double buffer channel has been closed. The "
                  "prefetch thread will terminate.";
       break;
     }
+    ++cached_tensor_id;
+    cached_tensor_id %= kCacheSize;
   }
   channel_->Close();
   VLOG(5) << "Prefetch thread terminates.";
diff --git a/paddle/fluid/operators/reshape_op.h b/paddle/fluid/operators/reshape_op.h
index 807e5ad951b893a4c027a96d743f0606b70cf160..8320c257c9ab15efec29eabe99eca5b6f74c9e31 100644
--- a/paddle/fluid/operators/reshape_op.h
+++ b/paddle/fluid/operators/reshape_op.h
@@ -60,7 +60,7 @@ class ReshapeOp : public framework::OperatorWithKernel {
   static framework::DDim ValidateShape(const std::vector<int> shape,
                                        const framework::DDim &in_dims) {
     const int64_t in_size = framework::product(in_dims);
-    // only one dimension canbe set to -1, whose size will be automatically
+    // only one dimension can be set to -1, whose size will be automatically
     // infered.
     const int64_t unk_dim_val = -1;
     const int64_t copy_dim_val = 0;
@@ -119,13 +119,15 @@ class ReshapeKernel : public framework::OpKernel<T> {
     auto *shape_tensor = ctx.Input<framework::LoDTensor>("Shape");
 
     framework::DDim out_dims = out->dims();
+
     if (shape_tensor) {
       auto *shape_data = shape_tensor->data<int>();
+      framework::Tensor cpu_shape_tensor;
       if (platform::is_gpu_place(ctx.GetPlace())) {
-        framework::Tensor cpu_shape_tensor;
         TensorCopy(*shape_tensor, platform::CPUPlace(), ctx.device_context(),
                    &cpu_shape_tensor);
         shape_data = cpu_shape_tensor.data<int>();
+        ctx.device_context().Wait();
       }
       auto shape =
           std::vector<int>(shape_data, shape_data + shape_tensor->numel());
@@ -145,6 +147,7 @@ class ReshapeKernel : public framework::OpKernel<T> {
     if (!inplace) {
       out->mutable_data<T>(ctx.GetPlace());
       framework::TensorCopy(*in, ctx.GetPlace(), ctx.device_context(), out);
+      ctx.device_context().Wait();
       // TensorCopy will resize to in_dims.
       out->Resize(out_dims);
     } else {
@@ -167,6 +170,7 @@ class ReshapeGradKernel : public framework::OpKernel<T> {
     auto in_dims = d_x->dims();
     if (!inplace) {
       framework::TensorCopy(*d_out, ctx.GetPlace(), ctx.device_context(), d_x);
+      ctx.device_context().Wait();
       d_x->Resize(in_dims);
     } else {
       d_x->ShareDataWith(*d_out);
diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc
index 7ca7639fdb9b4c0fe5fe059a1cad1a22987d47e4..1e938638c9182972a2ae2436166ff0aa49efd4be 100644
--- a/paddle/fluid/operators/scale_op.cc
+++ b/paddle/fluid/operators/scale_op.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/scale_op.h"
-
 #include <string>
 
 namespace paddle {
diff --git a/paddle/fluid/operators/scatter_op.cu b/paddle/fluid/operators/scatter_op.cu
index ef7d700659d8d713715a10910baf739954ba0786..a70b9091727935ddcbb83dd5775729969f7d64e5 100644
--- a/paddle/fluid/operators/scatter_op.cu
+++ b/paddle/fluid/operators/scatter_op.cu
@@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "gather.cu.h"
+#include "paddle/fluid/operators/gather.cu.h"
 #include "paddle/fluid/operators/gather_op.h"
-#include "scatter.cu.h"
+#include "paddle/fluid/operators/scatter.cu.h"
+#include "paddle/fluid/operators/scatter_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/scatter_op.h b/paddle/fluid/operators/scatter_op.h
index 2151d8a9240fc88966533f4a07d5cf56b6c1c3bc..d29947b55e751a3e7993f765198364f4debe2472 100644
--- a/paddle/fluid/operators/scatter_op.h
+++ b/paddle/fluid/operators/scatter_op.h
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "gather.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "scatter.h"
+#include "paddle/fluid/operators/gather.h"
+#include "paddle/fluid/operators/scatter.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/scatter_test.cc b/paddle/fluid/operators/scatter_test.cc
index b67af3c3710eafc57b660a48e4c340d5eefe7e5b..750245153a7df6c4a7ce088038005dcab1685b5f 100644
--- a/paddle/fluid/operators/scatter_test.cc
+++ b/paddle/fluid/operators/scatter_test.cc
@@ -13,44 +13,48 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/scatter.h"
-#include "paddle/fluid/framework/ddim.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/platform/place.h"
-
 #include <gtest/gtest.h>
 #include <iostream>
 #include <string>
+#include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/platform/place.h"
 
 TEST(scatter, ScatterUpdate) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-  using namespace paddle::operators;
+  // using namespace paddle::framework;
+  // using namespace paddle::platform;
+  // using namespace paddle::operators;
 
-  Tensor* src = new Tensor();
-  Tensor* index = new Tensor();
-  Tensor* output = new Tensor();
+  paddle::framework::Tensor* src = new paddle::framework::Tensor();
+  paddle::framework::Tensor* index = new paddle::framework::Tensor();
+  paddle::framework::Tensor* output = new paddle::framework::Tensor();
 
   float* p_src = nullptr;
   int* p_index = nullptr;
-  p_src = src->mutable_data<float>(make_ddim({1, 4}), CPUPlace());
-  p_index = index->mutable_data<int>(make_ddim({1}), CPUPlace());
+  p_src = src->mutable_data<float>(paddle::framework::make_ddim({1, 4}),
+                                   paddle::platform::CPUPlace());
+  p_index = index->mutable_data<int>(paddle::framework::make_ddim({1}),
+                                     paddle::platform::CPUPlace());
 
-  for (size_t i = 0; i < 4; ++i) p_src[i] = float(i);
+  for (size_t i = 0; i < 4; ++i) p_src[i] = static_cast<float>(i);
   p_index[0] = 1;
 
-  float* p_output = output->mutable_data<float>(make_ddim({4, 4}), CPUPlace());
+  float* p_output = output->mutable_data<float>(
+      paddle::framework::make_ddim({4, 4}), paddle::platform::CPUPlace());
 
   auto* cpu_place = new paddle::platform::CPUPlace();
   paddle::platform::CPUDeviceContext ctx(*cpu_place);
-  ScatterAssign<float>(ctx, *src, *index, output);
+  paddle::operators::ScatterAssign<float>(ctx, *src, *index, output);
 
-  for (size_t i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], float(0));
-  for (size_t i = 0; i < 4; ++i) EXPECT_EQ(output->data<float>()[i], float(0));
-  for (size_t i = 4; i < 8; ++i) EXPECT_EQ(p_output[i], float(i - 4));
+  for (size_t i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], 0.0f);
+  for (size_t i = 0; i < 4; ++i) EXPECT_EQ(output->data<float>()[i], 0.0f);
+  for (size_t i = 4; i < 8; ++i) {
+    EXPECT_EQ(p_output[i], static_cast<float>(i - 4));
+  }
   for (size_t i = 4; i < 8; ++i)
-    EXPECT_EQ(output->data<float>()[i], float(i - 4));
-  for (size_t i = 8; i < 16; ++i) EXPECT_EQ(p_output[i], float(0));
-  for (size_t i = 8; i < 16; ++i) EXPECT_EQ(output->data<float>()[i], float(0));
+    EXPECT_EQ(output->data<float>()[i], static_cast<float>(i - 4));
+  for (size_t i = 8; i < 16; ++i) EXPECT_EQ(p_output[i], 0.0f);
+  for (size_t i = 8; i < 16; ++i) EXPECT_EQ(output->data<float>()[i], 0.0f);
 
   delete src;
   delete index;
diff --git a/paddle/fluid/operators/send_barrier_op.cc b/paddle/fluid/operators/send_barrier_op.cc
index 8d02a6f29177536562e38372eb0336424aa0a47c..12b844daaa33162b86b7daffa2e4c49785701662 100644
--- a/paddle/fluid/operators/send_barrier_op.cc
+++ b/paddle/fluid/operators/send_barrier_op.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <future>  // NOLINT
 #include <ostream>
 
 #include "paddle/fluid/framework/data_type.h"
@@ -19,7 +20,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 
-#include <future>
 #include "paddle/fluid/operators/detail/grpc_client.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/send_op.cc b/paddle/fluid/operators/send_op.cc
index d47f66de2161dce7ed162db4c2e23859e19596cb..82ff087d0a7a4b482aef842e618f593b17dca171 100644
--- a/paddle/fluid/operators/send_op.cc
+++ b/paddle/fluid/operators/send_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <future>
+#include <future>  // NOLINT
 #include <ostream>
 
 #include "paddle/fluid/framework/data_type.h"
diff --git a/paddle/fluid/operators/send_recv_util.h b/paddle/fluid/operators/send_recv_util.h
index 196f56f6340a75b599b8dd15957dfe6835f9bf59..113513eb6b327773ab4a1c062fb8a3f06fddfbca 100644
--- a/paddle/fluid/operators/send_recv_util.h
+++ b/paddle/fluid/operators/send_recv_util.h
@@ -12,6 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#pragma once
+#include <string>
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/sequence_concat_op.h b/paddle/fluid/operators/sequence_concat_op.h
index 9f04c4199130de3cead6f23ef111453ca752c0e3..71c9f45287c29628a2f2c8c649e9e5270317ef6a 100644
--- a/paddle/fluid/operators/sequence_concat_op.h
+++ b/paddle/fluid/operators/sequence_concat_op.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/strided_memcpy.h"
 
diff --git a/paddle/fluid/operators/sequence_conv_op.h b/paddle/fluid/operators/sequence_conv_op.h
index ee48339c52e348e7b3060bbdd462177375aee9f5..b59504bb9893b720247841bdad5aa577992b7fb6 100644
--- a/paddle/fluid/operators/sequence_conv_op.h
+++ b/paddle/fluid/operators/sequence_conv_op.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <algorithm>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/context_project.h"
 #include "paddle/fluid/operators/math/math_function.h"
diff --git a/paddle/fluid/operators/sequence_erase_op.cc b/paddle/fluid/operators/sequence_erase_op.cc
index 32b9d7f7c1528a365cd21122e4df0e3c1407a49e..73c0e89512972cda002bd902ee0c78b4b77d8502 100644
--- a/paddle/fluid/operators/sequence_erase_op.cc
+++ b/paddle/fluid/operators/sequence_erase_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/sequence_erase_op.h"
+#include <vector>
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/sequence_erase_op.h b/paddle/fluid/operators/sequence_erase_op.h
index b490c34f543875f73e0862c08c25bcb57611e2f4..265390528a15aa060900276f98128d754fc907fe 100644
--- a/paddle/fluid/operators/sequence_erase_op.h
+++ b/paddle/fluid/operators/sequence_erase_op.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/sequence_expand_op.cc b/paddle/fluid/operators/sequence_expand_op.cc
index 786fe63e7580ce16b946d5049a490eed2c3c6ced..ae52849162ae4d78cc69ddbb98f58059f55683cb 100644
--- a/paddle/fluid/operators/sequence_expand_op.cc
+++ b/paddle/fluid/operators/sequence_expand_op.cc
@@ -84,12 +84,11 @@ class SequenceExpandOp : public framework::OperatorWithKernel {
         }
       }
       out_dims[0] = out_first_dim;
-      ctx->SetOutputDim("Out", out_dims);
     } else {
       out_dims[0] = -1;
-      ctx->SetOutputDim("Out", out_dims);
-      ctx->ShareLoD("X", /*->*/ "Out");
     }
+    ctx->SetOutputDim("Out", out_dims);
+    ctx->ShareLoD("X", /*->*/ "Out");
   }
 };
 
diff --git a/paddle/fluid/operators/sequence_expand_op.cu b/paddle/fluid/operators/sequence_expand_op.cu
index bb51bb2902eea797de3449fcb6c8b52b4f0e7fbf..c00765e5d59af068e5682b39ebace5f3d7a62250 100644
--- a/paddle/fluid/operators/sequence_expand_op.cu
+++ b/paddle/fluid/operators/sequence_expand_op.cu
@@ -12,8 +12,135 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#define EIGEN_USE_GPU
+#include <algorithm>
 #include "paddle/fluid/operators/sequence_expand_op.h"
+#include "paddle/fluid/platform/cuda_helper.h"
+
+namespace paddle {
+namespace operators {
+
+using LoDTensor = framework::LoDTensor;
+
+template <typename T>
+__global__ void sequence_expand_kernel(const T* x_data, const size_t* x_lod,
+                                       const size_t* ref_lod,
+                                       const size_t* offset,
+                                       const size_t lod_size,
+                                       /* default=1,
+                                          the instance length*/
+                                       const int x_item_length, T* out_data) {
+  int bid = blockIdx.x;
+  if (bid >= lod_size - 1) return;
+
+  int x_item_count = x_lod[bid + 1] - x_lod[bid];
+  int repeats = ref_lod[bid + 1] - ref_lod[bid];
+  int out_offset = static_cast<int>(offset[bid]);
+  int x_offset = x_lod[bid];
+  for (int tid_z = threadIdx.z; tid_z < repeats; tid_z += blockDim.z) {
+    for (int tid_y = threadIdx.y; tid_y < x_item_count; tid_y += blockDim.y) {
+      for (int tid_x = threadIdx.x; tid_x < x_item_length;
+           tid_x += blockDim.x) {
+        out_data[(out_offset + tid_z * x_item_count + tid_y) * x_item_length +
+                 tid_x] = x_data[(x_offset + tid_y) * x_item_length + tid_x];
+      }
+    }
+  }
+}
+
+template <typename T>
+__global__ void sequence_expand_grad_kernel(
+    const T* dout_data, const size_t* ref_lod, const size_t* dx_lod,
+    const size_t* offset, const size_t lod_size,
+    /* default=1,
+       the instance length*/
+    const int x_item_length, T* dx_data) {
+  int bid = blockIdx.x;
+  if (bid >= lod_size - 1) return;
+  int x_item_count = dx_lod[bid + 1] - dx_lod[bid];
+  int repeats = ref_lod[bid + 1] - ref_lod[bid];
+  int out_offset = static_cast<int>(offset[bid]);
+  int x_offset = dx_lod[bid];
+
+  for (int tid_z = threadIdx.z; tid_z < repeats; tid_z += blockDim.z) {
+    for (int tid_y = threadIdx.y; tid_y < x_item_count; tid_y += blockDim.y) {
+      for (int tid_x = threadIdx.x; tid_x < x_item_length;
+           tid_x += blockDim.x) {
+        platform::CudaAtomicAdd(
+            &dx_data[(x_offset + tid_y) * x_item_length + tid_x],
+            dout_data[(out_offset + tid_z * x_item_count + tid_y) *
+                          x_item_length +
+                      tid_x]);
+      }
+    }
+  }
+}
+
+void GetOutputOffset(const framework::Vector<size_t>& x_lod,
+                     const framework::Vector<size_t>& ref_lod,
+                     framework::Vector<size_t>* out_offset) {
+  size_t offset = 0;
+  int lod_size = static_cast<int>(x_lod.size());
+  for (int i = 0; i < static_cast<int>(x_lod.size()); ++i) {
+    (*out_offset)[i] = offset;
+    if (i < lod_size - 1) {
+      offset += (ref_lod[i + 1] - ref_lod[i]) * (x_lod[i + 1] - x_lod[i]);
+    }
+  }
+}
+
+template <typename T>
+struct SequenceExpandFunctor<platform::CUDADeviceContext, T> {
+  void operator()(
+      const platform::CUDADeviceContext& context, const LoDTensor& x,
+      const framework::Vector<size_t>& x_lod,   /*expand source lod*/
+      const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
+      LoDTensor* out) {
+    int x_item_length = x.numel() / x.dims()[0];
+    framework::Vector<size_t> out_offset(x_lod.size());
+    GetOutputOffset(x_lod, ref_lod, &out_offset);
+
+    int thread_x = std::min(32, std::max(static_cast<int>(ref_lod.size()), 16));
+    int thread_y = 16;
+    int thread_z = 1024 / thread_x / thread_y;
+    int block_x = static_cast<int>(ref_lod.size());
+    dim3 block_size(thread_x, thread_y, thread_z);
+    dim3 grid_size(block_x, 1);
+
+    sequence_expand_kernel<<<grid_size, block_size, 0, context.stream()>>>(
+        x.data<T>(), x_lod.CUDAData(context.GetPlace()),
+        ref_lod.CUDAData(context.GetPlace()),
+        out_offset.CUDAData(context.GetPlace()), x_lod.size(), x_item_length,
+        out->mutable_data<T>(context.GetPlace()));
+  }
+};
+
+template <typename T>
+struct SequenceExpandGradFunctor<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& context,
+                  const LoDTensor& dout,
+                  const framework::Vector<size_t>& x_lod, /*expand source lod*/
+                  const framework::Vector<size_t>& ref_lod, /*expand based lod*/
+                  LoDTensor* dx) {
+    int x_item_length = framework::product(dx->dims()) / dx->dims()[0];
+    framework::Vector<size_t> out_offset(x_lod.size());
+    GetOutputOffset(x_lod, ref_lod, &out_offset);
+
+    int thread_x = std::min(32, std::max(static_cast<int>(ref_lod.size()), 16));
+    int thread_y = 16;
+    int thread_z = 1024 / thread_x / thread_y;
+    int block_x = static_cast<int>(ref_lod.size());
+    dim3 block_size(thread_x, thread_y, thread_z);
+    dim3 grid_size(block_x, 1);
+    sequence_expand_grad_kernel<<<grid_size, block_size, 0, context.stream()>>>(
+        dout.data<T>(), ref_lod.CUDAData(context.GetPlace()),
+        x_lod.CUDAData(context.GetPlace()),
+        out_offset.CUDAData(context.GetPlace()), ref_lod.size(), x_item_length,
+        dx->mutable_data<T>(context.GetPlace()));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
diff --git a/paddle/fluid/operators/sequence_expand_op.h b/paddle/fluid/operators/sequence_expand_op.h
index db7d8bd6821fabd9714a160970558291ec47197f..d62c387c3eebf9df0ab532f4e891da006f239468 100644
--- a/paddle/fluid/operators/sequence_expand_op.h
+++ b/paddle/fluid/operators/sequence_expand_op.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <numeric>  // std::iota
 
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/memcpy.h"
@@ -26,6 +27,57 @@ template <typename T, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
+template <typename DeviceContext, typename T>
+struct SequenceExpandFunctor {
+  void operator()(
+      const DeviceContext& ctx, const LoDTensor& x,
+      const framework::Vector<size_t>& x_lod,   /*expand source lod*/
+      const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
+      LoDTensor* out);
+};
+
+template <typename DeviceContext, typename T>
+struct SequenceExpandGradFunctor {
+  void operator()(
+      const DeviceContext& ctx, const LoDTensor& dout,
+      const framework::Vector<size_t>& x_lod,   /*expand source lod*/
+      const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
+      LoDTensor* dx);
+};
+
+template <typename T>
+struct SequenceExpandFunctor<platform::CPUDeviceContext, T> {
+  void operator()(
+      const platform::CPUDeviceContext& context, const LoDTensor& x,
+      const framework::Vector<size_t>& x_lod,   /*expand source lod*/
+      const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
+      LoDTensor* out) {
+    int out_offset = 0;
+    auto& eigen_place = *context.eigen_device();
+    for (size_t i = 1; i < ref_lod.size(); ++i) {
+      int repeat_num = ref_lod[i] - ref_lod[i - 1];
+      int x_start = x_lod[i - 1];
+      int x_end = x_lod[i];
+      int x_seq_len = x_end - x_start;
+      if (repeat_num > 0) {
+        auto x_sub_tensor = x.Slice(x_start, x_end);
+        x_sub_tensor.Resize({1, x_sub_tensor.numel()});
+        int out_start = out_offset;
+        if (out->lod().size() == 1) {
+          out_start = out->lod()[0][out_offset];
+        }
+        auto out_sub_tensor =
+            out->Slice(out_start, out_start + x_seq_len * repeat_num);
+        out_sub_tensor.Resize({repeat_num, x_sub_tensor.dims()[1]});
+        EigenMatrix<T>::From(out_sub_tensor).device(eigen_place) =
+            EigenMatrix<T>::From(x_sub_tensor)
+                .broadcast(Eigen::array<int, 2>({{repeat_num, 1}}));
+      }
+      out_offset += repeat_num;
+    }
+  }
+};
+
 template <typename DeviceContext, typename T>
 class SequenceExpandKernel : public framework::OpKernel<T> {
  public:
@@ -47,45 +99,36 @@ class SequenceExpandKernel : public framework::OpKernel<T> {
       return;
     }
 
-    auto& out_lod = *out->mutable_lod();
+    // x lod level is at most 1.
+    framework::Vector<size_t> out_lod;
     if (x_lod.size() == 1) {
-      out_lod.resize(1);
-      out_lod[0] = {0};
-    }
-
-    int out_offset = 0;
-    auto& eigen_place =
-        *context.template device_context<DeviceContext>().eigen_device();
-    for (size_t i = 1; i < y_lod[ref_level].size(); ++i) {
-      int repeat_num = y_lod[ref_level][i] - y_lod[ref_level][i - 1];
-      int x_start = i - 1;
-      int x_end = i;
-      if (x_lod.size() == 1) {
-        x_start = x_lod[0][i - 1];
-        x_end = x_lod[0][i];
-      }
-      int x_seq_len = x_end - x_start;
-      if (repeat_num > 0) {
-        auto x_sub_tensor = x->Slice(x_start, x_end);
-        x_sub_tensor.Resize({1, x_sub_tensor.numel()});
-        int out_start = out_offset;
-        if (x_lod.size() == 1) {
-          out_start = out_lod[0][out_offset];
-        }
-        auto out_sub_tensor =
-            out->Slice(out_start, out_start + x_seq_len * repeat_num);
-        out_sub_tensor.Resize({repeat_num, x_sub_tensor.dims()[1]});
-        EigenMatrix<T>::From(out_sub_tensor).device(eigen_place) =
-            EigenMatrix<T>::From(x_sub_tensor)
-                .broadcast(Eigen::array<int, 2>({{repeat_num, 1}}));
-      }
-      for (int j = 0; j < repeat_num; ++j) {
-        if (x_lod.size() == 1) {
-          out_lod[0].push_back(out_lod[0].back() + x_seq_len);
+      out_lod.push_back(0);
+      int out_offset = 0;
+      for (size_t i = 1; i < y_lod[ref_level].size(); ++i) {
+        int repeat_num = y_lod[ref_level][i] - y_lod[ref_level][i - 1];
+        int x_start = x_lod[0][i - 1];
+        int x_end = x_lod[0][i];
+        int x_seq_len = x_end - x_start;
+        for (int j = 0; j < repeat_num; ++j) {
+          out_lod.push_back(out_lod.back() + x_seq_len);
+          out_offset++;
         }
-        out_offset++;
       }
+      // write lod to out if x has lod
+      auto& ref_lod = *out->mutable_lod();
+      ref_lod[0] = out_lod;
     }
+    framework::Vector<size_t> ref_x_lod;
+    if (x->lod().size() == 1) {
+      ref_x_lod = x->lod()[0];
+    } else {
+      // x_lod doesn't has lod, use fake x lod, level = 0
+      ref_x_lod.resize(x->dims()[0] + 1);
+      std::iota(ref_x_lod.begin(), ref_x_lod.end(), 0);
+    }
+    SequenceExpandFunctor<DeviceContext, T> functor;
+    functor(context.template device_context<DeviceContext>(), *x, ref_x_lod,
+            y_lod[ref_level], out);
   }
 };
 
@@ -101,6 +144,36 @@ class SequenceExpandKernel : public framework::OpKernel<T> {
  *    Grad(X).lod = Input(X).lod
  *
  * */
+template <typename T>
+struct SequenceExpandGradFunctor<platform::CPUDeviceContext, T> {
+  void operator()(
+      const platform::CPUDeviceContext& context, const LoDTensor& dout,
+      const framework::Vector<size_t>& x_lod,   /*expand source lod*/
+      const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
+      LoDTensor* dx) {
+    math::SetConstant<platform::CPUDeviceContext, T> set_zero;
+    set_zero(context, dx, static_cast<T>(0));
+
+    int dout_offset = 0;
+    for (size_t i = 1; i < ref_lod.size(); ++i) {
+      int repeat_num = ref_lod[i] - ref_lod[i - 1];
+      if (repeat_num > 0) {
+        int x_start = x_lod[i - 1];
+        int x_end = x_lod[i];
+        int x_seq_len = x_end - x_start;
+        auto dx_sub = dx->Slice(x_start, x_end);
+        dx_sub.Resize(flatten_to_1d(dx_sub.dims()));
+        int dout_end = dout_offset + repeat_num * x_seq_len;
+        auto dout_sub = dout.Slice(dout_offset, dout_end);
+        dout_sub.Resize({repeat_num, dx_sub.dims()[0]});
+        math::ColwiseSum<platform::CPUDeviceContext, T> col_sum;
+        col_sum(context, dout_sub, &dx_sub);
+        dout_offset += repeat_num * x_seq_len;
+      }
+    }
+  }
+};
+
 template <typename DeviceContext, typename T>
 class SequenceExpandGradKernel : public framework::OpKernel<T> {
  public:
@@ -114,43 +187,26 @@ class SequenceExpandGradKernel : public framework::OpKernel<T> {
     g_x->mutable_data<T>(context.GetPlace());
     g_x->set_lod(x->lod());
 
-    auto& x_lod = x->lod();
     auto& y_lod = y->lod();
-
     if (ref_level == -1) ref_level = y_lod.size() - 1;
-
     // just copy the gradient
     if (y_lod[ref_level].size() <= 1) {
       framework::TensorCopy(*g_out, context.GetPlace(), g_x);
       return;
     }
 
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-
-    math::SetConstant<DeviceContext, T> set_zero;
-    set_zero(dev_ctx, g_x, static_cast<T>(0));
-
-    int g_out_offset = 0;
-    for (size_t i = 1; i < y_lod[ref_level].size(); ++i) {
-      int repeat_num = y_lod[ref_level][i] - y_lod[ref_level][i - 1];
-      if (repeat_num > 0) {
-        int x_start = i - 1;
-        int x_end = i;
-        if (x_lod.size() == 1) {
-          x_start = x_lod[0][i - 1];
-          x_end = x_lod[0][i];
-        }
-        int x_seq_len = x_end - x_start;
-        auto g_x_sub = g_x->Slice(x_start, x_end);
-        g_x_sub.Resize(flatten_to_1d(g_x_sub.dims()));
-        int g_out_end = g_out_offset + repeat_num * x_seq_len;
-        auto g_out_sub = g_out->Slice(g_out_offset, g_out_end);
-        g_out_sub.Resize({repeat_num, g_x_sub.dims()[0]});
-        math::ColwiseSum<DeviceContext, T> col_sum;
-        col_sum(dev_ctx, g_out_sub, &g_x_sub);
-        g_out_offset += repeat_num * x_seq_len;
-      }
+    framework::Vector<size_t> ref_x_lod;
+    framework::Vector<size_t> ref_lod = y_lod[ref_level];
+    if (x->lod().size() == 1) {
+      ref_x_lod = x->lod()[0];
+    } else {
+      // x_lod doesn't has lod, use fake x lod, level = 0
+      ref_x_lod.resize(x->dims()[0] + 1);
+      std::iota(ref_x_lod.begin(), ref_x_lod.end(), 0);
     }
+    SequenceExpandGradFunctor<DeviceContext, T> functor;
+    functor(context.template device_context<DeviceContext>(), *g_out, ref_x_lod,
+            ref_lod, g_x);
   }
 };
 
diff --git a/paddle/fluid/operators/sequence_pool_op.cc b/paddle/fluid/operators/sequence_pool_op.cc
index 3d4d54a3a3f292d34b1a7645a0db4bdd3208ba6d..933c8c26239d49221819a583f999389ed6fb6cb6 100644
--- a/paddle/fluid/operators/sequence_pool_op.cc
+++ b/paddle/fluid/operators/sequence_pool_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/sequence_pool_op.h"
+#include <string>
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/sequence_pool_op.h b/paddle/fluid/operators/sequence_pool_op.h
index c58d677c92b7a20eb54dc5f9a447566e91bdc3d4..2aa20792f24305a106c500a3d7a6e3d363bc31d8 100644
--- a/paddle/fluid/operators/sequence_pool_op.h
+++ b/paddle/fluid/operators/sequence_pool_op.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <string>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
diff --git a/paddle/fluid/operators/sequence_softmax_op.cc b/paddle/fluid/operators/sequence_softmax_op.cc
index e8b4df04286d327f568f4c43886f9fcf89cc4a88..d2c1317bef95deca36f7f4198407f5350a1be035 100644
--- a/paddle/fluid/operators/sequence_softmax_op.cc
+++ b/paddle/fluid/operators/sequence_softmax_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/sequence_softmax_op.h"
+#include <string>
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/sgd_op.h b/paddle/fluid/operators/sgd_op.h
index 8d2bdf75903b4958e14605781f65c5a214cb5300..cfc8793e1e05a7d4fa9207ae77a664b391b9a986 100644
--- a/paddle/fluid/operators/sgd_op.h
+++ b/paddle/fluid/operators/sgd_op.h
@@ -65,7 +65,8 @@ class SGDOpKernel : public framework::OpKernel<T> {
         auto &grad_rows = grad->rows();
 
         size_t grad_row_numel = grad_value.numel() / grad_rows.size();
-        PADDLE_ENFORCE_EQ(grad_row_numel, param_out->numel() / grad_height);
+        PADDLE_ENFORCE_EQ(static_cast<int64_t>(grad_row_numel),
+                          param_out->numel() / grad_height);
 
         auto *grad_data = grad_value.data<T>();
         auto *out_data = param_out->data<T>();
@@ -73,7 +74,7 @@ class SGDOpKernel : public framework::OpKernel<T> {
         for (size_t i = 0; i < grad_rows.size(); i++) {
           PADDLE_ENFORCE(grad_rows[i] < grad_height,
                          "Input rows index should less than height");
-          for (int64_t j = 0; j < grad_row_numel; j++) {
+          for (size_t j = 0; j < grad_row_numel; j++) {
             out_data[grad_rows[i] * grad_row_numel + j] -=
                 lr[0] * grad_data[i * grad_row_numel + j];
           }
@@ -107,7 +108,7 @@ class SGDOpKernel : public framework::OpKernel<T> {
         PADDLE_ENFORCE(grad.rows()[i] < grad.height(),
                        "Input rows index should less than height");
         int64_t id_index = param.index(grad.rows()[i]);
-        for (int64_t j = 0; j < grad_row_width; j++) {
+        for (size_t j = 0; j < grad_row_width; j++) {
           out_data[id_index * grad_row_width + j] -=
               lr[0] * grad_data[i * grad_row_width + j];
         }
diff --git a/paddle/fluid/operators/softmax_mkldnn_op.cc b/paddle/fluid/operators/softmax_mkldnn_op.cc
index cf0244e8662e827a90d8472a097315680579ff6d..dc2f1763446b2aaf72b20c72e8e37ec920abd120 100644
--- a/paddle/fluid/operators/softmax_mkldnn_op.cc
+++ b/paddle/fluid/operators/softmax_mkldnn_op.cc
@@ -12,12 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <iostream>
 #include "mkldnn.hpp"
 #include "paddle/fluid/operators/softmax_op.h"
 #include "paddle/fluid/platform/mkldnn_helper.h"
 
-#include <iostream>
-
 namespace paddle {
 namespace operators {
 
@@ -63,9 +62,11 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel<T> {
                                               softmax_md, 1 /*dim: C*/);
     // create memory primitives
     auto softmax_src_memory =
-        memory({softmax_md, mkldnn_engine}, (void*)input_data);
+        memory({softmax_md, mkldnn_engine},
+               static_cast<void*>(const_cast<T*>(input_data)));
     auto softmax_dst_memory =
-        memory({softmax_md, mkldnn_engine}, (void*)output_data);
+        memory({softmax_md, mkldnn_engine},
+               static_cast<void*>(const_cast<T*>(output_data)));
     auto softmax_prim_desc =
         softmax_forward::primitive_desc(softmax_desc, mkldnn_engine);
     auto softmax = softmax_forward(softmax_prim_desc, softmax_src_memory,
diff --git a/paddle/fluid/operators/split_ids_op.h b/paddle/fluid/operators/split_ids_op.h
index ba1e903dbb6daaa86b1b664322d100a800fd16b3..d263426e073d95ad6d584c7370baf596587a993d 100644
--- a/paddle/fluid/operators/split_ids_op.h
+++ b/paddle/fluid/operators/split_ids_op.h
@@ -60,7 +60,9 @@ class SplitIdsOpKernel : public framework::OpKernel<T> {
     } else if (ids_var->IsType<framework::SelectedRows>()) {
       const auto *ids_selected_rows = ctx.Input<framework::SelectedRows>("Ids");
       auto &ids_dims = ids_selected_rows->value().dims();
-      PADDLE_ENFORCE_EQ(ids_dims[0], ids_selected_rows->rows().size(), "");
+      PADDLE_ENFORCE_EQ(ids_dims[0],
+                        static_cast<int64_t>(ids_selected_rows->rows().size()),
+                        "");
       const T *ids = ids_selected_rows->value().data<T>();
       const auto &ids_rows = ids_selected_rows->rows();
       auto outs = ctx.MultiOutput<framework::SelectedRows>("Out");
@@ -77,7 +79,7 @@ class SplitIdsOpKernel : public framework::OpKernel<T> {
         framework::DDim ddim = framework::make_ddim(
             {static_cast<int64_t>(out->rows().size()), row_width});
         T *output = out->mutable_value()->mutable_data<T>(ddim, place);
-        for (size_t i = 0; i < ddim[0]; ++i) {
+        for (int64_t i = 0; i < ddim[0]; ++i) {
           memcpy(output + i * row_width, ids + out->rows()[i] * row_width,
                  row_width * sizeof(T));
         }
diff --git a/paddle/fluid/operators/split_op.h b/paddle/fluid/operators/split_op.h
index ae8562c0c503fec13dff61e04845ba0832848f5f..e2c41f44ab3ea3c42837974dae749278c9356ba5 100644
--- a/paddle/fluid/operators/split_op.h
+++ b/paddle/fluid/operators/split_op.h
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #pragma once
 
-#include <chrono>
+#include <chrono>  // NOLINT
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/strided_memcpy.h"
diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc
index 87699362b2b5a14750a01345098ec5e6cc9be115..acaefaacdaa593c090d81084fdc1b3665314833f 100644
--- a/paddle/fluid/operators/uniform_random_op.cc
+++ b/paddle/fluid/operators/uniform_random_op.cc
@@ -24,7 +24,19 @@ template <typename T>
 class CPUUniformRandomKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* tensor = ctx.Output<framework::Tensor>("Out");
+    framework::Tensor* tensor = nullptr;
+    auto out_var = ctx.OutputVar("Out");
+    if (out_var->IsType<framework::LoDTensor>()) {
+      tensor = out_var->GetMutable<framework::LoDTensor>();
+    } else if (out_var->IsType<framework::SelectedRows>()) {
+      auto shape = ctx.Attr<std::vector<int>>("shape");
+      tensor = out_var->GetMutable<framework::SelectedRows>()->mutable_value();
+      tensor->Resize(framework::make_ddim(shape));
+    } else {
+      PADDLE_THROW(
+          "uniform_random_op's output only"
+          "supports SelectedRows and Tensor");
+    }
     T* data = tensor->mutable_data<T>(ctx.GetPlace());
     unsigned int seed = static_cast<unsigned int>(ctx.Attr<int>("seed"));
     std::minstd_rand engine;
diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu
index 1232cd1eb332441b12e59a34b2c2f75669925fd0..e1c7323a30233f4ec4f60e46aa6088ee6d8601b7 100644
--- a/paddle/fluid/operators/uniform_random_op.cu
+++ b/paddle/fluid/operators/uniform_random_op.cu
@@ -43,7 +43,19 @@ template <typename T>
 class GPUUniformRandomKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto* tensor = context.Output<framework::Tensor>("Out");
+    framework::Tensor* tensor = nullptr;
+    auto out_var = context.OutputVar("Out");
+    if (out_var->IsType<framework::LoDTensor>()) {
+      tensor = out_var->GetMutable<framework::LoDTensor>();
+    } else if (out_var->IsType<framework::SelectedRows>()) {
+      auto shape = context.Attr<std::vector<int>>("shape");
+      tensor = out_var->GetMutable<framework::SelectedRows>()->mutable_value();
+      tensor->Resize(framework::make_ddim(shape));
+    } else {
+      PADDLE_THROW(
+          "uniform_random_op's output only"
+          "supports SelectedRows and Tensor");
+    }
     T* data = tensor->mutable_data<T>(context.GetPlace());
     unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
     if (seed == 0) {
diff --git a/paddle/fluid/platform/cuda_profiler.h b/paddle/fluid/platform/cuda_profiler.h
index ebd6aebd7688549c6fb14466cfa461b90a9fdde0..41d7c121469edd24c67b4288793cb95159fd4b62 100644
--- a/paddle/fluid/platform/cuda_profiler.h
+++ b/paddle/fluid/platform/cuda_profiler.h
@@ -11,12 +11,13 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #pragma once
+
 #include <cuda_profiler_api.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
+
+#include <string>
+
+#include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/fluid/platform/details/device_ptr_cast.h b/paddle/fluid/platform/details/cuda_transform_iterator_cast.h
similarity index 50%
rename from paddle/fluid/platform/details/device_ptr_cast.h
rename to paddle/fluid/platform/details/cuda_transform_iterator_cast.h
index 1c502a19c056c7fe434e68d568a0f59bf6315b95..06afc44c257bbeb0729323e1a42e1eead23ff075 100644
--- a/paddle/fluid/platform/details/device_ptr_cast.h
+++ b/paddle/fluid/platform/details/cuda_transform_iterator_cast.h
@@ -18,16 +18,22 @@ limitations under the License. */
 #error device_ptr_cast must be include by .cu file
 #endif
 
-#include <thrust/device_ptr.h>
+#include <type_traits>  // For std::remove_pointer and std::is_pointer.
+
+#include "thrust/device_ptr.h"
 
 namespace paddle {
 namespace platform {
 namespace details {
+
+// PointerToThrustDevicePtr has two speicalizations, one casts a (CUDA
+// device) pointer into thrust::device_ptr, the other keeps rest types
+// un-casted.
 template <typename T, bool is_ptr>
-struct DevicePtrCast;
+struct PointerToThrustDevicePtr;
 
 template <typename T>
-struct DevicePtrCast<T, true> {
+struct PointerToThrustDevicePtr<T, true> {
   using ELEM = typename std::remove_pointer<T>::type;
   using RTYPE = thrust::device_ptr<ELEM>;
 
@@ -37,17 +43,26 @@ struct DevicePtrCast<T, true> {
 };
 
 template <typename T>
-struct DevicePtrCast<T, false> {
+struct PointerToThrustDevicePtr<T, false> {
   using RTYPE = T;
   inline RTYPE operator()(RTYPE it) const { return it; }
 };
 
-// Cast T to thrust::device_ptr if T is a pointer.
-// Otherwise, e.g., T is a iterator, return T itself.
+// CastToCUDATransformIterator casts a pointer to thrust::device_ptr
+// so it could be used as the iterator of thrust::transform.  It
+// doesn't cast other types.
+//
+// We need CastToCUDATransformIterator because it is often that we
+// want to use device memory pointers as transform iterators, e.g., to
+// transform a block of float32 to float16.  In this case, we want
+// CastToCUDATransformIterator to cast float16/32 pointers to
+// thrust::device_ptr, otherwise they cannot work as the iterator
+// required by thrust::transform.  At the same time, we don't want to
+// cast thrust::device_ptr to thrust::device_ptr repeatedly.
 template <typename T>
-auto DevPtrCast(T t) ->
-    typename DevicePtrCast<T, std::is_pointer<T>::value>::RTYPE {
-  DevicePtrCast<T, std::is_pointer<T>::value> cast;
+auto CastToCUDATransformIterator(T t) ->
+    typename PointerToThrustDevicePtr<T, std::is_pointer<T>::value>::RTYPE {
+  PointerToThrustDevicePtr<T, std::is_pointer<T>::value> cast;
   return cast(t);
 }
 
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index f03165fae5ca16c5c263ce0683af7ec56e6a3766..1f733d71bdfb777d4a2f316a5fefc3c874879862 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -175,7 +175,7 @@ CUDADeviceContext::~CUDADeviceContext() {
 Place CUDADeviceContext::GetPlace() const { return place_; }
 
 void CUDADeviceContext::Wait() const {
-  std::lock_guard<std::mutex> guard(mutex_);
+  std::lock_guard<std::recursive_mutex> guard(mutex_);
   PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
   PADDLE_ENFORCE(cudaGetLastError());
 }
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index b17558337914e0ca8fdba283edf4024d94e85f0f..a9c1984616bc731e0557f2cb89282423aa9c3bac 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -98,13 +98,20 @@ class CUDADeviceContext : public DeviceContext {
   /*! \brief  Return cuda stream in the device context. */
   cudaStream_t stream() const;
 
+  template <typename Callback>
+  void RecordEvent(cudaEvent_t ev, Callback callback) {
+    std::lock_guard<std::recursive_mutex> guard(mutex_);
+    callback();
+    PADDLE_ENFORCE(cudaEventRecord(ev, stream_));
+  }
+
  private:
   CUDAPlace place_;
 
   std::unique_ptr<Eigen::GpuDevice> eigen_device_;
   std::unique_ptr<EigenCudaStreamDevice> eigen_stream_;
 
-  mutable std::mutex mutex_;
+  mutable std::recursive_mutex mutex_;
   cudaStream_t stream_;
   cudnnHandle_t cudnn_handle_;
   cublasHandle_t cublas_handle_;
diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt
index 84dac2937de02b3374156ebc83e19dac9f9a3e7a..364c4901b297dbd647faae85b01f682a1daace9c 100644
--- a/paddle/fluid/platform/dynload/CMakeLists.txt
+++ b/paddle/fluid/platform/dynload/CMakeLists.txt
@@ -1,6 +1,11 @@
 cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce)
 
 list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc nccl.cc)
+if (TENSORRT_FOUND)
+  list(APPEND CUDA_SRCS tensorrt.cc)
+endif()
+
+
 configure_file(cupti_lib_path.h.in ${CMAKE_CURRENT_BINARY_DIR}/cupti_lib_path.h)
 if (CUPTI_FOUND)
     list(APPEND CUDA_SRCS cupti.cc)
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc
index 3c1ccc7445ed27c711ab250aa223c66ae0da45dc..19c01dc5a968c7e1d2b0f15cf9a0e8427004e58b 100644
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
@@ -45,6 +45,10 @@ DEFINE_string(nccl_dir, "",
 
 DEFINE_string(cupti_dir, "", "Specify path for loading cupti.so.");
 
+DEFINE_string(
+    tensorrt_dir, "",
+    "Specify path for loading tensorrt library, such as libnvinfer.so.");
+
 namespace paddle {
 namespace platform {
 namespace dynload {
@@ -194,6 +198,14 @@ void* GetNCCLDsoHandle() {
 #endif
 }
 
+void* GetTensorRtDsoHandle() {
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.dylib");
+#else
+  return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.so");
+#endif
+}
+
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h
index 4c85093a43e0e8d75b64c5b29d1ec68db1b44909..0de3559b6088086cb52c254535b6ec42da7dd724 100644
--- a/paddle/fluid/platform/dynload/dynamic_loader.h
+++ b/paddle/fluid/platform/dynload/dynamic_loader.h
@@ -25,6 +25,7 @@ void* GetCurandDsoHandle();
 void* GetWarpCTCDsoHandle();
 void* GetLapackDsoHandle();
 void* GetNCCLDsoHandle();
+void* GetTensorRtDsoHandle();
 
 }  // namespace dynload
 }  // namespace platform
diff --git a/paddle/fluid/platform/dynload/tensorrt.cc b/paddle/fluid/platform/dynload/tensorrt.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f3c8e27944ca9b6419de87d752df3a83751039b1
--- /dev/null
+++ b/paddle/fluid/platform/dynload/tensorrt.cc
@@ -0,0 +1,30 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/fluid/platform/dynload/tensorrt.h"
+
+namespace paddle {
+namespace platform {
+namespace dynload {
+
+std::once_flag tensorrt_dso_flag;
+void *tensorrt_dso_handle;
+
+#define DEFINE_WRAP(__name) DynLoad__##__name __name
+
+TENSORRT_RAND_ROUTINE_EACH(DEFINE_WRAP);
+
+}  // namespace dynload
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/tensorrt.h b/paddle/fluid/platform/dynload/tensorrt.h
new file mode 100644
index 0000000000000000000000000000000000000000..f584a49da0fefe0b064b5fb55b01ec132225ce5e
--- /dev/null
+++ b/paddle/fluid/platform/dynload/tensorrt.h
@@ -0,0 +1,69 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#include <NvInfer.h>
+#include <dlfcn.h>
+
+#include <mutex>  // NOLINT
+
+#include "paddle/fluid/platform/dynload/dynamic_loader.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace platform {
+namespace dynload {
+
+extern std::once_flag tensorrt_dso_flag;
+extern void* tensorrt_dso_handle;
+
+#ifdef PADDLE_USE_DSO
+
+#define DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP(__name)                      \
+  struct DynLoad__##__name {                                            \
+    template <typename... Args>                                         \
+    auto operator()(Args... args) -> decltype(__name(args...)) {        \
+      using tensorrt_func = decltype(__name(args...)) (*)(Args...);     \
+      std::call_once(tensorrt_dso_flag, []() {                          \
+        tensorrt_dso_handle =                                           \
+            paddle::platform::dynload::GetTensorRtDsoHandle();          \
+        PADDLE_ENFORCE(tensorrt_dso_handle, "load tensorrt so failed"); \
+      });                                                               \
+      void* p_##__name = dlsym(tensorrt_dso_handle, #__name);           \
+      PADDLE_ENFORCE(p_##__name, "load %s failed", #__name);            \
+      return reinterpret_cast<tensorrt_func>(p_##__name)(args...);      \
+    }                                                                   \
+  };                                                                    \
+  extern DynLoad__##__name __name
+
+#else
+#define DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP(__name) \
+  struct DynLoad__##__name {                       \
+    template <typename... Args>                    \
+    tensorrtResult_t operator()(Args... args) {    \
+      return __name(args...);                      \
+    }                                              \
+  };                                               \
+  extern DynLoad__##__name __name
+#endif
+
+#define TENSORRT_RAND_ROUTINE_EACH(__macro) \
+  __macro(createInferBuilder_INTERNAL);     \
+  __macro(createInferRuntime_INTERNAL);
+
+TENSORRT_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP)
+
+}  // namespace dynload
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/float16.h b/paddle/fluid/platform/float16.h
index 673e1bcae4af6d039bc969f1de6e4bcab3748cb5..ffd183af68514dbb1a8b3de39000c9ca3f56ddc3 100644
--- a/paddle/fluid/platform/float16.h
+++ b/paddle/fluid/platform/float16.h
@@ -873,6 +873,11 @@ HOSTDEVICE inline bool(isfinite)(const float16& a) {
   return !((isnan)(a)) && !((isinf)(a));
 }
 
+inline std::ostream& operator<<(std::ostream& os, const float16& a) {
+  os << static_cast<float>(a);
+  return os;
+}
+
 }  // namespace platform
 }  // namespace paddle
 
diff --git a/paddle/fluid/platform/float16_test.cc b/paddle/fluid/platform/float16_test.cc
index d60aecf96c8828a5656f81fd3602cfb2e66990cf..a589e32b61a9b6a44bdc4529eee715d987d6922c 100644
--- a/paddle/fluid/platform/float16_test.cc
+++ b/paddle/fluid/platform/float16_test.cc
@@ -141,5 +141,10 @@ TEST(float16, lod_tensor_cpu) {
   }
 }
 
+TEST(float16, print) {
+  float16 a = float16(1.0f);
+  std::cout << a << std::endl;
+}
+
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/transform.h b/paddle/fluid/platform/transform.h
index 917c48b47f8d70cd821d45dfbc6bafa494710ffa..7877d3e41c1c993662f5d91b263cbcb71db74c36 100644
--- a/paddle/fluid/platform/transform.h
+++ b/paddle/fluid/platform/transform.h
@@ -14,29 +14,44 @@ limitations under the License. */
 
 #pragma once
 
+#include <algorithm>
+#include <type_traits>
+
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/hostdevice.h"
 #include "paddle/fluid/platform/place.h"
 
-#include <algorithm>
-#include <type_traits>
 #ifdef __NVCC__
 #include <thrust/execution_policy.h>
 #include <thrust/transform.h>
-#include "paddle/fluid/platform/details/device_ptr_cast.h"
+#include "paddle/fluid/platform/details/cuda_transform_iterator_cast.h"
 #endif
 
 namespace paddle {
 namespace platform {
 
-// Transform on host or device. It provides the same API in std library.
+// Transform applys a unary or a binary functor on each element in a
+// range defined by a pair of iterators.
+//
+// - The specialization for CPU calls std::transform.
+// - The specialization for CUDA calls thrust::tranform.
+//
+// NOTE: We need to define InputIter and OutputIter defined as
+//       different types, because the InputIter points op's inputs and
+//       OutputIter pints to op's outputs.
+//
+// NOTE: We don't assume that InputIter to be const InputType* and
+//       OutputIter to be OutputType*, because we might use a iterator
+//       class, paddle::fluid::operators::RowwiseTRansformIterator.
 template <typename DeviceContext>
 struct Transform {
+  // The unary version.
   template <typename InputIter, typename OutputIter, typename UnaryOperation>
   void operator()(const DeviceContext& context, InputIter first, InputIter last,
                   OutputIter result, UnaryOperation op);
 
+  // The binary version.
   template <typename InputIter1, typename InputIter2, typename OutputIter,
             typename BinaryOperation>
   void operator()(const DeviceContext& context, InputIter1 first1,
@@ -70,8 +85,9 @@ struct Transform<platform::CUDADeviceContext> {
     auto place = context.GetPlace();
     PADDLE_ENFORCE(is_gpu_place(place), "It must use GPU place.");
     thrust::transform(thrust::cuda::par.on(context.stream()),
-                      details::DevPtrCast(first), details::DevPtrCast(last),
-                      details::DevPtrCast(result), op);
+                      details::CastToCUDATransformIterator(first),
+                      details::CastToCUDATransformIterator(last),
+                      details::CastToCUDATransformIterator(result), op);
   }
 
   template <typename InputIter1, typename InputIter2, typename OutputIter,
@@ -82,9 +98,10 @@ struct Transform<platform::CUDADeviceContext> {
     auto place = context.GetPlace();
     PADDLE_ENFORCE(is_gpu_place(place), "It must use GPU place.");
     thrust::transform(thrust::cuda::par.on(context.stream()),
-                      details::DevPtrCast(first1), details::DevPtrCast(last1),
-                      details::DevPtrCast(first2), details::DevPtrCast(result),
-                      op);
+                      details::CastToCUDATransformIterator(first1),
+                      details::CastToCUDATransformIterator(last1),
+                      details::CastToCUDATransformIterator(first2),
+                      details::CastToCUDATransformIterator(result), op);
   }
 };
 #endif
diff --git a/paddle/fluid/platform/transform_test.cu b/paddle/fluid/platform/transform_test.cu
index 7b5cfd8f43473dc6bc784e98bd26fdd9e0ba9994..f65d1f60100edc85ba9745ed36f26a0ed160d80f 100644
--- a/paddle/fluid/platform/transform_test.cu
+++ b/paddle/fluid/platform/transform_test.cu
@@ -18,11 +18,12 @@ limitations under the License. */
 #include "paddle/fluid/platform/hostdevice.h"
 #include "paddle/fluid/platform/transform.h"
 
+namespace {
+
 template <typename T>
 class Scale {
  public:
   explicit Scale(const T& scale) : scale_(scale) {}
-
   HOSTDEVICE T operator()(const T& a) const { return a * scale_; }
 
  private:
@@ -35,11 +36,23 @@ class Multiply {
   HOSTDEVICE T operator()(const T& a, const T& b) const { return a * b; }
 };
 
+}  // namespace
+
+using paddle::memory::Alloc;
+using paddle::memory::Free;
+using paddle::memory::Copy;
+
+using paddle::platform::CPUPlace;
+using paddle::platform::CUDAPlace;
+using paddle::platform::CPUDeviceContext;
+using paddle::platform::CUDADeviceContext;
+
+using paddle::platform::Transform;
+
 TEST(Transform, CPUUnary) {
-  using namespace paddle::platform;
   CPUDeviceContext ctx;
   float buf[4] = {0.1, 0.2, 0.3, 0.4};
-  Transform<paddle::platform::CPUDeviceContext> trans;
+  Transform<CPUDeviceContext> trans;
   trans(ctx, buf, buf + 4, buf, Scale<float>(10));
   for (int i = 0; i < 4; ++i) {
     ASSERT_NEAR(buf[i], static_cast<float>(i + 1), 1e-5);
@@ -47,14 +60,12 @@ TEST(Transform, CPUUnary) {
 }
 
 TEST(Transform, GPUUnary) {
-  using namespace paddle::platform;
-  using namespace paddle::memory;
   CUDAPlace gpu0(0);
   CUDADeviceContext ctx(gpu0);
   float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4};
   float* gpu_buf = static_cast<float*>(Alloc(gpu0, sizeof(float) * 4));
   Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf), ctx.stream());
-  Transform<paddle::platform::CUDADeviceContext> trans;
+  Transform<CUDADeviceContext> trans;
   trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale<float>(10));
   ctx.Wait();
   Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf), ctx.stream());
@@ -65,10 +76,8 @@ TEST(Transform, GPUUnary) {
 }
 
 TEST(Transform, CPUBinary) {
-  using namespace paddle::platform;
-  using namespace paddle::memory;
   int buf[4] = {1, 2, 3, 4};
-  Transform<paddle::platform::CPUDeviceContext> trans;
+  Transform<CPUDeviceContext> trans;
   CPUDeviceContext ctx;
   trans(ctx, buf, buf + 4, buf, buf, Multiply<int>());
   for (int i = 0; i < 4; ++i) {
@@ -77,14 +86,12 @@ TEST(Transform, CPUBinary) {
 }
 
 TEST(Transform, GPUBinary) {
-  using namespace paddle::platform;
-  using namespace paddle::memory;
   int buf[4] = {1, 2, 3, 4};
   CUDAPlace gpu0(0);
   CUDADeviceContext ctx(gpu0);
   int* gpu_buf = static_cast<int*>(Alloc(gpu0, sizeof(buf)));
   Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx.stream());
-  Transform<paddle::platform::CUDADeviceContext> trans;
+  Transform<CUDADeviceContext> trans;
   trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply<int>());
   ctx.Wait();
   Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf), ctx.stream());
diff --git a/paddle/fluid/platform/variant.h b/paddle/fluid/platform/variant.h
index 05ca33137de8db5291c8e38fc03457d05092cea8..45f60fc9d76560b133fa06198a24c7eaccc24088 100644
--- a/paddle/fluid/platform/variant.h
+++ b/paddle/fluid/platform/variant.h
@@ -14,29 +14,25 @@ limitations under the License. */
 
 #pragma once
 
-#ifdef __CUDACC__
-#ifdef __CUDACC_VER_MAJOR__
-// CUDA 9 define `__CUDACC_VER__` as a warning message, manually define
-// __CUDACC_VER__ instead.
+// Boost 1.41.0 requires __CUDACC_VER__, but in CUDA 9 __CUDACC_VER__
+// is removed, so we have to manually define __CUDACC_VER__ instead.
+// For details, please refer to
+// https://github.com/PaddlePaddle/Paddle/issues/6626
+#if defined(__CUDACC__) && defined(__CUDACC_VER_MAJOR__)
 #undef __CUDACC_VER__
-
-#define __CUDACC_VER__                                         \
-  (__CUDACC_VER_MAJOR__ * 10000 + __CUDACC_VER_MINOR__ * 100 + \
-   __CUDACC_VER_BUILD__)
-#endif
-
+#define __CUDACC_VER__                                  \
+  __CUDACC_VER_BUILD__ + __CUDACC_VER_MAJOR__ * 10000 + \
+      __CUDACC_VER_MINOR__ * 100
 #endif
 
-#include <boost/config.hpp>
+#include "boost/config.hpp"
 
-#ifdef PADDLE_WITH_CUDA
-
-// Because boost's variadic templates has bug on nvcc, boost will disable
-// variadic template support when GPU enabled on nvcc.
-// Define BOOST_NO_CXX11_VARIADIC_TEMPLATES on gcc/clang to generate same
-// function symbols.
-//
+// Because Boost 1.41.0's variadic templates has bug on nvcc, boost
+// will disable variadic template support in NVCC mode.  Define
+// BOOST_NO_CXX11_VARIADIC_TEMPLATES on gcc/clang to generate same
+// function symbols.  For details,
 // https://github.com/PaddlePaddle/Paddle/issues/3386
+#ifdef PADDLE_WITH_CUDA
 #ifndef BOOST_NO_CXX11_VARIADIC_TEMPLATES
 #define BOOST_NO_CXX11_VARIADIC_TEMPLATES
 #endif
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index a1e8ff6399f0812773a7bb753c90e4400b1763d9..19bd30d9665dc1e8f9d475868cabbf14c8847352 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -505,11 +505,19 @@ All parameter, weight, gradient are variables in Paddle.
                                   scope, local_scopes, allow_op_delay);
            })
       .def("bcast_params", &ParallelExecutor::BCastParamsToGPUs)
+      // NOTE: even we return a vec<Scope*>* to Python use reference policy.
+      // We still cannot get local_scope from this vector, since the element
+      // of vec<Scope*> will be freed by Python GC. We can only return Scope*
+      // one by one and mark them as reference.
       .def("local_scopes",
            [](ParallelExecutor &self) -> std::vector<Scope *> * {
              return &self.GetLocalScopes();
            },
            py::return_value_policy::reference)
+      .def("feed_tensors_into_local_scopes",
+           &ParallelExecutor::FeedTensorsIntoLocalScopes)
+      .def("feed_and_split_tensor_into_local_scopes",
+           &ParallelExecutor::FeedAndSplitTensorIntoLocalScopes)
       .def("run", &ParallelExecutor::Run);
 
   BindRecordIOWriter(&m);
diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h
index 4a9dbd324c90380e784cc9457845fabd858585be..159d1d5f4e70033fabf93514bd63b38f83675bff 100644
--- a/paddle/fluid/pybind/tensor_py.h
+++ b/paddle/fluid/pybind/tensor_py.h
@@ -190,6 +190,11 @@ void PyCUDATensorSetFromArray(
       static_cast<const platform::CUDADeviceContext *>(pool.Get(place));
   paddle::platform::GpuMemcpyAsync(dst, array.data(), sizeof(T) * array.size(),
                                    cudaMemcpyHostToDevice, dev_ctx->stream());
+  // NOTE: For safety, here wait the copy complete.
+  // It because the CPU array.data() could be destroyed after this method.
+  // If we make this method async, it could be copied data from a memory buffer
+  // that has been freed.
+  dev_ctx->Wait();
 }
 
 template <>
@@ -216,6 +221,11 @@ void PyCUDATensorSetFromArray(
   paddle::platform::GpuMemcpyAsync(dst, array.data(),
                                    sizeof(uint16_t) * array.size(),
                                    cudaMemcpyHostToDevice, dev_ctx->stream());
+  // NOTE: For safety, here wait the copy complete.
+  // It because the CPU array.data() could be destroyed after this method.
+  // If we make this method async, it could be copied data from a memory buffer
+  // that has been freed.
+  dev_ctx->Wait();
 }
 
 template <typename T>
diff --git a/paddle/fluid/recordio/chunk_test.cc b/paddle/fluid/recordio/chunk_test.cc
index 98ca99b9a018db2da9aa563741ff3cf30461c4ce..5177475c016097d9a118aa79f855672354b3ef53 100644
--- a/paddle/fluid/recordio/chunk_test.cc
+++ b/paddle/fluid/recordio/chunk_test.cc
@@ -43,5 +43,5 @@ TEST(Chunk, Compressor) {
 
   ch.Clear();
   ch.Parse(ss);
-  ASSERT_EQ(ch.NumBytes(), 18);
+  ASSERT_EQ(ch.NumBytes(), 18ul);
 }
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index be1565ab533037d4bc72b6d2834c48b04638c297..2b2a904974f3756576fb47851400e344c9357c57 100755
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -198,7 +198,7 @@ EOF
     # run paddle version to install python packages first
     RUN apt-get update &&\
         ${NCCL_DEPS}\
-        apt-get install -y wget python-pip dmidecode python-tk && pip install -U pip && \
+        apt-get install -y wget python-pip dmidecode python-tk && pip install -U pip==9.0.3 && \
         pip install /*.whl; apt-get install -f -y && \
         apt-get clean -y && \
         rm -f /*.whl && \
diff --git a/paddle/utils/DynamicLoader.cpp b/paddle/utils/DynamicLoader.cpp
index 5604a90038b06d2c1a4d9db70e4185cddfd25d3e..9ac4a56c6e300d299467630b39a32567af72cf40 100644
--- a/paddle/utils/DynamicLoader.cpp
+++ b/paddle/utils/DynamicLoader.cpp
@@ -32,6 +32,8 @@ DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so.");
 
 DEFINE_string(lapack_dir, "", "Specify path for loading liblapack.so.");
 
+DEFINE_string(tensorrt_dir, "", "Specify path for loading libnvinfer.so.");
+
 static inline std::string join(const std::string& part1,
                                const std::string& part2) {
   // directory separator
@@ -157,3 +159,12 @@ void GetLapackDsoHandle(void** dso_handle) {
   GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.so", dso_handle);
 #endif
 }
+
+void GetTensorRtDsoHandle(void** dso_handle) {
+#if defined(__APPLE__) || defined(__OSX__)
+  GetDsoHandleFromSearchPath(
+      FLAGS_tensorrt_dir, "libnvinfer.dylib", dso_handle);
+#else
+  GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.so", dso_handle);
+#endif
+}
diff --git a/paddle/utils/DynamicLoader.h b/paddle/utils/DynamicLoader.h
index 2e5ff76a06152b6a12818f06baaeaa6a69726ba8..02f519de4b3988fb6aca323aaa1751ee2c4bd738 100644
--- a/paddle/utils/DynamicLoader.h
+++ b/paddle/utils/DynamicLoader.h
@@ -58,3 +58,11 @@ void GetWarpCTCDsoHandle(void** dso_handle);
  *
  */
 void GetLapackDsoHandle(void** dso_handle);
+
+/**
+ * @brief    load the DSO of tensorrt
+ *
+ * @param    **dso_handle   dso handler
+ *
+ */
+void GetTensorRtDsoHandle(void** dso_handle);
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 7cbd7f22bf2968b29dc0665e893101b892808b5e..c7c0812fe2238d48903aa4c75bb8f1e9ecdb16c9 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -1,5 +1,5 @@
 file(GLOB UTILS_PY_FILES . ./paddle/utils/*.py)
-file(GLOB_RECURSE FLUID_PY_FILES ./paddle/fluid/ *.py)
+file(GLOB_RECURSE FLUID_PY_FILES ./paddle/fluid/*.py)
 set(PY_FILES paddle/__init__.py
   ${UTILS_PY_FILES}
   ${FLUID_PY_FILES})
@@ -7,7 +7,7 @@ set(PY_FILES paddle/__init__.py
 if(NOT WITH_FLUID_ONLY)
   file(GLOB TRAINER_PY_FILES . ./paddle/trainer/*.py)
   file(GLOB HELPERS_PY_FILES . ./paddle/trainer_config_helpers/*.py)
-  file(GLOB_RECURSE V2_PY_FILES ./paddle/v2/ *.py)
+  file(GLOB_RECURSE V2_PY_FILES ./paddle/v2/*.py)
   set(PY_FILES ${PY_FILES}
     ${TRAINER_PY_FILES}
     ${HELPERS_PY_FILES}
@@ -55,7 +55,7 @@ add_custom_target(copy_paddle_pybind ALL DEPENDS ${PADDLE_BINARY_DIR}/python/pad
 
 add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
     COMMAND touch stub.cc
-    COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python/paddle
+    COMMAND cp -r ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python
     COMMAND cp -r ${PADDLE_SOURCE_DIR}/paddle/py_paddle ${PADDLE_BINARY_DIR}/python/
     COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
     COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
diff --git a/python/paddle/fluid/distribute_transpiler.py b/python/paddle/fluid/distribute_transpiler.py
index b0522b49f44d8ed0c8c7e3148e24f312fbdd1123..aa15392d7e4901e8ee23ad5b4370542232adc2a5 100644
--- a/python/paddle/fluid/distribute_transpiler.py
+++ b/python/paddle/fluid/distribute_transpiler.py
@@ -1115,4 +1115,6 @@ class DistributeTranspiler:
             for op2 in find_ops:
                 if ufind.is_connected(op1, op2):
                     lr_ops.append(op1)
+                    # we only need to append op for once
+                    break
         return lr_ops
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 5c2c2dd7abebf8960d68b4c4dfd746a4e27acd03..bba8b64bd88c3edc6eda110dde38c0ced50439f6 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -77,6 +77,7 @@ __all__ = [
     'lod_reset',
     'lrn',
     'pad',
+    'label_smooth',
 ]
 
 
@@ -3678,3 +3679,68 @@ def pad(x, paddings, pad_value=0., name=None):
         attrs={'paddings': paddings,
                'pad_value': float(pad_value)})
     return out
+
+
+def label_smooth(label,
+                 prior_dist=None,
+                 epsilon=0.1,
+                 dtype="float32",
+                 name=None):
+    """
+    Label smoothing is a mechanism to regularize the classifier layer and is
+    called label-smoothing regularization (LSR). 
+    
+    Label smoothing is proposed to encourage the model to be less confident,
+    since optimizing the log-likelihood of the correct label directly may
+    cause overfitting and reduce the ability of the model to adapt. Label
+    smoothing replaces the ground-truth label :math:`y` with the weighted sum
+    of itself and some fixed distribution :math:`\mu`. For class :math:`k`,
+    i.e.
+
+    .. math::
+
+        \\tilde{y_k} = (1 - \epsilon) * y_k + \epsilon * \mu_k,
+
+    where :math:`1 - \epsilon` and :math:`\epsilon` are the weights
+    respectively, and :math:`\\tilde{y}_k` is the smoothed label. Usually
+    uniform distribution is used for :math:`\mu`.
+
+    See more details about label smoothing in https://arxiv.org/abs/1512.00567.
+
+    Args:
+        label(Variable): The input variable containing the label data. The
+                          label data should use one-hot representation.
+        prior_dist(Variable): The prior distribution to be used to smooth
+                              labels. If not provided, an uniform distribution
+                              is used. The shape of :attr:`prior_dist` should
+                              be :math:`(1, class\_num)`. 
+        epsilon(float): The weight used to mix up the original ground-truth
+                        distribution and the fixed distribution.
+        dtype(np.dtype|core.VarDesc.VarType|str): The type of data : float32, 
+                                                  float_64, int etc.
+        name(str|None): A name for this layer(optional). If set None, the layer
+                        will be named automatically.
+
+    Returns:
+        Variable: The tensor variable containing the smoothed labels.
+
+    Examples:
+        .. code-block:: python
+
+            label = layers.data(name="label", shape=[1], dtype="float32")
+            one_hot_label = layers.one_hot(input=label, depth=10)
+            smooth_label = layers.label_smooth(
+                label=one_hot_label, epsilon=0.1, dtype="float32")
+    """
+    if epsilon > 1. or epsilon < 0.:
+        raise ValueError("The value of epsilon must be between 0 and 1.")
+    helper = LayerHelper("label_smooth", **locals())
+    label.stop_gradient = True
+    smooth_label = helper.create_tmp_variable(dtype)
+    helper.append_op(
+        type="label_smooth",
+        inputs={"X": label,
+                "PriorDist": prior_dist} if prior_dist else {"X": label},
+        outputs={"Out": smooth_label},
+        attrs={"epsilon": float(epsilon)})
+    return smooth_label
diff --git a/python/paddle/fluid/memory_optimization_transpiler.py b/python/paddle/fluid/memory_optimization_transpiler.py
index 41d1eca82e8b680977f44f1756c25c37340668a4..20ed19104207c1f0aa45db8f44570377011f3cde 100644
--- a/python/paddle/fluid/memory_optimization_transpiler.py
+++ b/python/paddle/fluid/memory_optimization_transpiler.py
@@ -29,17 +29,20 @@ dtype_to_size = {
     core.VarDesc.VarType.BOOL: 1
 }
 
-sub_block_ops = [
+SUB_BLOCK_OPS = [
     "while", "while_grad", "parallel_do", "parallel_do_grad",
     "conditional_block", "conditional_block_grad"
 ]
 
+SUB_BLOCK_PAIR = [("while", "while_grad"), ("parallel_do", "parallel_do_grad"),
+                  ("conditional_block", "conditional_block_grad")]
+
 PRINT_LOG = False
 
 
 class ControlFlowGraph(object):
-    def __init__(self, Program, ops, forward_num, skip_opt):
-        self._program = Program
+    def __init__(self, program, ops, forward_num, skip_opt):
+        self._program = program
         self._ops = ops
         self._forward_num = forward_num
         self._successors = defaultdict(set)
@@ -51,6 +54,7 @@ class ControlFlowGraph(object):
         self._skip_opt = skip_opt
 
     def _add_connections(self, connections):
+        """Populates _successors and _presuccessors for two neighbor nodes."""
         for node1, node2 in connections:
             self._add(node1, node2)
 
@@ -58,7 +62,11 @@ class ControlFlowGraph(object):
         self._successors[node1].add(node2)
         self._presuccessors[node2].add(node1)
 
+    # TODO(panyx0718): We need to have a unified way of building intermediate
+    # representation.
     def _build_graph(self):
+        """Build a graph based on op sequence.
+        """
         self.op_size = len(self._ops)
         op_node_connections = [(i, i + 1) for i in range(self.op_size - 1)]
         self._add_connections(op_node_connections)
@@ -82,15 +90,14 @@ class ControlFlowGraph(object):
                 self._live_out[i].add(new_name)
 
     def _reach_fixed_point(self, live_in, live_out):
+        """Check if the liveness set has stablized."""
         if len(live_in) != len(self._live_in):
             return False
         if len(live_out) != len(self._live_out):
             return False
         for i in range(self.op_size):
-            if live_in[i] != self._live_in[i]:
-                return False
-        for i in range(self.op_size):
-            if live_out[i] != self._live_out[i]:
+            if (live_in[i] != self._live_in[i] or
+                    live_out[i] != self._live_out[i]):
                 return False
         return True
 
@@ -98,6 +105,8 @@ class ControlFlowGraph(object):
         self._build_graph()
         live_in = defaultdict(set)
         live_out = defaultdict(set)
+        # Repeatedly apply liveness updates until the algorithm stablize
+        # on a complete set live input vars and live output vars.
         while True:
             for i in range(self.op_size, 0, -1):
                 live_in[i] = set(self._live_in[i])
@@ -141,6 +150,8 @@ class ControlFlowGraph(object):
             return False
         return True
 
+    # TODO(panyx0718): This needs to be less hacky. It seems memory optimization
+    # doesn't consider vars copied between cpu and gpu.
     def _update_skip_opt_set(self):
         for i in range(self.op_size):
             op = self._ops[i]
@@ -154,7 +165,7 @@ class ControlFlowGraph(object):
         bwd_id = 0
         for i in range(self.op_size):
             op = self._ops[i]
-            if op.type() in sub_block_ops:
+            if op.type() in SUB_BLOCK_OPS:
                 continue
             block_desc = op.block()
             is_forward = i < self._forward_num
@@ -177,13 +188,15 @@ class ControlFlowGraph(object):
         def compare_shape(x_shape, cache_shape, opt_level):
             if opt_level == 0:
                 return x_shape == cache_shape
-            if opt_level == 1:
+            elif opt_level == 1:
                 if (x_shape[0] == -1) ^ (cache_shape[0] == -1):
                     return False
                 x_size = abs(reduce(lambda x, y: x * y, x_shape))
                 cache_size = abs(reduce(lambda x, y: x * y, cache_shape))
                 if x_size <= cache_size:
                     return True
+            else:
+                raise ValueError("only support opt_level 0 or 1.")
             return False
 
         self._dataflow_analyze()
@@ -191,10 +204,9 @@ class ControlFlowGraph(object):
         self.pool = []
         for i in range(self.op_size):
             op = self._ops[i]
-            if op.type() in sub_block_ops:
+            if op.type() in SUB_BLOCK_OPS:
                 continue
             block_desc = op.block()
-            self.current_block_desc = block_desc
             is_forward = i < self._forward_num
             if self.pool:
                 defs_can_optimize = filter(
@@ -211,37 +223,40 @@ class ControlFlowGraph(object):
                     for index, cache_pair in enumerate(self.pool):
                         cache_var = cache_pair[0]
                         cache_shape = cache_pair[1]
-                        if compare_shape(x_shape, cache_shape, level):
-                            if self._has_var(block_desc, cache_var, is_forward):
-                                x_dtype = self._find_var(block_desc, x,
-                                                         is_forward).dtype()
-                                cache_dtype = self._find_var(
-                                    block_desc, cache_var, is_forward).dtype()
-                                # TODO(qijun): actually, we should compare dtype_to_size[x_dtype]
-                                # and dtype_to_size[cache_dtype]
-                                if x_dtype == cache_dtype:
-                                    if PRINT_LOG:
-                                        print(
-                                            ("Hit Cache !!!! cache pool index "
-                                             "is %d, var name is %s, "
-                                             "cached var name is %s, "
-                                             "var shape is %s ") %
-                                            (index, x, cache_var,
-                                             str(cache_shape)))
-                                    self.pool.pop(index)
-                                    if x == cache_var:
-                                        break
-                                    _rename_arg_(
-                                        self._ops, x, cache_var, begin_idx=i)
-                                    self._program.block(block_desc.id).var(
-                                        str(x)).desc = self._find_var(
-                                            block_desc, cache_var, is_forward)
-                                    self._update_graph(
-                                        x, cache_var, begin_idx=i)
-                                    break
-
-            in_diff, out_diff = self._get_diff(self._live_in[i],
-                                               self._live_out[i])
+                        if not compare_shape(x_shape, cache_shape, level):
+                            continue
+
+                        if not self._has_var(block_desc, cache_var, is_forward):
+                            continue
+
+                        x_dtype = self._find_var(block_desc, x,
+                                                 is_forward).dtype()
+                        cache_dtype = self._find_var(block_desc, cache_var,
+                                                     is_forward).dtype()
+                        # TODO(qijun): actually, we should compare
+                        # dtype_to_size[x_dtype] and dtype_to_size[cache_dtype]
+                        if x_dtype != cache_dtype:
+                            continue
+
+                        if PRINT_LOG:
+                            print(("Hit Cache !!!! cache pool index "
+                                   "is %d, var name is %s, "
+                                   "cached var name is %s, "
+                                   "var shape is %s ") % (index, x, cache_var,
+                                                          str(cache_shape)))
+                        self.pool.pop(index)
+                        if x == cache_var:
+                            break
+                        # Rename the var to the cache var already with
+                        # memory allocated in order to reuse the memory.
+                        _rename_arg_(self._ops, x, cache_var, begin_idx=i)
+                        self._program.block(block_desc.id).var(str(
+                            x)).desc = self._find_var(block_desc, cache_var,
+                                                      is_forward)
+                        self._update_graph(x, cache_var, begin_idx=i)
+                        break
+
+            in_diff, _ = self._get_diff(self._live_in[i], self._live_out[i])
             can_optimize = filter(
                 lambda x: self._check_var_validity(block_desc, x, is_forward),
                 in_diff)
@@ -252,6 +267,19 @@ class ControlFlowGraph(object):
 
 
 def _process_sub_block_pair(pdesc, sub_block_pair):
+    """Creates a list of tuple each of which tracks info of a subblock.
+
+      Note: this function doesn't handle nested subblocks yet.
+      TODO(panyx0718): assert if case nested subblocks happen.
+
+    :param pdesc: ProgramDesc.
+    :param sub_block_pair: A list op pairs. Each op pair is the forward
+        op and backward op. The ops in the list are special that they contain
+        a subblock of ops.
+    :return: A list of tuples, each tuple is (all ops in a subblock pair
+        including forward and backward, number of forward ops,
+        all output args names of the ops in the subblock pairs).
+    """
     ops_list = []
     block_desc = pdesc.block(0)
     op_size = block_desc.op_size()
@@ -308,6 +336,11 @@ def _process_sub_block_pair(pdesc, sub_block_pair):
 
 
 def _get_cfgs(input_program):
+    """Process each block and create ControlFlowGraph for each of them.
+
+    :param input_program: Program object.
+    :return: A list of ControlFlowGraph, each corresponds to a block.
+    """
     ops_list = []
     pdesc = input_program.get_desc()
     block_desc = pdesc.block(0)
@@ -316,11 +349,8 @@ def _get_cfgs(input_program):
     ops_list.append(
         ([block_desc.op(i) for i in range(op_size)], op_size, set()))
 
-    sub_block_pair = [("while", "while_grad"), ("parallel_do",
-                                                "parallel_do_grad"),
-                      ("conditional_block", "conditional_block_grad")]
-
-    ops_list.extend(_process_sub_block_pair(pdesc, sub_block_pair))
+    # Only process one level of nested subblock.
+    ops_list.extend(_process_sub_block_pair(pdesc, SUB_BLOCK_PAIR))
 
     cfgs = [
         ControlFlowGraph(input_program, ops, forward_num, skip_opt)
@@ -330,6 +360,17 @@ def _get_cfgs(input_program):
 
 
 def memory_optimize(input_program, print_log=False, level=0):
+    """Optimize memory by reusing var memory.
+
+      Note: it doesn't not support subblock nested in subblock.
+
+    :param input_program: Input Program
+    :param print_log: whether to print debug log.
+    :param level: If level=0, reuse if the shape is completely equal, o
+    :return:
+    """
+    if level != 0 and level != 1:
+        raise ValueError("only support opt_level 0 or 1.")
     global PRINT_LOG
     PRINT_LOG = print_log
     cfgs = _get_cfgs(input_program)
diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py
index 99a81c1d4244b919a53dfec36fc5a6659c10adae..c618b02a768f2ca3e2b2914d8ee0134836d5c0d2 100644
--- a/python/paddle/fluid/metrics.py
+++ b/python/paddle/fluid/metrics.py
@@ -169,7 +169,7 @@ class Accuracy(MetricBase):
         return self.value / self.weight
 
 
-class ChunkEvalutor(MetricBase):
+class ChunkEvaluator(MetricBase):
     """
     Accumulate counter numbers output by chunk_eval from mini-batches and
     compute the precision recall and F1-score using the accumulated counter
@@ -177,7 +177,7 @@ class ChunkEvalutor(MetricBase):
     """
 
     def __init__(self, name=None):
-        super(ChunkEvalutor, self).__init__(name)
+        super(ChunkEvaluator, self).__init__(name)
         self.num_infer_chunks = 0
         self.num_label_chunks = 0
         self.num_correct_chunks = 0
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index 5ce2aa1fc4d0b275b502af0f97e4a0f83e85de5b..07cc1e29341bd497e88097a9ee5653631b79d734 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -16,6 +16,7 @@ import core
 import multiprocessing
 import framework
 import executor
+import sys
 
 __all__ = ['ParallelExecutor']
 
@@ -123,28 +124,93 @@ class ParallelExecutor(object):
             allow_op_delay)
         self.scope = scope
 
-    def run(self, fetch_list, feed_dict={}):
+    def run(self, fetch_list, feed=None, feed_dict=None):
         """
-        :param fetch_list: A list of variable names that will be fetched.
-        :param feed_dict: A dict mapping for feed variable name to LoDTensor
-          or numpy array.
-        :return: fetched value list.
-        """
-        if not isinstance(feed_dict, dict):
-            raise TypeError("feed_dict should be a dict")
+        Run a parallel executor with fetch_list.
+
+        The feed parameter can be a dict or a list. If feed is a dict, the
+        feed data will be split into multiple devices. If feed is a list, we
+        assume the data has been splitted into multiple devices, the each
+        element in the list will be copied to each device directly.
+
+        For example, if the feed is a dict:
+        >>> exe = ParallelExecutor()
+        >>> # the image will be splitted into devices. If there is two devices
+        >>> # each device will process an image with shape (24, 1, 28, 28)
+        >>> exe.run(feed={'image': numpy.random.random(size=(48, 1, 28, 28))})
+
+        For example, if the feed is a list:
+        >>> exe = ParallelExecutor()
+        >>> # each device will process each element in the list.
+        >>> # the 1st device will process an image with shape (48, 1, 28, 28)
+        >>> # the 2nd device will process an image with shape (32, 1, 28, 28)
+        >>> #
+        >>> # you can use exe.device_count to get the device number.
+        >>> exe.run(feed=[{"image": numpy.random.random(size=(48, 1, 28, 28))},
+        >>>               {"image": numpy.random.random(size=(32, 1, 28, 28))},
+        >>>              ])
+
+
+        Args:
+            fetch_list(list): The fetched variable names
+            feed(list|dict|None): The feed variables. If the feed is a dict,
+                tensors in that dict will be splitted into each devices. If
+                the feed is a list, each element of the list will be copied
+                to each device.
+            feed_dict: Alias for feed parameter, for backward compatibility.
+                This parameter is deprecated.
 
-        feed_tensor_dict = {}
-        for i, feed_name in enumerate(feed_dict):
-            feed_tensor = feed_dict[feed_name]
-            if not isinstance(feed_tensor, core.LoDTensor):
-                feed_tensor = core.LoDTensor()
-                feed_tensor.set(feed_dict[feed_name], self._act_places[0])
-            feed_tensor_dict[feed_name] = feed_tensor
+        Returns: fetched result list.
+
+        """
+        if feed is None:
+            feed = feed_dict
+            print >> sys.stderr, "`feed_dict` is deprecated. Please use `feed=`"
+
+        if isinstance(feed, dict):
+            feed_tensor_dict = dict()
+            for feed_name in feed:
+                feed_tensor = feed[feed_name]
+                if not isinstance(feed_tensor, core.LoDTensor):
+                    feed_tensor = core.LoDTensor()
+                    # always set to CPU place, since the tensor need to be splitted
+                    # it is fast in CPU
+                    feed_tensor.set(feed[feed_name], core.CPUPlace())
+                feed_tensor_dict[feed_name] = feed_tensor
+
+            self.executor.feed_and_split_tensor_into_local_scopes(
+                feed_tensor_dict)
+        elif isinstance(feed, list) or isinstance(feed, tuple):
+            if len(feed) != len(self._act_places):
+                raise ValueError(
+                    "Feed a list of tensor, the list should be the same size as places"
+                )
+
+            res = list()
+
+            for i, each in enumerate(feed):
+                if not isinstance(each, dict):
+                    raise TypeError(
+                        "Each element of feed list should be a dict")
+                res_dict = dict()
+                for feed_name in each:
+                    tensor = each[feed_name]
+                    if not isinstance(tensor, core.LoDTensor):
+                        tmp = core.LoDTensor()
+                        tmp.set(tensor, self._act_places[i])
+                        tensor = tmp
+                    res_dict[feed_name] = tensor
+                res.append(res_dict)
+            self.executor.feed_tensors_into_local_scopes(res)
 
         fetch_var_name = '@FETCHED_VAR_NAME@'
-        self.executor.run(fetch_list, fetch_var_name, feed_tensor_dict)
+        self.executor.run(fetch_list, fetch_var_name)
         arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array()
         return [arr[i] for i in range(len(arr))]
 
     def bcast_params(self):
         self.executor.bcast_params(set(self.persistable_vars))
+
+    @property
+    def device_count(self):
+        return len(self._act_places)
diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
index 4d8bca4d2430a248ccf421572bdafdffc3a3003a..d9cd76952e31f8185512ab45f9f3ab2ce7d9da48 100644
--- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
@@ -12,17 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import contextlib
 import math
-
 import numpy as np
+import os
+import time
+import unittest
+
 import paddle
 import paddle.dataset.conll05 as conll05
 import paddle.fluid as fluid
-from paddle.fluid.initializer import init_on_cpu
-import contextlib
-import time
-import unittest
-import os
 
 word_dict, verb_dict, label_dict = conll05.get_dict()
 word_dict_len = len(word_dict)
diff --git a/python/paddle/fluid/tests/book/test_recognize_digits.py b/python/paddle/fluid/tests/book/test_recognize_digits.py
index e4997b4069f60ff4382b4254bc026ae8ae29b345..5ec6890c1b0dabd2804a92071b63c9610299e67c 100644
--- a/python/paddle/fluid/tests/book/test_recognize_digits.py
+++ b/python/paddle/fluid/tests/book/test_recognize_digits.py
@@ -157,7 +157,6 @@ def train(nn_type,
         for ip in pserver_ips.split(","):
             eplist.append(':'.join([ip, port]))
         pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
-        pserver_endpoints = os.getenv("PSERVERS")
         trainers = int(os.getenv("TRAINERS"))
         current_endpoint = os.getenv("POD_IP") + ":" + port
         trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 3bd24c98a22b5db9833a312f481ed74c3d26f0ad..8f17eeea139c86055f0d4a06b21cbf66d8395cdc 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -1,10 +1,13 @@
 file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
 
-# The fully connected test is removed whe the WITH_MKLDNN flag is OFF
-# Because the fully connected layer has only one kernel (MKLDNN)
+# The MKLDNN tests are skiped when the MKLDNN flag is OFF
 if(NOT WITH_MKLDNN)
-    list(REMOVE_ITEM TEST_OPS test_fc_op)
+    foreach(src ${TEST_OPS})
+        if(${src} MATCHES ".*_mkldnn_op$")
+            list(REMOVE_ITEM TEST_OPS ${src})
+        endif()
+    endforeach()
 endif(NOT WITH_MKLDNN)
 
 if(NOT WITH_DISTRIBUTE)
@@ -34,6 +37,8 @@ function(py_test_modules TARGET_NAME)
   endif()
 endfunction()
 
+list(REMOVE_ITEM TEST_OPS test_sequence_expand)
+
 # test time consuming OPs in a separate process for expliot parallism
 list(REMOVE_ITEM TEST_OPS test_parallel_executor)
 list(REMOVE_ITEM TEST_OPS test_warpctc_op)
@@ -70,6 +75,8 @@ else()
     endforeach(TEST_OP)
 endif(WITH_FAST_BUNDLE_TEST)
 
+#
+py_test_modules(test_sequence_expand MODULES test_sequence_expand)
 # tests with high overhead
 py_test_modules(test_parallel_executor MODULES test_parallel_executor)
 py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR})
diff --git a/python/paddle/fluid/tests/unittests/test_activation_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_activation_mkldnn_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d554c2276c9acd710d14c8f8b32c802e3e17515
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_activation_mkldnn_op.py
@@ -0,0 +1,99 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import paddle.fluid.core as core
+from op_test import OpTest
+from scipy.special import expit
+from test_activation_op import TestRelu, TestTanh, TestSqrt, TestAbs
+
+
+class TestMKLDNNReluDim2(TestRelu):
+    def setUp(self):
+        super(TestMKLDNNReluDim2, self).setUp()
+
+        self.attrs = {"use_mkldnn": True}
+
+
+class TestMKLDNNTanhDim2(TestTanh):
+    def setUp(self):
+        super(TestMKLDNNTanhDim2, self).setUp()
+
+        self.attrs = {"use_mkldnn": True}
+
+
+class TestMKLDNNSqrtDim2(TestSqrt):
+    def setUp(self):
+        super(TestMKLDNNSqrtDim2, self).setUp()
+
+        self.attrs = {"use_mkldnn": True}
+
+
+class TestMKLDNNAbsDim2(TestAbs):
+    def setUp(self):
+        super(TestMKLDNNAbsDim2, self).setUp()
+        self.attrs = {"use_mkldnn": True}
+
+
+class TestMKLDNNReluDim4(TestRelu):
+    def setUp(self):
+        super(TestMKLDNNReluDim4, self).setUp()
+
+        x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32")
+        # The same reason with TestAbs
+        x[np.abs(x) < 0.005] = 0.02
+        out = np.maximum(x, 0)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
+        self.attrs = {"use_mkldnn": True}
+
+
+class TestMKLDNNTanhDim4(TestTanh):
+    def setUp(self):
+        super(TestMKLDNNTanhDim4, self).setUp()
+
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32")
+        }
+        self.outputs = {'Out': np.tanh(self.inputs['X'])}
+        self.attrs = {"use_mkldnn": True}
+
+
+class TestMKLDNNSqrtDim4(TestSqrt):
+    def setUp(self):
+        super(TestMKLDNNSqrtDim4, self).setUp()
+
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32")
+        }
+        self.outputs = {'Out': np.sqrt(self.inputs['X'])}
+        self.attrs = {"use_mkldnn": True}
+
+
+class TestMKLDNNAbsDim4(TestAbs):
+    def setUp(self):
+        super(TestMKLDNNAbsDim4, self).setUp()
+
+        x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32")
+        # The same reason with TestAbs
+        x[np.abs(x) < 0.005] = 0.02
+        self.inputs = {'X': x}
+        self.outputs = {'Out': np.abs(self.inputs['X'])}
+        self.attrs = {"use_mkldnn": True}
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py
index 57d4a50e913c0d2994c62600f4e479056ed4c306..c9069777faf9d141db93184e8b1e6dc2a7034980 100644
--- a/python/paddle/fluid/tests/unittests/test_activation_op.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_op.py
@@ -1098,82 +1098,5 @@ class TestFP16Swish(TestSwish):
                 self.check_output_with_place(place, atol=1e-3)
 
 
-#--------------------test MKLDNN--------------------
-class TestMKLDNNReluDim2(TestRelu):
-    def setUp(self):
-        super(TestMKLDNNReluDim2, self).setUp()
-
-        self.attrs = {"use_mkldnn": True}
-
-
-class TestMKLDNNTanhDim2(TestTanh):
-    def setUp(self):
-        super(TestMKLDNNTanhDim2, self).setUp()
-
-        self.attrs = {"use_mkldnn": True}
-
-
-class TestMKLDNNSqrtDim2(TestSqrt):
-    def setUp(self):
-        super(TestMKLDNNSqrtDim2, self).setUp()
-
-        self.attrs = {"use_mkldnn": True}
-
-
-class TestMKLDNNAbsDim2(TestAbs):
-    def setUp(self):
-        super(TestMKLDNNAbsDim2, self).setUp()
-
-        self.attrs = {"use_mkldnn": True}
-
-
-class TestMKLDNNReluDim4(TestRelu):
-    def setUp(self):
-        super(TestMKLDNNReluDim4, self).setUp()
-
-        x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32")
-        # The same reason with TestAbs
-        x[np.abs(x) < 0.005] = 0.02
-        out = np.maximum(x, 0)
-
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
-        self.outputs = {'Out': out}
-        self.attrs = {"use_mkldnn": True}
-
-
-class TestMKLDNNTanhDim4(TestTanh):
-    def setUp(self):
-        super(TestMKLDNNTanhDim4, self).setUp()
-
-        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32")
-        }
-        self.outputs = {'Out': np.tanh(self.inputs['X'])}
-        self.attrs = {"use_mkldnn": True}
-
-
-class TestMKLDNNSqrtDim4(TestSqrt):
-    def setUp(self):
-        super(TestMKLDNNSqrtDim4, self).setUp()
-
-        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32")
-        }
-        self.outputs = {'Out': np.sqrt(self.inputs['X'])}
-        self.attrs = {"use_mkldnn": True}
-
-
-class TestMKLDNNAbsDim4(TestAbs):
-    def setUp(self):
-        super(TestMKLDNNAbsDim4, self).setUp()
-
-        x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32")
-        # The same reason with TestAbs
-        x[np.abs(x) < 0.005] = 0.02
-        self.inputs = {'X': x}
-        self.outputs = {'Out': np.abs(self.inputs['X'])}
-        self.attrs = {"use_mkldnn": True}
-
-
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_mkldnn_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..db6be21baaa54d33af9f5c44d1815e4b389eb884
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_mkldnn_op.py
@@ -0,0 +1,36 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from test_conv2d_op import TestConv2dOp, TestWithPad, TestWithStride
+
+
+class TestMKLDNN(TestConv2dOp):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+class TestMKLDNNWithPad(TestWithPad):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+class TestMKLDNNWithStride(TestWithStride):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py
index 65606a0b4373b28036096cf046da5143a3b8bcd0..a478649541ba9828e55c4239090d5aee554223ac 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py
@@ -373,22 +373,5 @@ class TestDepthwiseConv2(TestConv2dOp):
 #     def init_op_type(self):
 #         self.op_type = "conv_cudnn"
 
-
-#----------------Conv2dMKLDNN----------------
-class TestMKLDNN(TestConv2dOp):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-class TestMKLDNNWithPad(TestWithPad):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-class TestMKLDNNWithStride(TestWithStride):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fc_op.py b/python/paddle/fluid/tests/unittests/test_fc_mkldnn_op.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/test_fc_op.py
rename to python/paddle/fluid/tests/unittests/test_fc_mkldnn_op.py
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index f88a6f1ce6e953c54da29f9e96199169b2cecd8b..a1be2d671ddc5c689b16319fcf5bf12dca5dde7e 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -340,6 +340,16 @@ class TestBook(unittest.TestCase):
             print(layers.lod_reset(x=x, y=y))
         print(str(program))
 
+    def test_label_smooth(self):
+        program = Program()
+        with program_guard(program):
+            label = layers.data(name="label", shape=[1], dtype="float32")
+            one_hot_label = layers.one_hot(input=label, depth=10)
+            smooth_label = layers.label_smooth(
+                label=one_hot_label, epsilon=0.1, dtype="float32")
+            self.assertIsNotNone(smooth_label)
+        print(str(program))
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_lrn_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_lrn_mkldnn_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..966a16dc870c041b9deb140bed57d907cf305fd8
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_lrn_mkldnn_op.py
@@ -0,0 +1,49 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+from test_lrn_op import TestLRNOp
+
+
+class TestLRNMKLDNNOp(TestLRNOp):
+    def get_attrs(self):
+        attrs = TestLRNOp.get_attrs(self)
+        attrs['use_mkldnn'] = True
+        return attrs
+
+    def test_check_output(self):
+        self.check_output(atol=0.002)
+
+
+class TestLRNMKLDNNOpWithIsTest(TestLRNMKLDNNOp):
+    def get_attrs(self):
+        attrs = TestLRNMKLDNNOp.get_attrs(self)
+        attrs['is_test'] = True
+        return attrs
+
+    def test_check_grad_normal(self):
+        def check_raise_is_test():
+            try:
+                self.check_grad(['X'], 'Out', max_relative_error=0.01)
+            except Exception as e:
+                t = \
+                "is_test attribute should be set to False in training phase."
+                if t in str(e):
+                    raise AttributeError
+
+        self.assertRaises(AttributeError, check_raise_is_test)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_lrn_op.py b/python/paddle/fluid/tests/unittests/test_lrn_op.py
index 8fa480b9bce84d2936f23cce9e41e8e54014b074..eaff45cbb2a58798e9d55149510bec72eea370cd 100644
--- a/python/paddle/fluid/tests/unittests/test_lrn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lrn_op.py
@@ -87,34 +87,5 @@ class TestLRNOp(OpTest):
         self.check_grad(['X'], 'Out', max_relative_error=0.01)
 
 
-class TestLRNMKLDNNOp(TestLRNOp):
-    def get_attrs(self):
-        attrs = TestLRNOp.get_attrs(self)
-        attrs['use_mkldnn'] = True
-        return attrs
-
-    def test_check_output(self):
-        self.check_output(atol=0.002)
-
-
-class TestLRNMKLDNNOpWithIsTest(TestLRNMKLDNNOp):
-    def get_attrs(self):
-        attrs = TestLRNMKLDNNOp.get_attrs(self)
-        attrs['is_test'] = True
-        return attrs
-
-    def test_check_grad_normal(self):
-        def check_raise_is_test():
-            try:
-                self.check_grad(['X'], 'Out', max_relative_error=0.01)
-            except Exception as e:
-                t = \
-                "is_test attribute should be set to False in training phase."
-                if t in str(e):
-                    raise AttributeError
-
-        self.assertRaises(AttributeError, check_raise_is_test)
-
-
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py
index 95845ea4de54ad43754ec5811d28ed52a8a3ae86..3ddafbbc57b29d506158bcb57188ab96f814e0d3 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py
@@ -203,31 +203,32 @@ class TestParallelExecutorBase(unittest.TestCase):
                                   iter=10,
                                   batch_size=None,
                                   allow_op_delay=False,
-                                  feed_dict={}):
+                                  feed_dict=None):
         main = fluid.Program()
         startup = fluid.Program()
+        startup.random_seed = 1  # Fix random seed
         with fluid.program_guard(main, startup):
-            loss = method(use_feed=len(feed_dict) > 0)
+            loss = method(use_feed=feed_dict is not None)
             adam = fluid.optimizer.Adam()
             adam.minimize(loss)
             if memory_opt:
                 fluid.memory_optimize(main)
-
             place = fluid.CUDAPlace(0)
             startup_exe = fluid.Executor(place)
             startup_exe.run(startup)
 
-            exe = fluid.ParallelExecutor(True, loss_name=loss.name)
+            exe = fluid.ParallelExecutor(
+                True, loss_name=loss.name, allow_op_delay=allow_op_delay)
             if batch_size is not None:
                 batch_size *= fluid.core.get_cuda_device_count()
             begin = time.time()
-            first_loss, = exe.run([loss.name], feed_dict=feed_dict)
+            first_loss, = exe.run([loss.name], feed=feed_dict)
             first_loss = numpy.array(first_loss)
 
             for i in xrange(iter):
-                exe.run([], feed_dict=feed_dict)
+                exe.run([], feed=feed_dict)
 
-            last_loss, = exe.run([loss.name], feed_dict=feed_dict)
+            last_loss, = exe.run([loss.name], feed=feed_dict)
             end = time.time()
 
             if batch_size is not None:
@@ -505,3 +506,148 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase):
                         train_loss, test_loss, atol=1e-8),
                     "Train loss: " + str(train_loss) + "\n Test loss:" +
                     str(test_loss))
+
+
+import paddle.dataset.conll05 as conll05
+import paddle.fluid as fluid
+
+word_dict, verb_dict, label_dict = conll05.get_dict()
+word_dict_len = len(word_dict)
+label_dict_len = len(label_dict)
+pred_dict_len = len(verb_dict)
+mark_dict_len = 2
+word_dim = 32
+mark_dim = 5
+hidden_dim = 512
+depth = 8
+mix_hidden_lr = 1e-3
+embedding_name = 'emb'
+
+
+def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
+            **ignored):
+    # 8 features
+    predicate_embedding = fluid.layers.embedding(
+        input=predicate,
+        size=[pred_dict_len, word_dim],
+        dtype='float32',
+        param_attr='vemb')
+
+    mark_embedding = fluid.layers.embedding(
+        input=mark, size=[mark_dict_len, mark_dim], dtype='float32')
+
+    word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
+    emb_layers = [
+        fluid.layers.embedding(
+            size=[word_dict_len, word_dim],
+            input=x,
+            param_attr=fluid.ParamAttr(
+                name=embedding_name, trainable=False)) for x in word_input
+    ]
+    emb_layers.append(predicate_embedding)
+    emb_layers.append(mark_embedding)
+
+    hidden_0_layers = [
+        fluid.layers.fc(input=emb, size=hidden_dim, act='tanh')
+        for emb in emb_layers
+    ]
+
+    hidden_0 = fluid.layers.sums(input=hidden_0_layers)
+
+    lstm_0 = fluid.layers.dynamic_lstm(
+        input=hidden_0,
+        size=hidden_dim,
+        candidate_activation='relu',
+        gate_activation='sigmoid',
+        cell_activation='sigmoid')
+
+    # stack L-LSTM and R-LSTM with direct edges
+    input_tmp = [hidden_0, lstm_0]
+
+    for i in range(1, depth):
+        mix_hidden = fluid.layers.sums(input=[
+            fluid.layers.fc(input=input_tmp[0], size=hidden_dim, act='tanh'),
+            fluid.layers.fc(input=input_tmp[1], size=hidden_dim, act='tanh')
+        ])
+
+        lstm = fluid.layers.dynamic_lstm(
+            input=mix_hidden,
+            size=hidden_dim,
+            candidate_activation='relu',
+            gate_activation='sigmoid',
+            cell_activation='sigmoid',
+            is_reverse=((i % 2) == 1))
+
+        input_tmp = [mix_hidden, lstm]
+
+    feature_out = fluid.layers.sums(input=[
+        fluid.layers.fc(input=input_tmp[0], size=label_dict_len, act='tanh'),
+        fluid.layers.fc(input=input_tmp[1], size=label_dict_len, act='tanh')
+    ])
+
+    return feature_out
+
+
+class TestCRFModel(unittest.TestCase):
+    def test_all(self):
+        main = fluid.Program()
+        startup = fluid.Program()
+        with fluid.program_guard(main, startup):
+            word = fluid.layers.data(
+                name='word_data', shape=[1], dtype='int64', lod_level=1)
+            predicate = fluid.layers.data(
+                name='verb_data', shape=[1], dtype='int64', lod_level=1)
+            ctx_n2 = fluid.layers.data(
+                name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
+            ctx_n1 = fluid.layers.data(
+                name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
+            ctx_0 = fluid.layers.data(
+                name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
+            ctx_p1 = fluid.layers.data(
+                name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
+            ctx_p2 = fluid.layers.data(
+                name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
+            mark = fluid.layers.data(
+                name='mark_data', shape=[1], dtype='int64', lod_level=1)
+            feature_out = db_lstm(**locals())
+            target = fluid.layers.data(
+                name='target', shape=[1], dtype='int64', lod_level=1)
+            crf_cost = fluid.layers.linear_chain_crf(
+                input=feature_out,
+                label=target,
+                param_attr=fluid.ParamAttr(
+                    name='crfw', learning_rate=1e-1))
+            avg_cost = fluid.layers.mean(crf_cost)
+
+            sgd_optimizer = fluid.optimizer.SGD(
+                learning_rate=fluid.layers.exponential_decay(
+                    learning_rate=0.01,
+                    decay_steps=100000,
+                    decay_rate=0.5,
+                    staircase=True))
+            sgd_optimizer.minimize(avg_cost)
+
+            train_data = paddle.batch(
+                paddle.reader.shuffle(
+                    paddle.dataset.conll05.test(), buf_size=8192),
+                batch_size=16)
+
+            place = fluid.CUDAPlace(0)
+            exe = fluid.Executor(place)
+            exe.run(startup)
+
+            pe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name)
+
+            feeder = fluid.DataFeeder(
+                feed_list=[
+                    word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate,
+                    mark, target
+                ],
+                place=fluid.CPUPlace())
+
+            data = train_data()
+            for i in xrange(10):
+                cur_batch = next(data)
+                print map(numpy.array,
+                          pe.run(feed=feeder.feed(cur_batch),
+                                 fetch_list=[avg_cost.name]))[0]
diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_pool2d_mkldnn_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..003ebba18b26198427d9f313596ae85656ac24fa
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_pool2d_mkldnn_op.py
@@ -0,0 +1,50 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+from test_pool2d_op import TestPool2d_Op, TestCase1, TestCase2, TestCase3, TestCase4, TestCase5
+
+
+class TestMKLDNNCase1(TestPool2d_Op):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+class TestMKLDNNCase2(TestCase1):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+class TestMKLDNNCase3(TestCase2):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+class TestMKLDNNCase4(TestCase3):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+class TestMKLDNNCase5(TestCase4):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+class TestMKLDNNCase6(TestCase5):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_op.py b/python/paddle/fluid/tests/unittests/test_pool2d_op.py
index 764fa575fba1615de3171e848890b3836e640849..328a9ffd25b9fce3fd45bbe847e365f090acd17c 100644
--- a/python/paddle/fluid/tests/unittests/test_pool2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pool2d_op.py
@@ -317,36 +317,5 @@ class TestCeilModeCase4(TestCase2):
         self.ceil_mode = True
 
 
-#--------------------test pool2d MKLDNN--------------------
-class TestMKLDNNCase1(TestPool2d_Op):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-class TestMKLDNNCase2(TestCase1):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-class TestMKLDNNCase3(TestCase2):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-class TestMKLDNNCase4(TestCase3):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-class TestMKLDNNCase5(TestCase4):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
-class TestMKLDNNCase6(TestCase5):
-    def init_kernel_type(self):
-        self.use_mkldnn = True
-
-
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_sequence_expand.py b/python/paddle/fluid/tests/unittests/test_sequence_expand.py
index 7feb509c4d6f5768552fc2515081f7e68f420967..4c8ec1426c6e103498af544ea5928ec630707d46 100644
--- a/python/paddle/fluid/tests/unittests/test_sequence_expand.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_expand.py
@@ -47,8 +47,10 @@ class TestSequenceExpand(OpTest):
             x_len = x_idx[i] - x_idx[i - 1]
             if repeat_num > 0:
                 x_sub = x_data[x_idx[i - 1]:x_idx[i], :]
-                x_sub = np.repeat(x_sub, repeat_num, axis=0)
-                out = np.vstack((out, x_sub))
+                stacked_x_sub = x_sub
+                for r in range(repeat_num - 1):
+                    stacked_x_sub = np.vstack((stacked_x_sub, x_sub))
+                out = np.vstack((out, stacked_x_sub))
                 if x_lod is not None:
                     for j in xrange(repeat_num):
                         out_lod[0].append(out_lod[0][-1] + x_len)
@@ -101,11 +103,11 @@ class TestSequenceExpandCase3(TestSequenceExpand):
 
 class TestSequenceExpandCase4(TestSequenceExpand):
     def set_data(self):
-        data = [0.1, 0.3, 0.2, 0.15, 0.25, 0.2, 0.15, 0.25, 0.1, 0.3]
+        data = np.random.uniform(0.1, 1, [5 * 2, 1])
         x_data = np.array(data).reshape([5, 2]).astype('float32')
         x_lod = [[0, 2, 5]]
-        y_data = np.random.uniform(0.1, 1, [2, 1]).astype('float32')
-        y_lod = [[0, 1, 2], [0, 1, 2]]
+        y_data = np.random.uniform(0.1, 1, [3, 1]).astype('float32')
+        y_lod = [[0, 1, 3], [0, 1, 3]]
         self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py
index 75ff85a55fc4fd504ecd032e17f7e189c17192fb..346a949b6e7c96b5535f5e65ddbada11e110a0a7 100644
--- a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py
+++ b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py
@@ -15,6 +15,16 @@
 import unittest
 import numpy as np
 from op_test import OpTest
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
+
+
+def output_hist(out):
+    hist, _ = np.histogram(out, range=(-5, 10))
+    hist = hist.astype("float32")
+    hist /= float(out.size)
+    prob = 0.1 * np.ones((10))
+    return hist, prob
 
 
 class TestUniformRandomOp(OpTest):
@@ -33,11 +43,37 @@ class TestUniformRandomOp(OpTest):
         self.check_output_customized(self.verify_output)
 
     def verify_output(self, outs):
-        tensor = outs[0]
-        hist, _ = np.histogram(outs[0], range=(-5, 10))
-        hist = hist.astype("float32")
-        hist /= float(outs[0].size)
-        prob = 0.1 * np.ones((10))
+        hist, prob = output_hist(np.array(outs[0]))
+        self.assertTrue(
+            np.allclose(
+                hist, prob, rtol=0, atol=0.01), "hist: " + str(hist))
+
+
+class TestUniformRandomOpSelectedRows(unittest.TestCase):
+    def get_places(self):
+        places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(core.CUDAPlace(0))
+        return places
+
+    def test_check_output(self):
+        for place in self.get_places():
+            self.check_with_place(place)
+
+    def check_with_place(self, place):
+        scope = core.Scope()
+        out = scope.var("X").get_selected_rows()
+
+        op = Operator(
+            "uniform_random",
+            Out="X",
+            shape=[4, 784],
+            min=-5.0,
+            max=10.0,
+            seed=10)
+        op.run(scope, place)
+        self.assertEqual(out.get_tensor().shape(), [4, 784])
+        hist, prob = output_hist(np.array(out.get_tensor()))
         self.assertTrue(
             np.allclose(
                 hist, prob, rtol=0, atol=0.01), "hist: " + str(hist))
diff --git a/python/paddle/v2/reader/__init__.py b/python/paddle/v2/reader/__init__.py
index 3b059735a924d58714cd88a761eb83143f1192d6..12efdc4a0fec83fed57bdcbf687aaec69d13ba91 100644
--- a/python/paddle/v2/reader/__init__.py
+++ b/python/paddle/v2/reader/__init__.py
@@ -50,7 +50,7 @@ An example implementation for single item data reader creator:
         def reader():
             while True:
                 yield numpy.random.uniform(-1, 1, size=width*height)
-    return reader
+        return reader
 
 An example implementation for multiple item data reader creator:
 
@@ -60,7 +60,7 @@ An example implementation for multiple item data reader creator:
         def reader():
             while True:
                 yield numpy.random.uniform(-1, 1, size=width*height), label
-    return reader
+        return reader
 
 
 TODO(yuyang18): Should we add whole design doc here?
diff --git a/tools/aws_benchmarking/README.md b/tools/aws_benchmarking/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..837fcbb8512bce027ecd09a7f39b806151e9154b
--- /dev/null
+++ b/tools/aws_benchmarking/README.md
@@ -0,0 +1,160 @@
+# AWS benchmark testing tool
+This is an automation tool for deploying paddlepaddle benchmark testing to AWS.
+
+## Features
+
+ - subnet creation to fit just the amount of ec2 instances required.
+ - pserver and trainer ec2 instances allocation, and instance state verification
+ - nvidia-docker ready for GPU training
+ - Instances and network element garbage collection when a task is accomplished or an error occurred
+ - Test log is collected in realtime
+ - Web service for checking log or tearing down the testing setup
+ - No testing code change needed
+ - Lots of optional configuration options
+
+ ## Usages
+
+ ### Prerequisites
+
+ - You have a working AWS account
+ - You have [AWS Command Line Interface](https://aws.amazon.com/cli/) installed
+ - Your AWS cli is bind with a account which has `AmazonEC2FullAccess` permission, and it's set as default credential.
+ - You have key pair created and pem file downloaded.
+ - You have a default VPC in the region you want to run the test.
+ - You have a Security Group created for the VPC mentioned above, which allows port 22 and the port you want to expose your control web service (5436 by default)
+ - If your test is supposed to run in a GPU machine, especially a multi card GPU machine (p2, p3 series), you might need to contact amazon to raise the limit which allows no more than 1 GPU instance at a time.
+
+ ### Start a benchmark test
+
+#### Create training image
+
+*What to expect in this step:*
+
+*You will have your training logic packed with paddle runtime in a docker image, and be able to be picked up by AWS instance for training.*
+
+Training python script and PaddlePaddle runtime are supposed to be packed into one docker image. Use PaddlePaddle production images as base image and create the training images with the docker file as follows:
+
+```Dockerfile
+FROM paddlepaddle/paddle:latest-gpu
+
+ENV HOME /root
+COPY ./ /root/
+WORKDIR /root
+RUN pip install -r /root/requirements.txt
+ENTRYPOINT ["python", "my_training.py"]
+```
+
+***Please Note***
+Training nodes will run your `ENTRYPOINT` script with the following environment variables:
+
+ - `TASK_NAME`: unique name to identify this training process.
+ - `TRAINING_ROLE`: current node's role in this training process, either "PSERVER" or "TRAINER"
+ - `PSERVER_HOSTS`: comma separated value of pserver end points, I.E. "192.168.1.2:5436,192.168.1.3:5436"
+ - `PSERVERS`: same as above
+ - `TRAINERS`: trainer count
+ - `SERVER_ENDPOINT`: current server end point if the node role is a pserver
+ - `TRAINER_INDEX`: an integer to identify the index of current trainer if the node role is a trainer.
+ - `PADDLE_INIT_TRAINER_ID`: same as above
+
+ Now we have a working distributed training script which takes advantage of node environment variables and docker file to generate the training image. Run the following command:
+
+ ```bash
+ docker build -t myreponname/paddle_benchmark .
+ ```
+
+ Now you have the image built and tagged with `myreponame/paddle_benchmark`, let's push it to dockerhub so that it can be picked up by out AWS instance.
+
+ ```bash
+ docker push myreponame/paddle_benchmark
+ ```
+
+#### Create instances and start training
+
+*What to expect in this step*
+
+*you will be asked to provide some basic settings to config your training, and this tool will have your training started and monitored*
+
+Now let's start the training process:
+
+```bash
+docker run -i -v $HOME/.aws:/root/.aws -v <full path to your pem file>:/root/<key pare name>.pem \
+putcn/paddle_aws_client \
+--action create \
+--key_name <your key pare name> \
+--security_group_id <your security group id> \
+--docker_image myreponame/paddle_benchmark \
+--pserver_count 2 \
+--trainer_count 2
+```
+
+Now just wait until you see this:
+```
+master server finished init process, visit http://XXX:XXX/status to check master log
+```
+That means you can turn off your laptop and your cluster is creating instances, starting training process, collecting logs and eventually shut all pservers and trainers down when training is finished.
+
+#### Post creation operations
+
+To access the master log:
+
+```bash
+docker run -i -v $HOME/.aws:/root/.aws \
+putcn/paddle_aws_client \
+--action status \
+--master_server_public_ip <master ip> \
+--master_server_port <master port>
+```
+
+To tear down the training setup:
+
+```bash
+docker run -i -v $HOME/.aws:/root/.aws \
+putcn/paddle_aws_client \
+--action cleanup \
+--master_server_public_ip <master ip> \
+--master_server_port <master port>
+```
+
+To retrieve training logs
+TBD
+
+### Tech details
+
+*What to expect in this step*
+
+*You will understand what is happening behind the scene, and how to check the training log, how to tear down the training on the fly, etc.*
+
+Let's understand what is happening under the hood when you run above command in your laptop
+
+![alt](diagram.png)
+
+There are 4 roles in the figure above:
+ - client: your laptop
+ - master: who tasks to aws api server to create/tear down instances, and monitor training process
+ - AWS api server: the one who actually creates and manages instances
+ - pservers and trainers: training instances
+
+When you run the `docker run` command above, what it actually does is to ask aws api service to create a subnet (step 1) and a master instance (step 2), and pass all the parameters the client collected or generated (step 3). The master is kept as minimum hardware config to keep the running cost low.
+
+Then when the master is up and running, it will ask the aws api server to create the heavy lifting training instances who are expensive to run (step 4). And the master will start training process as soon as they are done initializing (step 5).
+
+Meanwhile, the master will expose a web service for client to check training log or even tear the training setup down by a web service call.
+
+if you are creating the training with client docker container, and also monitoring your aws dashboard, you will initially see a instance tagged with `ROLE=MASTER` and `TASK_NAME=<yourtask name>_master` starts, then you will see several instances tagged with `ROLE=PSERVER` and `ROLE=TRAINER` starts.
+When the training is finished, pservers and trainers will be terminated. All their logs are kept in master node's docker env.
+
+Master exposes 4 major services:
+
+ - GET `/status`: return master log
+ - GET `/logs`: return list of log file names
+ - GET `/log/<logfile name>`: return a particular log by log file name
+ - POST `/cleanup`: teardown the whole setup
+
+
+### Parameters
+
+TBD, please refer to client/cluster_launcher.py for now
+
+### Trouble shooting
+
+TBD
diff --git a/tools/aws_benchmarking/client/Dockerfile b/tools/aws_benchmarking/client/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..812c5d4bce0adff404577ce6b5fd3f0f4a91118c
--- /dev/null
+++ b/tools/aws_benchmarking/client/Dockerfile
@@ -0,0 +1,7 @@
+FROM python:2.7.14-stretch
+
+ENV HOME /root
+COPY ./ /root/
+WORKDIR /root
+RUN pip install -r /root/requirements.txt
+ENTRYPOINT ["python", "cluster_launcher.py"]
\ No newline at end of file
diff --git a/tools/aws_benchmarking/client/cluster_launcher.py b/tools/aws_benchmarking/client/cluster_launcher.py
new file mode 100644
index 0000000000000000000000000000000000000000..594378ff8fc0744a4b11b1c11e2e3b270be7aed0
--- /dev/null
+++ b/tools/aws_benchmarking/client/cluster_launcher.py
@@ -0,0 +1,407 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import os
+import time
+import math
+import logging
+import copy
+
+import netaddr
+import boto3
+import namesgenerator
+import paramiko
+from scp import SCPClient
+import requests
+
+
+def str2bool(v):
+    if v.lower() in ('yes', 'true', 't', 'y', '1'):
+        return True
+    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
+        return False
+    else:
+        raise argparse.ArgumentTypeError('Boolean value expected.')
+
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    '--key_name', type=str, default="", help="required, key pair name")
+parser.add_argument(
+    '--security_group_id',
+    type=str,
+    default="",
+    help="required, the security group id associated with your VPC")
+
+parser.add_argument(
+    '--vpc_id',
+    type=str,
+    default="",
+    help="The VPC in which you wish to run test")
+parser.add_argument(
+    '--subnet_id',
+    type=str,
+    default="",
+    help="The Subnet_id in which you wish to run test")
+
+parser.add_argument(
+    '--pserver_instance_type',
+    type=str,
+    default="c5.2xlarge",
+    help="your pserver instance type, c5.2xlarge by default")
+parser.add_argument(
+    '--trainer_instance_type',
+    type=str,
+    default="p2.8xlarge",
+    help="your trainer instance type, p2.8xlarge by default")
+
+parser.add_argument(
+    '--task_name',
+    type=str,
+    default="",
+    help="the name you want to identify your job")
+parser.add_argument(
+    '--pserver_image_id',
+    type=str,
+    default="ami-da2c1cbf",
+    help="ami id for system image, default one has nvidia-docker ready, \
+    use ami-1ae93962 for us-east-2")
+
+parser.add_argument(
+    '--pserver_command', type=str, default="", help="pserver start command")
+
+parser.add_argument(
+    '--trainer_image_id',
+    type=str,
+    default="ami-da2c1cbf",
+    help="ami id for system image, default one has nvidia-docker ready, \
+    use ami-1ae93962 for us-west-2")
+
+parser.add_argument(
+    '--trainer_command', type=str, default="", help="trainer start command")
+
+parser.add_argument(
+    '--availability_zone',
+    type=str,
+    default="us-east-2a",
+    help="aws zone id to place ec2 instances")
+
+parser.add_argument(
+    '--trainer_count', type=int, default=1, help="Trainer count")
+
+parser.add_argument(
+    '--pserver_count', type=int, default=1, help="Pserver count")
+
+parser.add_argument(
+    '--action', type=str, default="create", help="create|cleanup|status")
+
+parser.add_argument('--pem_path', type=str, help="private key file")
+
+parser.add_argument(
+    '--pserver_port', type=str, default="5436", help="pserver port")
+
+parser.add_argument(
+    '--docker_image', type=str, default="busybox", help="training docker image")
+
+parser.add_argument(
+    '--master_server_port', type=int, default=5436, help="master server port")
+
+parser.add_argument(
+    '--master_server_public_ip', type=str, help="master server public ip")
+
+parser.add_argument(
+    '--master_docker_image',
+    type=str,
+    default="putcn/paddle_aws_master:latest",
+    help="master docker image id")
+
+parser.add_argument(
+    '--no_clean_up',
+    type=str2bool,
+    default=False,
+    help="whether to clean up after training")
+
+args = parser.parse_args()
+
+logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
+
+ec2client = boto3.client('ec2')
+
+
+def print_arguments():
+    print('-----------  Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+
+
+def create_subnet():
+    # if no vpc id provided, list vpcs
+    logging.info("start creating subnet")
+    if not args.vpc_id:
+        logging.info("no vpc provided, trying to find the default one")
+        vpcs_desc = ec2client.describe_vpcs(
+            Filters=[{
+                "Name": "isDefault",
+                "Values": ["true", ]
+            }], )
+        if len(vpcs_desc["Vpcs"]) == 0:
+            raise ValueError('No default VPC')
+        args.vpc_id = vpcs_desc["Vpcs"][0]["VpcId"]
+        vpc_cidrBlock = vpcs_desc["Vpcs"][0]["CidrBlock"]
+
+        logging.info("default vpc fount with id %s and CidrBlock %s" %
+                     (args.vpc_id, vpc_cidrBlock))
+
+    if not vpc_cidrBlock:
+        logging.info("trying to find cidrblock for vpc")
+        vpcs_desc = ec2client.describe_vpcs(
+            Filters=[{
+                "Name": "vpc-id",
+                "Values": [args.vpc_id, ],
+            }], )
+        if len(vpcs_desc["Vpcs"]) == 0:
+            raise ValueError('No VPC found')
+        vpc_cidrBlock = vpcs_desc["Vpcs"][0]["CidrBlock"]
+        logging.info("cidrblock for vpc is %s" % vpc_cidrBlock)
+
+    # list subnets in vpc in order to create a new one
+
+    logging.info("trying to find ip blocks for new subnet")
+    subnets_desc = ec2client.describe_subnets(
+        Filters=[{
+            "Name": "vpc-id",
+            "Values": [args.vpc_id, ],
+        }], )
+
+    ips_taken = []
+    for subnet_dec in subnets_desc["Subnets"]:
+        ips_taken.append(subnet_dec["CidrBlock"])
+
+    ip_blocks_avaliable = netaddr.IPSet(
+        [vpc_cidrBlock]) ^ netaddr.IPSet(ips_taken)
+    # adding 10 addresses as buffer
+    cidr_prefix = 32 - math.ceil(
+        math.log(args.pserver_count + args.trainer_count + 10, 2))
+    if cidr_prefix <= 16:
+        raise ValueError('Too many nodes to fit in current VPC')
+
+    for ipnetwork in ip_blocks_avaliable.iter_cidrs():
+        try:
+            subnet_cidr = ipnetwork.subnet(int(cidr_prefix)).next()
+            logging.info("subnet ip block found %s" % (subnet_cidr))
+            break
+        except Exception:
+            pass
+
+    if not subnet_cidr:
+        raise ValueError(
+            'No avaliable subnet to fit required nodes in current VPC')
+
+    logging.info("trying to create subnet")
+    subnet_desc = ec2client.create_subnet(
+        CidrBlock=str(subnet_cidr),
+        VpcId=args.vpc_id,
+        AvailabilityZone=args.availability_zone)
+
+    subnet_id = subnet_desc["Subnet"]["SubnetId"]
+
+    subnet_waiter = ec2client.get_waiter('subnet_available')
+    # sleep for 1s before checking its state
+    time.sleep(1)
+    subnet_waiter.wait(SubnetIds=[subnet_id, ])
+
+    logging.info("subnet created")
+
+    logging.info("adding tags to newly created subnet")
+    ec2client.create_tags(
+        Resources=[subnet_id, ],
+        Tags=[{
+            "Key": "Task_name",
+            'Value': args.task_name
+        }])
+    return subnet_id
+
+
+def run_instances(image_id, instance_type, count=1, role="MASTER", cmd=""):
+    response = ec2client.run_instances(
+        ImageId=image_id,
+        InstanceType=instance_type,
+        MaxCount=count,
+        MinCount=count,
+        UserData=cmd,
+        DryRun=False,
+        InstanceInitiatedShutdownBehavior="stop",
+        KeyName=args.key_name,
+        Placement={'AvailabilityZone': args.availability_zone},
+        NetworkInterfaces=[{
+            'DeviceIndex': 0,
+            'SubnetId': args.subnet_id,
+            "AssociatePublicIpAddress": True,
+            'Groups': args.security_group_ids
+        }],
+        TagSpecifications=[{
+            'ResourceType': "instance",
+            'Tags': [{
+                "Key": 'Task_name',
+                "Value": args.task_name + "_master"
+            }, {
+                "Key": 'Role',
+                "Value": role
+            }]
+        }])
+
+    instance_ids = []
+    for instance in response["Instances"]:
+        instance_ids.append(instance["InstanceId"])
+
+    if len(instance_ids) > 0:
+        logging.info(str(len(instance_ids)) + " instance(s) created")
+    else:
+        logging.info("no instance created")
+    #create waiter to make sure it's running
+
+    logging.info("waiting for instance to become accessible")
+    waiter = ec2client.get_waiter('instance_status_ok')
+    waiter.wait(
+        Filters=[{
+            "Name": "instance-status.status",
+            "Values": ["ok"]
+        }, {
+            "Name": "instance-status.reachability",
+            "Values": ["passed"]
+        }, {
+            "Name": "instance-state-name",
+            "Values": ["running"]
+        }],
+        InstanceIds=instance_ids)
+
+    instances_response = ec2client.describe_instances(InstanceIds=instance_ids)
+
+    return instances_response["Reservations"][0]["Instances"]
+
+
+def generate_task_name():
+    return namesgenerator.get_random_name()
+
+
+def init_args():
+
+    if not args.task_name:
+        args.task_name = generate_task_name()
+        logging.info("task name generated %s" % (args.task_name))
+
+    if not args.pem_path:
+        args.pem_path = os.path.expanduser("~") + "/" + args.key_name + ".pem"
+    if args.security_group_id:
+        args.security_group_ids = (args.security_group_id, )
+
+
+def create():
+
+    init_args()
+
+    # create subnet
+    if not args.subnet_id:
+        args.subnet_id = create_subnet()
+
+    # create master node
+
+    master_instance_response = run_instances(
+        image_id="ami-7a05351f", instance_type="t2.nano")
+
+    logging.info("master server started")
+
+    args.master_server_public_ip = master_instance_response[0][
+        "PublicIpAddress"]
+    args.master_server_ip = master_instance_response[0]["PrivateIpAddress"]
+
+    logging.info("master server started, master_ip=%s, task_name=%s" %
+                 (args.master_server_public_ip, args.task_name))
+
+    # cp config file and pems to master node
+
+    ssh_key = paramiko.RSAKey.from_private_key_file(args.pem_path)
+    ssh_client = paramiko.SSHClient()
+    ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+    ssh_client.connect(
+        hostname=args.master_server_public_ip, username="ubuntu", pkey=ssh_key)
+
+    with SCPClient(ssh_client.get_transport()) as scp:
+        scp.put(os.path.expanduser("~") + "/" + ".aws",
+                recursive=True,
+                remote_path='/home/ubuntu/')
+        scp.put(args.pem_path,
+                remote_path='/home/ubuntu/' + args.key_name + ".pem")
+
+    logging.info("credentials and pem copied to master")
+
+    # set arguments and start docker
+    kick_off_cmd = "docker run -d -v /home/ubuntu/.aws:/root/.aws/"
+    kick_off_cmd += " -v /home/ubuntu/" + args.key_name + ".pem:/root/" + args.key_name + ".pem"
+    kick_off_cmd += " -v /home/ubuntu/logs/:/root/logs/"
+    kick_off_cmd += " -p " + str(args.master_server_port) + ":" + str(
+        args.master_server_port)
+    kick_off_cmd += " " + args.master_docker_image
+
+    args_to_pass = copy.copy(args)
+    args_to_pass.action = "serve"
+    del args_to_pass.pem_path
+    del args_to_pass.security_group_ids
+    del args_to_pass.master_docker_image
+    del args_to_pass.master_server_public_ip
+    for arg, value in sorted(vars(args_to_pass).iteritems()):
+        if value:
+            kick_off_cmd += ' --%s %s' % (arg, value)
+
+    logging.info(kick_off_cmd)
+    stdin, stdout, stderr = ssh_client.exec_command(command=kick_off_cmd)
+    return_code = stdout.channel.recv_exit_status()
+    logging.info(return_code)
+    if return_code != 0:
+        raise Exception("Error while kicking off master")
+
+    logging.info(
+        "master server finished init process, visit %s to check master log" %
+        (get_master_web_url("/status")))
+
+
+def cleanup():
+    print requests.post(get_master_web_url("/cleanup")).text
+
+
+def status():
+    print requests.post(get_master_web_url("/status")).text
+
+
+def get_master_web_url(path):
+    return "http://" + args.master_server_public_ip + ":" + str(
+        args.master_server_port) + path
+
+
+if __name__ == "__main__":
+    print_arguments()
+    if args.action == "create":
+        if not args.key_name or not args.security_group_id:
+            raise ValueError("key_name and security_group_id are required")
+        create()
+    elif args.action == "cleanup":
+        if not args.master_server_public_ip:
+            raise ValueError("master_server_public_ip is required")
+        cleanup()
+    elif args.action == "status":
+        if not args.master_server_public_ip:
+            raise ValueError("master_server_public_ip is required")
+        status()
diff --git a/tools/aws_benchmarking/client/requirements.txt b/tools/aws_benchmarking/client/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..9454801f2025671cfd1a2c3b71cf4c2ac07cb8fb
--- /dev/null
+++ b/tools/aws_benchmarking/client/requirements.txt
@@ -0,0 +1,6 @@
+netaddr==0.7.19
+boto3==1.6.21
+namesgenerator==0.3
+paramiko==2.4.1
+scp
+requests
diff --git a/tools/aws_benchmarking/diagram.png b/tools/aws_benchmarking/diagram.png
new file mode 100644
index 0000000000000000000000000000000000000000..b97909c5fe78b59d0e636ff73c2ed3e63a0be722
Binary files /dev/null and b/tools/aws_benchmarking/diagram.png differ
diff --git a/tools/aws_benchmarking/server/Dockerfile b/tools/aws_benchmarking/server/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..333523abcdb6fbe7dc01bbaf7d32ce1d8e866028
--- /dev/null
+++ b/tools/aws_benchmarking/server/Dockerfile
@@ -0,0 +1,7 @@
+FROM python:2.7.14-stretch
+
+ENV HOME /root
+COPY ./ /root/
+WORKDIR /root
+RUN pip install -r /root/requirements.txt
+ENTRYPOINT ["python", "cluster_master.py"]
\ No newline at end of file
diff --git a/tools/aws_benchmarking/server/cluster_master.py b/tools/aws_benchmarking/server/cluster_master.py
new file mode 100644
index 0000000000000000000000000000000000000000..21f85a5fc43e951897eb6b785367630abda722c0
--- /dev/null
+++ b/tools/aws_benchmarking/server/cluster_master.py
@@ -0,0 +1,673 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import os
+import json
+import math
+import time
+import threading
+import logging
+
+import netaddr
+import boto3
+import namesgenerator
+import paramiko
+
+from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer
+
+
+# You must have aws_access_key_id, aws_secret_access_key, region set in
+# ~/.aws/credentials and ~/.aws/config
+def str2bool(v):
+    if v.lower() in ('yes', 'true', 't', 'y', '1'):
+        return True
+    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
+        return False
+    else:
+        raise argparse.ArgumentTypeError('Boolean value expected.')
+
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    '--key_name', type=str, default="", help="required, key pair name")
+parser.add_argument(
+    '--security_group_id',
+    type=str,
+    default="",
+    help="required, the security group id associated with your VPC")
+
+parser.add_argument(
+    '--vpc_id',
+    type=str,
+    default="",
+    help="The VPC in which you wish to run test")
+parser.add_argument(
+    '--subnet_id',
+    type=str,
+    default="",
+    help="The Subnet_id in which you wish to run test")
+
+parser.add_argument(
+    '--pserver_instance_type',
+    type=str,
+    default="c5.2xlarge",
+    help="your pserver instance type, c5.2xlarge by default")
+parser.add_argument(
+    '--trainer_instance_type',
+    type=str,
+    default="p2.8xlarge",
+    help="your trainer instance type, p2.8xlarge by default")
+
+parser.add_argument(
+    '--task_name',
+    type=str,
+    default="",
+    help="the name you want to identify your job")
+parser.add_argument(
+    '--pserver_image_id',
+    type=str,
+    default="ami-da2c1cbf",
+    help="ami id for system image, default one has nvidia-docker ready, use ami-1ae93962 for us-east-2"
+)
+parser.add_argument(
+    '--trainer_image_id',
+    type=str,
+    default="ami-da2c1cbf",
+    help="ami id for system image, default one has nvidia-docker ready, use ami-1ae93962 for us-west-2"
+)
+
+parser.add_argument(
+    '--availability_zone',
+    type=str,
+    default="us-east-2a",
+    help="aws zone id to place ec2 instances")
+
+parser.add_argument(
+    '--trainer_count', type=int, default=1, help="Trainer count")
+
+parser.add_argument(
+    '--pserver_count', type=int, default=1, help="Pserver count")
+
+parser.add_argument(
+    '--pserver_bash_file',
+    type=str,
+    default=os.path.join(os.path.dirname(__file__), "pserver.sh.template"),
+    help="pserver bash file path")
+
+parser.add_argument(
+    '--pserver_command', type=str, default="", help="pserver start command")
+
+parser.add_argument(
+    '--trainer_bash_file',
+    type=str,
+    default=os.path.join(os.path.dirname(__file__), "trainer.sh.template"),
+    help="trainer bash file path")
+
+parser.add_argument(
+    '--trainer_command', type=str, default="", help="trainer start command")
+
+parser.add_argument(
+    '--action', type=str, default="serve", help="create|cleanup|serve")
+
+parser.add_argument('--pem_path', type=str, help="private key file")
+
+parser.add_argument(
+    '--pserver_port', type=str, default="5436", help="pserver port")
+
+parser.add_argument(
+    '--docker_image', type=str, default="busybox", help="training docker image")
+
+parser.add_argument(
+    '--master_server_port', type=int, default=5436, help="master server port")
+
+parser.add_argument(
+    '--master_server_ip', type=str, default="", help="master server private ip")
+
+parser.add_argument(
+    '--no_clean_up',
+    type=str2bool,
+    default=False,
+    help="whether to clean up after training")
+
+args = parser.parse_args()
+
+ec2client = boto3.client('ec2')
+
+args.log_path = os.path.join(os.path.dirname(__file__), "logs/")
+
+logging.basicConfig(
+    filename=args.log_path + 'master.log',
+    level=logging.INFO,
+    format='%(asctime)s %(message)s')
+
+log_files = ["master.log"]
+
+
+def create_subnet():
+    # if no vpc id provided, list vpcs
+    logging.info("start creating subnet")
+    if not args.vpc_id:
+        logging.info("no vpc provided, trying to find the default one")
+        vpcs_desc = ec2client.describe_vpcs(
+            Filters=[{
+                "Name": "isDefault",
+                "Values": ["true", ]
+            }], )
+        if len(vpcs_desc["Vpcs"]) == 0:
+            raise ValueError('No default VPC')
+        args.vpc_id = vpcs_desc["Vpcs"][0]["VpcId"]
+        vpc_cidrBlock = vpcs_desc["Vpcs"][0]["CidrBlock"]
+
+        logging.info("default vpc fount with id %s and CidrBlock %s" %
+                     (args.vpc_id, vpc_cidrBlock))
+
+    if not vpc_cidrBlock:
+        logging.info("trying to find cidrblock for vpc")
+        vpcs_desc = ec2client.describe_vpcs(
+            Filters=[{
+                "Name": "vpc-id",
+                "Values": [args.vpc_id, ],
+            }], )
+        if len(vpcs_desc["Vpcs"]) == 0:
+            raise ValueError('No VPC found')
+        vpc_cidrBlock = vpcs_desc["Vpcs"][0]["CidrBlock"]
+        logging.info("cidrblock for vpc is %s" % vpc_cidrBlock)
+
+    # list subnets in vpc in order to create a new one
+
+    logging.info("trying to find ip blocks for new subnet")
+    subnets_desc = ec2client.describe_subnets(
+        Filters=[{
+            "Name": "vpc-id",
+            "Values": [args.vpc_id, ],
+        }], )
+
+    ips_taken = []
+    for subnet_dec in subnets_desc["Subnets"]:
+        ips_taken.append(subnet_dec["CidrBlock"])
+
+    ip_blocks_avaliable = netaddr.IPSet(
+        [vpc_cidrBlock]) ^ netaddr.IPSet(ips_taken)
+    # adding 10 addresses as buffer
+    cidr_prefix = 32 - math.ceil(
+        math.log(args.pserver_count + args.trainer_count + 10, 2))
+    if cidr_prefix <= 16:
+        raise ValueError('Too many nodes to fit in current VPC')
+
+    for ipnetwork in ip_blocks_avaliable.iter_cidrs():
+        try:
+            subnet_cidr = ipnetwork.subnet(int(cidr_prefix)).next()
+            logging.info("subnet ip block found %s" % (subnet_cidr))
+            break
+        except Exception:
+            pass
+
+    if not subnet_cidr:
+        raise ValueError(
+            'No avaliable subnet to fit required nodes in current VPC')
+
+    logging.info("trying to create subnet")
+    subnet_desc = ec2client.create_subnet(
+        CidrBlock=str(subnet_cidr),
+        VpcId=args.vpc_id,
+        AvailabilityZone=args.availability_zone)
+
+    subnet_id = subnet_desc["Subnet"]["SubnetId"]
+
+    subnet_waiter = ec2client.get_waiter('subnet_available')
+    # sleep for 1s before checking its state
+    time.sleep(1)
+    subnet_waiter.wait(SubnetIds=[subnet_id, ])
+
+    logging.info("subnet created")
+
+    logging.info("adding tags to newly created subnet")
+    ec2client.create_tags(
+        Resources=[subnet_id, ],
+        Tags=[{
+            "Key": "Task_name",
+            'Value': args.task_name
+        }])
+    return subnet_id
+
+
+def generate_task_name():
+    return namesgenerator.get_random_name()
+
+
+def script_to_str(file_path):
+    if not file_path:
+        return "echo $PSERVER_HOSTS"
+    file = open(file_path, 'r')
+    text = file.read().strip()
+    file.close()
+    return text
+
+
+def run_instances(image_id, instance_type, count, role, cmd=""):
+    response = ec2client.run_instances(
+        ImageId=image_id,
+        InstanceType=instance_type,
+        MaxCount=count,
+        MinCount=count,
+        UserData=cmd,
+        DryRun=False,
+        InstanceInitiatedShutdownBehavior="stop",
+        KeyName=args.key_name,
+        Placement={'AvailabilityZone': args.availability_zone},
+        NetworkInterfaces=[{
+            'DeviceIndex': 0,
+            'SubnetId': args.subnet_id,
+            "AssociatePublicIpAddress": True,
+            'Groups': args.security_group_ids
+        }],
+        TagSpecifications=[{
+            'ResourceType': "instance",
+            'Tags': [{
+                "Key": 'Task_name',
+                "Value": args.task_name
+            }, {
+                "Key": 'Role',
+                "Value": role
+            }]
+        }])
+
+    instance_ids = []
+    for instance in response["Instances"]:
+        instance_ids.append(instance["InstanceId"])
+
+    if len(instance_ids) > 0:
+        logging.info(str(len(instance_ids)) + " instance(s) created")
+    else:
+        logging.info("no instance created")
+    #create waiter to make sure it's running
+
+    logging.info("waiting for instance to become accessible")
+    waiter = ec2client.get_waiter('instance_status_ok')
+    waiter.wait(
+        Filters=[{
+            "Name": "instance-status.status",
+            "Values": ["ok"]
+        }, {
+            "Name": "instance-status.reachability",
+            "Values": ["passed"]
+        }, {
+            "Name": "instance-state-name",
+            "Values": ["running"]
+        }],
+        InstanceIds=instance_ids)
+
+    instances_response = ec2client.describe_instances(InstanceIds=instance_ids)
+
+    return instances_response["Reservations"][0]["Instances"]
+
+
+def create_pservers():
+    try:
+        return run_instances(
+            image_id=args.pserver_image_id,
+            instance_type=args.pserver_instance_type,
+            count=args.pserver_count,
+            role="PSERVER", )
+    except Exception:
+        logging.exception("error while trying to create pservers")
+        cleanup(args.task_name)
+
+
+def log_to_file(source, filename):
+    if not filename in log_files:
+        log_files.append(filename)
+    with open(args.log_path + filename, "a") as log_file:
+        for line in iter(source.readline, ""):
+            log_file.write(line)
+
+
+def create_trainers(kickoff_cmd, pserver_endpoints_str):
+    def create_and_start_trainer(trainer_index):
+        logging.info("trainer " + str(trainer_index) + " is starting")
+
+        instance_response = run_instances(
+            image_id=args.trainer_image_id,
+            instance_type=args.trainer_instance_type,
+            count=1,
+            role="TRAINER", )[0]
+        trainer_ip = instance_response["PrivateIpAddress"]
+
+        logging.info("trainer " + str(trainer_index) + " started")
+
+        ssh_key = paramiko.RSAKey.from_private_key_file(args.pem_path)
+        ssh_client = paramiko.SSHClient()
+        ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+        ssh_client.connect(hostname=trainer_ip, username="ubuntu", pkey=ssh_key)
+
+        logging.info("trainer " + str(trainer_index) +
+                     " terminal connected via ssh")
+
+        cmd = kickoff_cmd.format(
+            PSERVER_HOSTS=pserver_endpoints_str,
+            DOCKER_IMAGE=args.docker_image,
+            TRAINER_INDEX=str(trainer_index),
+            TASK_NAME=args.task_name,
+            TRAINER_COUNT=args.trainer_count,
+            COMMAND=args.trainer_command,
+            MASTER_ENDPOINT=args.master_server_ip + ":" +
+            str(args.master_server_port))
+        logging.info(cmd)
+
+        stdin, stdout, stderr = ssh_client.exec_command(command=cmd)
+
+        # read and save output log
+
+        logging.info("trainer " + str(trainer_index) +
+                     " command executed, keep fetching log")
+
+        stdout_thread = threading.Thread(
+            target=log_to_file,
+            args=(
+                stdout,
+                "trainer_" + str(trainer_index) + ".log", ))
+        stderr_thread = threading.Thread(
+            target=log_to_file,
+            args=(
+                stderr,
+                "trainer_" + str(trainer_index) + "_err.log", ))
+        stdout_thread.start()
+        stderr_thread.start()
+
+        stdout_thread.join()
+        stderr_thread.join()
+
+        return_code = stdout.channel.recv_exit_status()
+        if return_code != 0:
+            trainer_create_results[trainer_index] = {'has_error': True}
+            raise ValueError("trainer didn't finish with exit code 0")
+
+        ssh_client.close()
+
+    # multi thread starting trainer instance and run kickoff command
+
+    trainer_threads = []
+    trainer_create_results = {}
+    try:
+        for i in xrange(args.trainer_count):
+            logging.info("starting tread for trainer " + str(i))
+            trainer_thread = threading.Thread(
+                target=create_and_start_trainer, args=(i, ))
+            trainer_thread.start()
+            trainer_threads.append(trainer_thread)
+
+        for trainer_thread in trainer_threads:
+            trainer_thread.join()
+
+        for result in trainer_create_results:
+            if result["has_error"]:
+                logging.error(
+                    "error during trainer starting or training, destorying the while cluster "
+                )
+                cleanup(args.task_name)
+                break
+
+        logging.info("all trainers stopped")
+    except Exception, e:
+        logging.info(
+            "Training exception, clean up resources, please check log for more info"
+        )
+    finally:
+        cleanup(args.task_name)
+
+
+def cleanup(task_name):
+    if args.no_clean_up:
+        logging.info("no clean up option set, going to leave the setup running")
+        return
+    #shutdown all ec2 instances
+    print("going to clean up " + task_name + " instances")
+    instances_response = ec2client.describe_instances(Filters=[{
+        "Name": "tag:Task_name",
+        "Values": [task_name]
+    }])
+
+    instance_ids = []
+    if len(instances_response["Reservations"]) > 0:
+        for reservation in instances_response["Reservations"]:
+            for instance in reservation["Instances"]:
+                instance_ids.append(instance["InstanceId"])
+
+        ec2client.terminate_instances(InstanceIds=instance_ids)
+
+        instance_termination_waiter = ec2client.get_waiter(
+            'instance_terminated')
+        instance_termination_waiter.wait(InstanceIds=instance_ids)
+
+    #delete the subnet created
+
+    subnet = ec2client.describe_subnets(Filters=[{
+        "Name": "tag:Task_name",
+        "Values": [task_name]
+    }])
+
+    if len(subnet["Subnets"]) > 0:
+        ec2client.delete_subnet(SubnetId=subnet["Subnets"][0]["SubnetId"])
+    # no subnet delete waiter, just leave it.
+    logging.info("Clearnup done")
+    return
+
+
+def kickoff_pserver(host, pserver_endpoints_str):
+    try:
+        ssh_key = paramiko.RSAKey.from_private_key_file(args.pem_path)
+        ssh_client = paramiko.SSHClient()
+        ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+        ssh_client.connect(hostname=host, username="ubuntu", pkey=ssh_key)
+        cmd = (script_to_str(args.pserver_bash_file)).format(
+            PSERVER_HOSTS=pserver_endpoints_str,
+            DOCKER_IMAGE=args.docker_image,
+            PSERVER_PORT=args.pserver_port,
+            TASK_NAME=args.task_name,
+            COMMAND=args.pserver_command,
+            TRAINER_COUNT=args.trainer_count,
+            TRAINER_INDEX=0,
+            # there is no way to use 0.0.0.0:port to start pserver
+            # has to docker --network="host" with host ip to make this work
+            SERVER_ENDPOINT=host + ":" + str(args.pserver_port),
+            MASTER_ENDPOINT=args.master_server_ip + ":" +
+            str(args.master_server_port))
+        logging.info(cmd)
+        stdin, stdout, stderr = ssh_client.exec_command(command=cmd)
+
+        stdout_thread = threading.Thread(
+            target=log_to_file, args=(
+                stdout,
+                "pserver_" + host + ".log", ))
+        stderr_thread = threading.Thread(
+            target=log_to_file, args=(
+                stderr,
+                "pserver_" + host + "_err.log", ))
+        stdout_thread.start()
+        stderr_thread.start()
+
+        stdout_thread.join()
+        stderr_thread.join()
+
+        return_code = stdout.channel.recv_exit_status()
+        logging.info(return_code)
+        if return_code != 0:
+            raise Exception("Error while kicking off pserver training process")
+    except Exception:
+        logging.exception("Error while kicking off pserver training process")
+        cleanup(args.task_name)
+    finally:
+        ssh_client.close()
+
+
+def init_args():
+
+    if not args.task_name:
+        args.task_name = generate_task_name()
+        logging.info("task name generated %s" % (args.task_name))
+
+    if not args.pem_path:
+        args.pem_path = os.path.expanduser("~") + "/" + args.key_name + ".pem"
+    if args.security_group_id:
+        args.security_group_ids = (args.security_group_id, )
+
+    args.trainers_job_done_count = 0
+
+
+def create_cluster():
+
+    if not args.subnet_id:
+        logging.info("creating subnet for this task")
+        args.subnet_id = create_subnet()
+        logging.info("subnet %s created" % (args.subnet_id))
+
+    logging.info("creating pservers")
+    pserver_create_response = create_pservers()
+    logging.info("pserver created, collecting pserver ips")
+
+    pserver_endpoints = []
+    for pserver in pserver_create_response:
+        pserver_endpoints.append(pserver["NetworkInterfaces"][0][
+            "PrivateIpAddress"] + ":" + args.pserver_port)
+
+    pserver_endpoints_str = ",".join(pserver_endpoints)
+
+    logging.info("kicking off pserver training process")
+    pserver_threads = []
+    for pserver in pserver_create_response:
+        pserver_thread = threading.Thread(
+            target=kickoff_pserver,
+            args=(pserver["PrivateIpAddress"], pserver_endpoints_str))
+        pserver_thread.start()
+        pserver_threads.append(pserver_thread)
+
+    logging.info("all pserver training process started")
+
+    logging.info("creating trainers and kicking off trainer training process")
+    create_trainers(
+        kickoff_cmd=script_to_str(args.trainer_bash_file),
+        pserver_endpoints_str=pserver_endpoints_str)
+
+    for pserver_thread in pserver_threads:
+        pserver_thread.join()
+
+    logging.info("all process ended")
+
+
+def start_server(args):
+    class S(BaseHTTPRequestHandler):
+        def _set_headers(self):
+            self.send_response(200)
+            self.send_header('Content-type', 'text/text')
+            self.end_headers()
+
+        def do_HEAD(self):
+            self._set_headers()
+
+        def do_404(self):
+            self.send_response(404)
+            self.send_header('Content-type', 'text/text')
+            self.end_headers()
+            logging.info("Received invalid GET request" + self.path)
+            self.wfile.write("NO ACTION FOUND")
+
+        def do_GET(self):
+
+            request_path = self.path
+            if request_path == "/status" or request_path == "/master_logs":
+                self._set_headers()
+                logging.info("Received request to return status")
+                with open(args.log_path + "master.log", "r") as logfile:
+                    self.wfile.write(logfile.read().strip())
+            elif request_path == "/list_logs" or request_path == "/logs":
+                self._set_headers()
+                self.wfile.write("\n".join(log_files))
+            elif "/log/" in request_path:
+                self._set_headers()
+                log_file_path = request_path.replace("/log/", "")
+                logging.info("requesting log file path is" + args.log_path +
+                             log_file_path)
+                with open(args.log_path + log_file_path, "r") as logfile:
+                    self.wfile.write(logfile.read().strip())
+            else:
+                self.do_404()
+
+        def do_POST(self):
+
+            request_path = self.path
+
+            if request_path == "/save_data":
+                self._set_headers()
+                logging.info("Received request to save data")
+                self.wfile.write("DATA SAVED!")
+                content_length = int(self.headers['Content-Length'])
+                post_data = self.rfile.read(content_length)
+                if args.task_name:
+                    with open(args.task_name + ".txt", "a") as text_file:
+                        text_file.write(post_data + "\n")
+
+            elif request_path == "/cleanup":
+                self._set_headers()
+                logging.info("Received request to cleanup cluster")
+                cleanup(args.task_name)
+                self.wfile.write("cleanup in progress")
+
+            else:
+                self.do_404()
+
+    server_address = ('', args.master_server_port)
+    httpd = HTTPServer(server_address, S)
+    logging.info("HTTP server is starting")
+    httpd.serve_forever()
+
+
+def print_arguments():
+    logging.info('-----------  Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        logging.info('%s: %s' % (arg, value))
+    logging.info('------------------------------------------------')
+
+
+if __name__ == "__main__":
+    print_arguments()
+    if args.action == "create":
+        logging.info("going to create cluster")
+        if not args.key_name or not args.security_group_id:
+            raise ValueError("key_name and security_group_id are required")
+        init_args()
+        create_cluster()
+    elif args.action == "cleanup":
+        logging.info("going to cleanup cluster")
+        if not args.task_name:
+            raise ValueError("task_name is required")
+        cleanup(args.task_name)
+    elif args.action == "serve":
+        # serve mode
+        if not args.master_server_ip:
+            raise ValueError(
+                "No master server ip set, please run with --action create")
+
+        logging.info("going to start serve and create cluster")
+
+        init_args()
+
+        logging.info("starting server in another thread")
+        server_thread = threading.Thread(target=start_server, args=(args, ))
+        server_thread.start()
+
+        create_cluster()
+        server_thread.join()
+    elif args.action == "test":
+        start_server(args)
diff --git a/tools/aws_benchmarking/server/logs/master.log b/tools/aws_benchmarking/server/logs/master.log
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tools/aws_benchmarking/server/pserver.sh.template b/tools/aws_benchmarking/server/pserver.sh.template
new file mode 100644
index 0000000000000000000000000000000000000000..2612856d1e6273fe2642f82e8c616eb9ff24f8a4
--- /dev/null
+++ b/tools/aws_benchmarking/server/pserver.sh.template
@@ -0,0 +1,2 @@
+#!/bin/bash
+docker run --network="host" -i -e "SERVER_ENDPOINT={SERVER_ENDPOINT}" -e "MASTER_ENDPOINT={MASTER_ENDPOINT}" -e "TASK_NAME={TASK_NAME}" -e "TRAINER_INDEX={TRAINER_INDEX}" -e "TRAINING_ROLE=PSERVER" -e "TRAINER_COUNT={TRAINER_COUNT}" -e "TRAINERS={TRAINER_COUNT}" -e "PSERVER_HOSTS={PSERVER_HOSTS}" -e "PSERVERS={PSERVER_HOSTS}" {DOCKER_IMAGE} {COMMAND} --device CPU
\ No newline at end of file
diff --git a/tools/aws_benchmarking/server/requirements.txt b/tools/aws_benchmarking/server/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5c523854f28b0a6f024fba2b2f344b53ba967a2f
--- /dev/null
+++ b/tools/aws_benchmarking/server/requirements.txt
@@ -0,0 +1,4 @@
+netaddr==0.7.19
+boto3==1.6.21
+namesgenerator==0.3
+paramiko==2.4.1
diff --git a/tools/aws_benchmarking/server/trainer.sh.template b/tools/aws_benchmarking/server/trainer.sh.template
new file mode 100644
index 0000000000000000000000000000000000000000..a4b2876b08cdf05e90e50589f897d74ca5f90443
--- /dev/null
+++ b/tools/aws_benchmarking/server/trainer.sh.template
@@ -0,0 +1,2 @@
+#!/bin/bash 
+nvidia-docker run --network="host" -i  -e "MASTER_ENDPOINT={MASTER_ENDPOINT}" -e "TASK_NAME={TASK_NAME}" -e "TRAINER_COUNT={TRAINER_COUNT}" -e "TRAINERS={TRAINER_COUNT}" -e "TRAINER_INDEX={TRAINER_INDEX}"  -e "PADDLE_INIT_TRAINER_ID={TRAINER_INDEX}" -e "TRAINING_ROLE=TRAINER"  -e "PSERVER_HOSTS={PSERVER_HOSTS}"  -e "PSERVERS={PSERVER_HOSTS}" {DOCKER_IMAGE} {COMMAND} --device GPU
\ No newline at end of file
diff --git a/tools/manylinux1/Dockerfile.android b/tools/manylinux1/Dockerfile.android
index b6cae228a0c45ab70ba8ecc80ae4df7e0fa5bdbc..7eb040902b0f8f3cc9f7a31ec9f96467de654c3e 100644
--- a/tools/manylinux1/Dockerfile.android
+++ b/tools/manylinux1/Dockerfile.android
@@ -37,7 +37,7 @@ RUN git config --global credential.helper store
 # Fix locales to en_US.UTF-8
 RUN localedef -i en_US -f UTF-8 en_US.UTF-8
 
-RUN pip install --upgrade pip && \
+RUN pip install --upgrade pip==9.0.3 && \
     pip install -U 'protobuf==3.1.0' && \
     pip install -U wheel sphinx && \
     pip install pre-commit