diff --git a/CMakeLists.txt b/CMakeLists.txt
index efa68c9ba243af3c7cdca52b915cc14d307ae89f..1594e798a2ba3f735a28a43ef933d80b3b3f8964 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -54,7 +54,7 @@ option(WITH_PYTHON      "Compile PaddlePaddle with python interpreter"  ON)
 option(WITH_DOUBLE      "Compile PaddlePaddle with double precision"    OFF)
 option(WITH_RDMA        "Compile PaddlePaddle with RDMA support"        OFF)
 option(WITH_TIMER       "Compile PaddlePaddle with stats timer"         OFF)
-option(WITH_PROFILER    "Compile PaddlePaddle with GPU profiler"        OFF)
+option(WITH_PROFILER    "Compile PaddlePaddle with GPU profiler and gperftools"        OFF)
 option(WITH_DOC         "Compile PaddlePaddle with documentation"       OFF)
 option(WITH_COVERAGE    "Compile PaddlePaddle with code coverage"       OFF)
 option(COVERALLS_UPLOAD "Package code coverage data to coveralls"       OFF)
@@ -254,6 +254,12 @@ elseif()
     set(WITH_ANAKIN OFF CACHE STRING "Anakin is used in MKL only now." FORCE)
 endif()
 
+if (WITH_PROFILER)
+    find_package(Gperftools REQUIRED)
+    include_directories(${GPERFTOOLS_INCLUDE_DIR})
+    add_definitions(-DWITH_GPERFTOOLS)
+endif()
+
 include(generic)            # simplify cmake module
 include(package)            # set paddle packages
 include(ccache)             # set ccache for compilation
diff --git a/README.md b/README.md
index 56d6c10c642787836abb55cb2974bda0b8d22da4..c535e9514e1cac9aff51edfcd9bcdc5d34ccd9fd 100644
--- a/README.md
+++ b/README.md
@@ -2,8 +2,8 @@
 
 
 [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
-[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.1/getstarted/index_en.html)
-[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.1/beginners_guide/index.html)
+[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html)
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html)
 [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
 [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
 
@@ -19,7 +19,7 @@ Our vision is to enable deep learning for everyone via PaddlePaddle.
 Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle.
 
 
-### Latest PaddlePaddle Release: [Fluid 1.1.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.1)
+### Latest PaddlePaddle Release: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2)
 ### Install Latest Stable Release:
 ```
 # Linux CPU
@@ -27,9 +27,9 @@ pip install paddlepaddle
 # Linux GPU cuda9cudnn7
 pip install paddlepaddle-gpu
 # Linux GPU cuda8cudnn7
-pip install paddlepaddle-gpu==1.1.0.post87
+pip install paddlepaddle-gpu==1.2.0.post87
 # Linux GPU cuda8cudnn5
-pip install paddlepaddle-gpu==1.1.0.post85
+pip install paddlepaddle-gpu==1.2.0.post85
 
 # For installation on other platform, refer to http://paddlepaddle.org/
 ```
@@ -76,26 +76,26 @@ pip install paddlepaddle-gpu==1.1.0.post85
 
 ## Installation
 
-It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.1/beginners_guide/index.html) on our website.
+It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html) on our website.
 
 ## Documentation
 
-We provide [English](http://paddlepaddle.org/documentation/docs/en/1.1/getstarted/index_en.html) and
-[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.1/beginners_guide/index.html) documentation.
+We provide [English](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html) and
+[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html) documentation.
 
 - [Deep Learning 101](https://github.com/PaddlePaddle/book)
 
   You might want to start from this online interactive book that can run in a Jupyter Notebook.
 
-- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/1.1/user_guides/howto/training/cluster_howto.html)
+- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html)
 
   You can run distributed training jobs on MPI clusters.
 
-- [Python API](http://paddlepaddle.org/documentation/api/zh/1.1/fluid.html)
+- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.2/api_cn/index_cn.html)
 
    Our new API enables much shorter programs.
 
-- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/1.1/advanced_usage/development/contribute_to_paddle.html)
+- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/1.2/advanced_usage/development/contribute_to_paddle/index_cn.html)
 
    We appreciate your contributions!
 
diff --git a/cmake/FindGperftools.cmake b/cmake/FindGperftools.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..928f573a4fb82391859e334d50e6c8ed0e26aae2
--- /dev/null
+++ b/cmake/FindGperftools.cmake
@@ -0,0 +1,63 @@
+# Tries to find Gperftools.
+#
+# Usage of this module as follows:
+#
+#     find_package(Gperftools)
+#
+# Variables used by this module, they can change the default behaviour and need
+# to be set before calling find_package:
+#
+#  Gperftools_ROOT_DIR  Set this variable to the root installation of
+#                       Gperftools if the module has problems finding
+#                       the proper installation path.
+#
+# Variables defined by this module:
+#
+#  GPERFTOOLS_FOUND              System has Gperftools libs/headers
+#  GPERFTOOLS_LIBRARIES          The Gperftools libraries (tcmalloc & profiler)
+#  GPERFTOOLS_INCLUDE_DIR        The location of Gperftools headers
+
+find_library(GPERFTOOLS_TCMALLOC
+  NAMES tcmalloc
+  HINTS ${Gperftools_ROOT_DIR}/lib)
+
+find_library(GPERFTOOLS_PROFILER
+  NAMES profiler
+  HINTS ${Gperftools_ROOT_DIR}/lib)
+
+find_library(GPERFTOOLS_TCMALLOC_AND_PROFILER
+  NAMES tcmalloc_and_profiler
+  HINTS ${Gperftools_ROOT_DIR}/lib)
+
+find_path(GPERFTOOLS_INCLUDE_DIR
+  NAMES gperftools/heap-profiler.h
+  HINTS ${Gperftools_ROOT_DIR}/include)
+
+set(GPERFTOOLS_LIBRARIES ${GPERFTOOLS_TCMALLOC_AND_PROFILER})
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(
+  Gperftools
+  DEFAULT_MSG
+  GPERFTOOLS_LIBRARIES
+  GPERFTOOLS_INCLUDE_DIR)
+
+mark_as_advanced(
+  Gperftools_ROOT_DIR
+  GPERFTOOLS_TCMALLOC
+  GPERFTOOLS_PROFILER
+  GPERFTOOLS_TCMALLOC_AND_PROFILER
+  GPERFTOOLS_LIBRARIES
+  GPERFTOOLS_INCLUDE_DIR)
+
+# create IMPORTED targets
+if (Gperftools_FOUND AND NOT TARGET gperftools::tcmalloc)
+  add_library(gperftools::tcmalloc UNKNOWN IMPORTED)
+  set_target_properties(gperftools::tcmalloc PROPERTIES
+    IMPORTED_LOCATION ${GPERFTOOLS_TCMALLOC}
+    INTERFACE_INCLUDE_DIRECTORIES "${GPERFTOOLS_INCLUDE_DIR}")
+  add_library(gperftools::profiler UNKNOWN IMPORTED)
+  set_target_properties(gperftools::profiler PROPERTIES
+    IMPORTED_LOCATION ${GPERFTOOLS_PROFILER}
+    INTERFACE_INCLUDE_DIRECTORIES "${GPERFTOOLS_INCLUDE_DIR}")
+endif()
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index 4e17ddee73958106d5e2c8c8ea5661acc758518a..51f7a61631d7102b60646abe1c6dd7775692f157 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -86,6 +86,7 @@ endif(NOT WITH_GOLANG)
 
 if(WITH_GPU)
     add_definitions(-DPADDLE_WITH_CUDA)
+    add_definitions(-DEIGEN_USE_GPU)
 
     FIND_PACKAGE(CUDA REQUIRED)
 
diff --git a/cmake/external/gzstream.cmake b/cmake/external/gzstream.cmake
index 59d8e932459dd49017cb32b27e5f1919272fe387..3e36ef7ae205bbf85f345d55456309cc05a58fbd 100644
--- a/cmake/external/gzstream.cmake
+++ b/cmake/external/gzstream.cmake
@@ -27,13 +27,14 @@ SET(GZSTREAM_INCLUDE_DIR "${GZSTREAM_INSTALL_DIR}/include/" CACHE PATH "gzstream
 
 ExternalProject_Add(
         extern_gzstream
+        DEPENDS zlib
         GIT_REPOSITORY "https://github.com/jacquesqiao/gzstream.git"
         GIT_TAG ""
         PREFIX          ${GZSTREAM_SOURCES_DIR}
         UPDATE_COMMAND  ""
         CONFIGURE_COMMAND ""
         BUILD_IN_SOURCE 1
-        BUILD_COMMAND   make -j8
+        BUILD_COMMAND   make EXTERN_CPPFLAGS="-I${THIRD_PARTY_PATH}/install/zlib/include" EXTERM_LDFLAGS="-L${THIRD_PARTY_PATH}/install/zlib/lib" -j8
         INSTALL_COMMAND mkdir -p ${GZSTREAM_INSTALL_DIR}/lib/ && mkdir -p ${GZSTREAM_INSTALL_DIR}/include/
         && cp ${GZSTREAM_SOURCES_DIR}/src/extern_gzstream/libgzstream.a ${GZSTREAM_INSTALL_DIR}/lib
         && cp -r ${GZSTREAM_SOURCES_DIR}/src/extern_gzstream/gzstream.h ${GZSTREAM_INSTALL_DIR}/include
diff --git a/cmake/external/ngraph.cmake b/cmake/external/ngraph.cmake
index 2e335579f32df4f146c8d88e05e684a9a8105e20..e66459fa3a1508fe4a3687f07bbe18f2a5421296 100644
--- a/cmake/external/ngraph.cmake
+++ b/cmake/external/ngraph.cmake
@@ -32,6 +32,8 @@ IF(NOT ${WITH_NGRAPH})
     return()
 ENDIF()
 
+INCLUDE(GNUInstallDirs)
+
 INCLUDE(ExternalProject)
 
 SET(NGRAPH_PROJECT         "extern_ngraph")
@@ -40,10 +42,14 @@ SET(NGRAPH_GIT_TAG         "f9fd9d4cc318dc59dd4b68448e7fbb5f67a28bd0")
 SET(NGRAPH_SOURCES_DIR     ${THIRD_PARTY_PATH}/ngraph)
 SET(NGRAPH_INSTALL_DIR     ${THIRD_PARTY_PATH}/install/ngraph)
 SET(NGRAPH_INC_DIR         ${NGRAPH_INSTALL_DIR}/include)
+SET(NGRAPH_LIB_DIR         ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR})
 SET(NGRAPH_SHARED_LIB_NAME libngraph.so.${NGRAPH_VERSION})
 SET(NGRAPH_CPU_LIB_NAME    libcpu_backend.so)
 SET(NGRAPH_TBB_LIB_NAME    libtbb.so.2)
 SET(NGRAPH_GIT_REPO        "https://github.com/NervanaSystems/ngraph.git")
+SET(NGRAPH_SHARED_LIB      ${NGRAPH_LIB_DIR}/${NGRAPH_SHARED_LIB_NAME})
+SET(NGRAPH_CPU_LIB         ${NGRAPH_LIB_DIR}/${NGRAPH_CPU_LIB_NAME})
+SET(NGRAPH_TBB_LIB         ${NGRAPH_LIB_DIR}/${NGRAPH_TBB_LIB_NAME})
 
 ExternalProject_Add(
     ${NGRAPH_PROJECT}
@@ -63,18 +69,6 @@ ExternalProject_Add(
     CMAKE_ARGS          -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/lib
 )
 
-if(UNIX AND NOT APPLE)
-    include(GNUInstallDirs)
-    SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR})
-else()
-    SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/lib)
-endif()
-MESSAGE(STATUS "nGraph lib will be installed at: ${NGRAPH_LIB_DIR}")
-
-SET(NGRAPH_SHARED_LIB      ${NGRAPH_LIB_DIR}/${NGRAPH_SHARED_LIB_NAME})
-SET(NGRAPH_CPU_LIB         ${NGRAPH_LIB_DIR}/${NGRAPH_CPU_LIB_NAME})
-SET(NGRAPH_TBB_LIB         ${NGRAPH_LIB_DIR}/${NGRAPH_TBB_LIB_NAME})
-
 # Workaround for nGraph expecting mklml to be in mkldnn install directory.
 ExternalProject_Add_Step(
     ${NGRAPH_PROJECT}
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 312fbaa0b3d83c37debe78be82503103eabc0bfa..a8b9dcfcf5eec39af0f59c03b1ed9bd4b71ee7bf 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -110,6 +110,14 @@ function(find_fluid_modules TARGET_NAME)
   endif()
 endfunction(find_fluid_modules)
 
+
+function(common_link TARGET_NAME)
+  if (WITH_PROFILER)
+    target_link_libraries(${TARGET_NAME} gperftools::profiler)
+  endif()
+endfunction()
+
+
 # find all third_party modules is used for paddle static library
 # for reduce the dependency when building the inference libs.
 set_property(GLOBAL PROPERTY FLUID_THIRD_PARTY)
@@ -274,6 +282,7 @@ function(cc_library TARGET_NAME)
       endif()
       target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
       add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
+      common_link(${TARGET_NAME})
     endif()
 
     # cpplint code style
@@ -340,6 +349,7 @@ function(cc_binary TARGET_NAME)
   if(cc_binary_DEPS)
     target_link_libraries(${TARGET_NAME} ${cc_binary_DEPS})
     add_dependencies(${TARGET_NAME} ${cc_binary_DEPS})
+    common_link(${TARGET_NAME})
   endif()
 endfunction(cc_binary)
 
@@ -362,6 +372,7 @@ function(cc_test TARGET_NAME)
       target_link_libraries(${TARGET_NAME} ${win32_deps})
     endif(WIN32)
     add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
+    common_link(${TARGET_NAME})
     add_test(NAME ${TARGET_NAME}
              COMMAND ${TARGET_NAME} ${cc_test_ARGS}
              WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
@@ -420,6 +431,7 @@ function(nv_binary TARGET_NAME)
     if(nv_binary_DEPS)
       target_link_libraries(${TARGET_NAME} ${nv_binary_DEPS})
       add_dependencies(${TARGET_NAME} ${nv_binary_DEPS})
+      common_link(${TARGET_NAME})
     endif()
   endif()
 endfunction(nv_binary)
@@ -433,6 +445,7 @@ function(nv_test TARGET_NAME)
     cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
     target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
     add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
+    common_link(${TARGET_NAME})
     add_test(${TARGET_NAME} ${TARGET_NAME})
     if (nv_test_SERIAL)
         set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
@@ -499,6 +512,7 @@ function(hip_binary TARGET_NAME)
     if(hip_binary_DEPS)
       target_link_libraries(${TARGET_NAME} ${hip_binary_DEPS})
       add_dependencies(${TARGET_NAME} ${hip_binary_DEPS})
+      common_link(${TARGET_NAME})
     endif()
   endif()
 endfunction(hip_binary)
@@ -518,6 +532,7 @@ function(hip_test TARGET_NAME)
     set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP)
     target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags)
     add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags)
+    common_link(${TARGET_NAME})
     add_test(${TARGET_NAME} ${TARGET_NAME})
   endif()
 endfunction(hip_test)
@@ -560,6 +575,7 @@ function(go_library TARGET_NAME)
   endif()
   if(go_library_DEPS)
     add_dependencies(${TARGET_NAME} ${go_library_DEPS})
+    common_link(${TARGET_NAME})
   endif(go_library_DEPS)
 
   # The "source file" of the library is `${dummyfile}` which never
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index 0b95a780721b0771d55c4dbb2ddce33418612018..c679d8507d8a9d3bce48b7f38491dadd9f2fb7f6 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -129,6 +129,15 @@ if (WITH_MKLDNN)
             )
 endif ()
 
+if (WITH_NGRAPH)
+    set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/ngraph")
+    copy(ngraph_lib
+            SRCS ${NGRAPH_INC_DIR} ${NGRAPH_LIB_DIR}
+            DSTS ${dst_dir} ${dst_dir}
+            DEPS ngraph
+            )
+endif ()
+
 if (NOT WIN32)
     if (NOT MOBILE_INFERENCE AND NOT RPI)
         set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy")
diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index 89726bf9859e71ee04c2f9380554090845fd44e5..2ced43f9e6c60da642f7a6252f889d9c9ab9748f 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -166,6 +166,8 @@ function(op_library TARGET)
       # Append first implemented MKLDNN activation operator
       if (${MKLDNN_FILE} STREQUAL "activation_mkldnn_op")
         file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, MKLDNN);\n")
+      elseif(${MKLDNN_FILE} STREQUAL "conv_mkldnn_op")
+        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32);\n")
       else()
         file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n")
       endif()
diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index c40f6033419a2425d9996eb9a4584fc9cd1a70e3..8e6482ca981e1473a552efcc3ee043aeda137780 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -32,6 +32,13 @@ paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.c
 paddle.fluid.BuildStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy) -> None
 paddle.fluid.create_lod_tensor ArgSpec(args=['data', 'recursive_seq_lens', 'place'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.create_random_int_lodtensor ArgSpec(args=['recursive_seq_lens', 'base_shape', 'place', 'low', 'high'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.DataFeedDesc.__init__ ArgSpec(args=['self', 'proto_file'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.DataFeedDesc.desc ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.DataFeedDesc.set_batch_size ArgSpec(args=['self', 'batch_size'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.DataFeedDesc.set_dense_slots ArgSpec(args=['self', 'dense_slots_name'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.DataFeedDesc.set_use_slots ArgSpec(args=['self', 'use_slots_name'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.AsyncExecutor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.AsyncExecutor.run ArgSpec(args=['self', 'program', 'data_feed', 'filelist', 'thread_num', 'fetch', 'debug'], varargs=None, keywords=None, defaults=(False,))
 paddle.fluid.io.save_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None))
 paddle.fluid.io.save_params ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.io.save_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
@@ -59,6 +66,7 @@ paddle.fluid.layers.linear_chain_crf ArgSpec(args=['input', 'label', 'param_attr
 paddle.fluid.layers.crf_decoding ArgSpec(args=['input', 'param_attr', 'label'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.cos_sim ArgSpec(args=['X', 'Y'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.cross_entropy ArgSpec(args=['input', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100))
+paddle.fluid.layers.bpr_loss ArgSpec(args=['input', 'label', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.square_error_cost ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None, None))
@@ -69,7 +77,7 @@ paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name']
 paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None))
 paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True))
 paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True))
-paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False))
+paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False))
 paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
 paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
@@ -175,7 +183,7 @@ paddle.fluid.layers.clip ArgSpec(args=['x', 'min', 'max', 'name'], varargs=None,
 paddle.fluid.layers.clip_by_norm ArgSpec(args=['x', 'max_norm', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None))
-paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'ignore_index', 'name'], varargs=None, keywords=None, defaults=(-100, None))
 paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.space_to_depth ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,))
@@ -187,6 +195,10 @@ paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=Non
 paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None))
 paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None))
+paddle.fluid.layers.merge_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1))
+paddle.fluid.layers.psroi_pool ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
 paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
@@ -291,6 +303,7 @@ paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'i
 paddle.fluid.layers.iou_similarity ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.box_coder ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name'], varargs=None, keywords=None, defaults=('encode_center_size', True, None))
 paddle.fluid.layers.polygon_box_transform ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.yolov3_loss ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'class_num', 'ignore_thresh', 'loss_weight_xy', 'loss_weight_wh', 'loss_weight_conf_target', 'loss_weight_conf_notarget', 'loss_weight_class', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None))
 paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None))
 paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1))
 paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
@@ -411,3 +424,17 @@ paddle.fluid.Scope.drop_kids drop_kids(self: paddle.fluid.core.Scope) -> None
 paddle.fluid.Scope.find_var find_var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable
 paddle.fluid.Scope.new_scope new_scope(self: paddle.fluid.core.Scope) -> paddle.fluid.core.Scope
 paddle.fluid.Scope.var var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable
+paddle.reader.map_readers ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None)
+paddle.reader.buffered ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None)
+paddle.reader.compose ArgSpec(args=[], varargs='readers', keywords='kwargs', defaults=None)
+paddle.reader.chain ArgSpec(args=[], varargs='readers', keywords=None, defaults=None)
+paddle.reader.shuffle ArgSpec(args=['reader', 'buf_size'], varargs=None, keywords=None, defaults=None)
+paddle.reader.firstn ArgSpec(args=['reader', 'n'], varargs=None, keywords=None, defaults=None)
+paddle.reader.xmap_readers ArgSpec(args=['mapper', 'reader', 'process_num', 'buffer_size', 'order'], varargs=None, keywords=None, defaults=(False,))
+paddle.reader.PipeReader.__init__ ArgSpec(args=['self', 'command', 'bufsize', 'file_type'], varargs=None, keywords=None, defaults=(8192, 'plain'))
+paddle.reader.PipeReader.get_line ArgSpec(args=['self', 'cut_lines', 'line_break'], varargs=None, keywords=None, defaults=(True, '\n'))
+paddle.reader.multiprocess_reader ArgSpec(args=['readers', 'use_pipe', 'queue_size'], varargs=None, keywords=None, defaults=(True, 1000))
+paddle.reader.Fake.__init__ ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.reader.creator.np_array ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
+paddle.reader.creator.text_file ArgSpec(args=['path'], varargs=None, keywords=None, defaults=None)
+paddle.reader.creator.recordio ArgSpec(args=['paths', 'buf_size'], varargs=None, keywords=None, defaults=(100,))
diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt
index 6b526f0103ad3c530c06a68757cf89293f4fb84b..595454e90b9cd713fd2baed24538cf5fbc93934a 100644
--- a/paddle/fluid/CMakeLists.txt
+++ b/paddle/fluid/CMakeLists.txt
@@ -1,6 +1,7 @@
 add_subdirectory(memory)
 add_subdirectory(platform)
 add_subdirectory(framework)
+add_subdirectory(imperative)
 add_subdirectory(operators)
 add_subdirectory(string)
 add_subdirectory(recordio)
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 52946c7f11f90490b1af1347f20db236a8fe24af..6d7a69c8c9e11016f641f73e296156da09114408 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -34,6 +34,7 @@ add_subdirectory(ir)
 add_subdirectory(details)
 # ddim lib
 proto_library(framework_proto SRCS framework.proto)
+proto_library(async_executor_proto SRCS data_feed.proto)
 
 cc_library(ddim SRCS ddim.cc DEPS eigen3 boost)
 cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
@@ -71,6 +72,8 @@ cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto
 cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
 nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
 
+cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memory)
+
 cc_library(reader SRCS reader.cc DEPS lod_tensor ddim)
 cc_test(reader_test SRCS reader_test.cc DEPS reader)
 
@@ -117,8 +120,9 @@ cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
 cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context)
 
 cc_library(transfer_scope_cache SRCS transfer_scope_cache.cc DEPS scope framework_proto device_context)
+cc_library(op_kernel_type SRCS op_kernel_type.cc DEPS device_context place)
 cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
-    shape_inference data_transform lod_tensor profiler transfer_scope_cache)
+    shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type)
 
 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context)
 
@@ -126,16 +130,19 @@ cc_library(version SRCS version.cc)
 cc_test(version_test SRCS version_test.cc DEPS version)
 
 cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version)
-cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto)
-if(NOT WIN32)
-cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog
-  shape_inference data_transform lod_tensor profiler)
-endif(NOT WIN32)
+
+if(WITH_NGRAPH)
+  if(NOT WIN32)
+    cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph)
+    cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog
+      shape_inference data_transform lod_tensor profiler ngraph)
+  endif(NOT WIN32)
+endif(WITH_NGRAPH)
 
 cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
 nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
 
-py_proto_compile(framework_py_proto SRCS framework.proto)
+py_proto_compile(framework_py_proto SRCS framework.proto data_feed.proto)
 # Generate an empty __init__.py to make framework_py_proto as a valid python module.
 add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
 add_dependencies(framework_py_proto framework_py_proto_init)
@@ -157,27 +164,37 @@ endif(NOT WIN32)
 cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
 
 cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog)
+cc_library(variable_helper SRCS variable_helper.cc DEPS lod_tensor)
 
-cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass)
+cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
 
 if(WITH_DISTRIBUTE)
-  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass)
+  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass variable_helper)
   set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
   set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 else()
-  if(NOT WIN32)
-    cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator)
-  else(NOT WIN32)
-    cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass)
-  endif(NOT WIN32)
+  if(WITH_NGRAPH)
+    if(NOT WIN32)
+      cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph ngraph_operator variable_helper)
+    else(NOT WIN32)
+      cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
+    endif(NOT WIN32)
+  else(WITH_NGRAPH)
+    cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
+  endif(WITH_NGRAPH)
   cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
 endif()
 
+target_link_libraries(executor garbage_collector)
+
 cc_library(parallel_executor SRCS parallel_executor.cc DEPS
         threaded_ssa_graph_executor scope_buffered_ssa_graph_executor
         graph build_strategy
-        fast_threaded_ssa_graph_executor)
+        fast_threaded_ssa_graph_executor variable_helper)
+
+cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper)
 
+cc_test(data_feed_test SRCS data_feed_test.cc DEPS async_executor)
 cc_library(prune SRCS prune.cc DEPS framework_proto)
 cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
 cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
@@ -185,7 +202,7 @@ cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
 cc_library(selected_rows SRCS selected_rows.cc DEPS tensor)
 cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)
 
-cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto)
+cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto op_kernel_type)
 cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
 
 cc_test(tuple_test SRCS tuple_test.cc )
diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..afb2dd2f064384da39904f6aceead4fa915a80f2
--- /dev/null
+++ b/paddle/fluid/framework/async_executor.cc
@@ -0,0 +1,138 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/async_executor.h"
+#include "google/protobuf/io/zero_copy_stream_impl.h"
+#include "google/protobuf/message.h"
+#include "google/protobuf/text_format.h"
+
+#include "gflags/gflags.h"
+#include "paddle/fluid/framework/data_feed_factory.h"
+#include "paddle/fluid/framework/executor_thread_worker.h"
+#include "paddle/fluid/framework/feed_fetch_method.h"
+#include "paddle/fluid/framework/feed_fetch_type.h"
+#include "paddle/fluid/framework/lod_rank_table.h"
+#include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/reader.h"
+#include "paddle/fluid/inference/io.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/pybind/pybind.h"
+
+namespace paddle {
+namespace framework {
+AsyncExecutor::AsyncExecutor(Scope* scope, const platform::Place& place)
+    : root_scope_(scope), place_(place) {}
+
+void AsyncExecutor::CreateThreads(
+    ExecutorThreadWorker* worker, const ProgramDesc& main_program,
+    const std::shared_ptr<DataFeed>& reader,
+    const std::vector<std::string>& fetch_var_names, Scope* root_scope,
+    const int thread_index, const bool debug) {
+  worker->SetThreadId(thread_index);
+  worker->SetDebug(debug);
+  worker->SetRootScope(root_scope);
+  worker->CreateThreadResource(main_program, place_);
+  worker->SetDataFeed(reader);
+  worker->SetFetchVarNames(fetch_var_names);
+  worker->BindingDataFeedMemory();
+}
+
+void PrepareReaders(std::vector<std::shared_ptr<DataFeed>>& readers,  // NOLINT
+                    const int thread_num, const DataFeedDesc& data_feed_desc,
+                    const std::vector<std::string>& filelist) {
+  readers.resize(thread_num);
+  for (size_t i = 0; i < readers.size(); ++i) {
+    readers[i] = DataFeedFactory::CreateDataFeed(data_feed_desc.name());
+    readers[i]->Init(data_feed_desc);  // set batch_size and queue_size here
+  }
+  readers[0]->SetFileList(filelist);
+}
+
+void AsyncExecutor::RunFromFile(const ProgramDesc& main_program,
+                                const std::string& data_feed_desc_str,
+                                const std::vector<std::string>& filelist,
+                                const int thread_num,
+                                const std::vector<std::string>& fetch_var_names,
+                                const bool debug) {
+  std::vector<std::thread> threads;
+
+  auto& block = main_program.Block(0);
+  for (auto var_name : fetch_var_names) {
+    auto var_desc = block.FindVar(var_name);
+    auto shapes = var_desc->GetShape();
+    PADDLE_ENFORCE(shapes[shapes.size() - 1] == 1,
+                   "var %s: Fetched var has wrong shape, "
+                   "only variables with the last dimension size 1 supported",
+                   var_name);
+  }
+
+  DataFeedDesc data_feed_desc;
+  google::protobuf::TextFormat::ParseFromString(data_feed_desc_str,
+                                                &data_feed_desc);
+
+  int actual_thread_num = thread_num;
+  int file_cnt = filelist.size();
+  PADDLE_ENFORCE(file_cnt > 0, "File list cannot be empty");
+
+  if (actual_thread_num > file_cnt) {
+    VLOG(1) << "Thread num = " << thread_num << ", file num = " << file_cnt
+            << ". Changing thread_num = " << file_cnt;
+    actual_thread_num = file_cnt;
+  }
+
+  /*
+    readerDesc: protobuf description for reader initlization
+    argument: class_name, batch_size, use_slot, queue_size, buffer_size,
+    padding_index
+
+    reader:
+    1) each thread has a reader, reader will read input data and
+    put it into input queue
+    2) each reader has a Next() iterface, that can fetch an instance
+    from the input queue
+   */
+  // todo: should be factory method for creating datafeed
+  std::vector<std::shared_ptr<DataFeed>> readers;
+  PrepareReaders(readers, actual_thread_num, data_feed_desc, filelist);
+
+  std::vector<std::shared_ptr<ExecutorThreadWorker>> workers;
+  workers.resize(actual_thread_num);
+  for (auto& worker : workers) {
+    worker.reset(new ExecutorThreadWorker);
+  }
+
+  // prepare thread resource here
+  for (int thidx = 0; thidx < actual_thread_num; ++thidx) {
+    CreateThreads(workers[thidx].get(), main_program, readers[thidx],
+                  fetch_var_names, root_scope_, thidx, debug);
+  }
+
+  // start executing ops in multiple threads
+  for (int thidx = 0; thidx < actual_thread_num; ++thidx) {
+    threads.push_back(
+        std::thread(&ExecutorThreadWorker::TrainFiles, workers[thidx].get()));
+  }
+
+  for (auto& th : threads) {
+    th.join();
+  }
+
+  root_scope_->DropKids();
+
+  return;
+}
+
+}  // einit_modelnd namespace framework
+}  // end namespace paddle
diff --git a/paddle/fluid/framework/async_executor.h b/paddle/fluid/framework/async_executor.h
new file mode 100644
index 0000000000000000000000000000000000000000..f4d2a79ac592e02f49ec0b988c824dc98883fbf6
--- /dev/null
+++ b/paddle/fluid/framework/async_executor.h
@@ -0,0 +1,58 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <mutex>  // NOLINT
+#include <set>
+#include <string>
+#include <thread>  // NOLINT
+#include <typeinfo>
+#include <vector>
+#include "paddle/fluid/framework/data_feed.pb.h"
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/executor_thread_worker.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/scope.h"
+
+namespace paddle {
+namespace framework {
+class AsyncExecutor {
+ public:
+  AsyncExecutor(Scope* scope, const platform::Place& place);
+  virtual ~AsyncExecutor() {}
+  void RunFromFile(const ProgramDesc& main_program,
+                   const std::string& data_feed_desc_str,
+                   const std::vector<std::string>& filelist,
+                   const int thread_num,
+                   const std::vector<std::string>& fetch_names,
+                   const bool debug = false);
+
+ private:
+  void CreateThreads(ExecutorThreadWorker* worker,
+                     const ProgramDesc& main_program,
+                     const std::shared_ptr<DataFeed>& reader,
+                     const std::vector<std::string>& fetch_var_names,
+                     Scope* root_scope, const int thread_index,
+                     const bool debug);
+
+ public:
+  Scope* root_scope_;
+  platform::Place place_;
+};
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a99cf53b410433c6e4b8a19821779f28c25e678f
--- /dev/null
+++ b/paddle/fluid/framework/data_feed.cc
@@ -0,0 +1,373 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "google/protobuf/io/zero_copy_stream_impl.h"
+#include "google/protobuf/message.h"
+#include "google/protobuf/text_format.h"
+
+#include "gflags/gflags.h"
+#include "paddle/fluid/framework/data_feed.h"
+#include "paddle/fluid/framework/feed_fetch_method.h"
+#include "paddle/fluid/framework/feed_fetch_type.h"
+
+namespace paddle {
+namespace framework {
+
+std::vector<std::string> DataFeed::filelist_;
+size_t DataFeed::file_idx_;
+std::mutex DataFeed::mutex_for_pick_file_;
+bool DataFeed::finish_set_filelist_;
+
+void DataFeed::AddFeedVar(Variable* var, const std::string& name) {
+  CheckInit();
+  for (size_t i = 0; i < use_slots_.size(); ++i) {
+    if (name == use_slots_[i]) {
+      feed_vec_[i] = var->GetMutable<LoDTensor>();
+    }
+  }
+}
+
+bool DataFeed::SetFileList(const std::vector<std::string>& files) {
+  std::unique_lock<std::mutex> lock(mutex_for_pick_file_);
+  CheckInit();
+  if (finish_set_filelist_) {
+    VLOG(3) << "info: you have set the filelist.";
+    return false;
+  }
+  PADDLE_ENFORCE(files.size(), "You have set an empty filelist.");
+  filelist_.assign(files.begin(), files.end());
+  file_idx_ = 0;
+
+  finish_set_filelist_ = true;
+  return true;
+}
+
+void DataFeed::SetBatchSize(int batch_size) {
+  PADDLE_ENFORCE(batch_size > 0, "Illegal batch size: %d.", batch_size);
+  default_batch_size_ = batch_size;
+}
+
+bool DataFeed::PickOneFile(std::string* filename) {
+  std::unique_lock<std::mutex> lock(mutex_for_pick_file_);
+  if (file_idx_ == filelist_.size()) {
+    return false;
+  }
+  *filename = filelist_[file_idx_++];
+  return true;
+}
+
+void DataFeed::CheckInit() {
+  PADDLE_ENFORCE(finish_init_, "Initialization did not succeed.");
+}
+
+void DataFeed::CheckSetFileList() {
+  PADDLE_ENFORCE(finish_set_filelist_, "Set filelist did not succeed.");
+}
+
+void DataFeed::CheckStart() {
+  PADDLE_ENFORCE(finish_start_, "Datafeed has not started running yet.");
+}
+
+template <typename T>
+void PrivateQueueDataFeed<T>::SetQueueSize(int queue_size) {
+  PADDLE_ENFORCE(queue_size > 0, "Illegal queue size: %d.", queue_size);
+  queue_size_ = queue_size;
+  queue_ = std::unique_ptr<paddle::operators::reader::BlockingQueue<T>>(
+      new paddle::operators::reader::BlockingQueue<T>(queue_size_));
+}
+
+template <typename T>
+bool PrivateQueueDataFeed<T>::Start() {
+  CheckSetFileList();
+  read_thread_ = std::thread(&PrivateQueueDataFeed::ReadThread, this);
+  read_thread_.detach();
+
+  finish_start_ = true;
+  return true;
+}
+
+template <typename T>
+void PrivateQueueDataFeed<T>::ReadThread() {
+  std::string filename;
+  while (PickOneFile(&filename)) {
+    file_.open(filename.c_str());  // is_text_feed
+    PADDLE_ENFORCE(file_.good(), "Open file<%s> fail.", filename.c_str());
+    T instance;
+    while (ParseOneInstance(&instance)) {
+      queue_->Send(instance);
+    }
+    file_.close();
+  }
+  queue_->Close();
+}
+
+template <typename T>
+int PrivateQueueDataFeed<T>::Next() {
+  CheckStart();
+  int index = 0;
+  T instance;
+  T ins_vec;
+  while (index < default_batch_size_) {
+    if (!queue_->Receive(&instance)) {
+      break;
+    }
+    AddInstanceToInsVec(&ins_vec, instance, index++);
+  }
+  batch_size_ = index;
+  if (batch_size_ != 0) {
+    PutToFeedVec(ins_vec);
+  }
+  return batch_size_;
+}
+
+#ifdef _WIN32
+template class PrivateQueueDataFeed<std::vector<MultiSlotType>>;
+#endif
+
+void MultiSlotDataFeed::Init(
+    const paddle::framework::DataFeedDesc& data_feed_desc) {
+  finish_init_ = false;
+  finish_set_filelist_ = false;
+  finish_start_ = false;
+
+  PADDLE_ENFORCE(data_feed_desc.has_multi_slot_desc(),
+                 "Multi_slot_desc has not been set.");
+  paddle::framework::MultiSlotDesc multi_slot_desc =
+      data_feed_desc.multi_slot_desc();
+  SetBatchSize(data_feed_desc.batch_size());
+  SetQueueSize(data_feed_desc.batch_size());
+  size_t all_slot_num = multi_slot_desc.slots_size();
+  all_slots_.resize(all_slot_num);
+  all_slots_type_.resize(all_slot_num);
+  use_slots_index_.resize(all_slot_num);
+  use_slots_.clear();
+  use_slots_is_dense_.clear();
+  for (size_t i = 0; i < all_slot_num; ++i) {
+    const auto& slot = multi_slot_desc.slots(i);
+    all_slots_[i] = slot.name();
+    all_slots_type_[i] = slot.type();
+    use_slots_index_[i] = slot.is_used() ? use_slots_.size() : -1;
+    if (slot.is_used()) {
+      use_slots_.push_back(all_slots_[i]);
+      use_slots_is_dense_.push_back(slot.is_dense());
+    }
+  }
+  feed_vec_.resize(use_slots_.size());
+  finish_init_ = true;
+}
+
+bool MultiSlotDataFeed::CheckFile(const char* filename) {
+  CheckInit();  // get info of slots
+  std::ifstream fin(filename);
+  if (!fin.good()) {
+    VLOG(1) << "error: open file<" << filename << "> fail";
+    return false;
+  }
+  std::string line;
+  int instance_cout = 0;
+  std::string all_slots_alias = "";
+  for (const auto& alias : all_slots_) {
+    all_slots_alias += alias + " ";
+  }
+  std::string use_slots_alias = "";
+  for (const auto& alias : use_slots_) {
+    use_slots_alias += alias + " ";
+  }
+  VLOG(3) << "total slots num: " << all_slots_.size();
+  VLOG(3) << "total slots alias: " << all_slots_alias;
+  VLOG(3) << "used slots num: " << use_slots_.size();
+  VLOG(3) << "used slots alias: " << use_slots_alias;
+  while (getline(fin, line)) {
+    ++instance_cout;
+    const char* str = line.c_str();
+    char* endptr = const_cast<char*>(str);
+    int len = line.length();
+    for (size_t i = 0; i < all_slots_.size(); ++i) {
+      int num = strtol(endptr, &endptr, 10);
+      if (num < 0) {
+        VLOG(0) << "error: the number of ids is a negative number: " << num;
+        VLOG(0) << "please check line<" << instance_cout << "> in file<"
+                << filename << ">";
+        return false;
+      } else if (num == 0) {
+        VLOG(0)
+            << "error: the number of ids can not be zero, you need "
+               "padding it in data generator; or if there is something wrong"
+               " with the data, please check if the data contains unresolvable "
+               "characters.";
+        VLOG(0) << "please check line<" << instance_cout << "> in file<"
+                << filename << ">";
+        return false;
+      } else if (errno == ERANGE || num > INT_MAX) {
+        VLOG(0) << "error: the number of ids greater than INT_MAX";
+        VLOG(0) << "please check line<" << instance_cout << "> in file<"
+                << filename << ">";
+        return false;
+      }
+      if (all_slots_type_[i] == "float") {
+        for (int i = 0; i < num; ++i) {
+          strtof(endptr, &endptr);
+          if (errno == ERANGE) {
+            VLOG(0) << "error: the value is out of the range of "
+                       "representable values for float";
+            VLOG(0) << "please check line<" << instance_cout << "> in file<"
+                    << filename << ">";
+            return false;
+          }
+          if (i + 1 != num && endptr - str == len) {
+            VLOG(0) << "error: there is a wrong with the number of ids.";
+            VLOG(0) << "please check line<" << instance_cout << "> in file<"
+                    << filename << ">";
+            return false;
+          }
+        }
+      } else if (all_slots_type_[i] == "uint64") {
+        for (int i = 0; i < num; ++i) {
+          strtoull(endptr, &endptr, 10);
+          if (errno == ERANGE) {
+            VLOG(0) << "error: the value is out of the range of "
+                       "representable values for uint64_t";
+            VLOG(0) << "please check line<" << instance_cout << "> in file<"
+                    << filename << ">";
+            return false;
+          }
+          if (i + 1 != num && endptr - str == len) {
+            VLOG(0) << "error: there is a wrong with the number of ids.";
+            VLOG(0) << "please check line<" << instance_cout << "> in file<"
+                    << filename << ">";
+            return false;
+          }
+        }
+      } else {
+        VLOG(0) << "error: this type<" << all_slots_type_[i]
+                << "> is not supported";
+        return false;
+      }
+    }
+    // It may be added '\t' character to the end of the output of reduce
+    // task when processes data by Hadoop(when the output of the reduce
+    // task of Hadoop has only one field, it will add a '\t' at the end
+    // of the line by default, and you can use this option to avoid it:
+    // `-D mapred.textoutputformat.ignoreseparator=true`), which does
+    // not affect the correctness of the data. Therefore, it should be
+    // judged that the data is not normal when the end of each line of
+    // data contains characters which are not spaces.
+    while (endptr - str != len) {
+      if (!isspace(*(endptr++))) {
+        VLOG(0)
+            << "error: there is some extra characters at the end of the line.";
+        VLOG(0) << "please check line<" << instance_cout << "> in file<"
+                << filename << ">";
+        return false;
+      }
+    }
+  }
+  VLOG(3) << "instances cout: " << instance_cout;
+  VLOG(3) << "The file format is correct";
+  return true;
+}
+
+bool MultiSlotDataFeed::ParseOneInstance(std::vector<MultiSlotType>* instance) {
+  std::string line;
+  if (getline(file_, line)) {
+    int use_slots_num = use_slots_.size();
+    instance->resize(use_slots_num);
+    // parse line
+    const char* str = line.c_str();
+    char* endptr = const_cast<char*>(str);
+    int pos = 0;
+    for (size_t i = 0; i < use_slots_index_.size(); ++i) {
+      int idx = use_slots_index_[i];
+      int num = strtol(&str[pos], &endptr, 10);
+      PADDLE_ENFORCE(
+          num,
+          "The number of ids can not be zero, you need padding "
+          "it in data generator; or if there is something wrong with "
+          "the data, please check if the data contains unresolvable "
+          "characters.\nplease check this error line: %s",
+          str);
+
+      if (idx != -1) {
+        (*instance)[idx].Init(all_slots_type_[i]);
+        if ((*instance)[idx].GetType()[0] == 'f') {  // float
+          for (int j = 0; j < num; ++j) {
+            float feasign = strtof(endptr, &endptr);
+            (*instance)[idx].AddValue(feasign);
+          }
+        } else if ((*instance)[idx].GetType()[0] == 'u') {  // uint64
+          for (int j = 0; j < num; ++j) {
+            uint64_t feasign = (uint64_t)strtoull(endptr, &endptr, 10);
+            (*instance)[idx].AddValue(feasign);
+          }
+        }
+        pos = endptr - str;
+      } else {
+        for (int j = 0; j <= num; ++j) {
+          pos = line.find_first_of(' ', pos + 1);
+        }
+      }
+    }
+  } else {
+    return false;
+  }
+  return true;
+}
+
+void MultiSlotDataFeed::AddInstanceToInsVec(
+    std::vector<MultiSlotType>* ins_vec,
+    const std::vector<MultiSlotType>& instance, int index) {
+  if (index == 0) {
+    ins_vec->resize(instance.size());
+    for (size_t i = 0; i < instance.size(); ++i) {
+      (*ins_vec)[i].Init(instance[i].GetType());
+      (*ins_vec)[i].InitOffset();
+    }
+  }
+
+  for (size_t i = 0; i < instance.size(); ++i) {
+    (*ins_vec)[i].AddIns(instance[i]);
+  }
+}
+
+void MultiSlotDataFeed::PutToFeedVec(
+    const std::vector<MultiSlotType>& ins_vec) {
+  for (size_t i = 0; i < use_slots_.size(); ++i) {
+    const auto& type = ins_vec[i].GetType();
+    const auto& offset = ins_vec[i].GetOffset();
+    int total_instance = static_cast<int>(offset.back());
+
+    if (type[0] == 'f') {  // float
+      const auto& feasign = ins_vec[i].GetFloatData();
+      float* tensor_ptr = feed_vec_[i]->mutable_data<float>(
+          {total_instance, 1}, platform::CPUPlace());
+      memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(float));
+    } else if (type[0] == 'u') {  // uint64
+      // no uint64_t type in paddlepaddle
+      const auto& feasign = ins_vec[i].GetUint64Data();
+      int64_t* tensor_ptr = feed_vec_[i]->mutable_data<int64_t>(
+          {total_instance, 1}, platform::CPUPlace());
+      memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(int64_t));
+    }
+
+    LoD data_lod{offset};
+    feed_vec_[i]->set_lod(data_lod);
+    if (use_slots_is_dense_[i]) {
+      int dim = total_instance / batch_size_;
+      feed_vec_[i]->Resize({batch_size_, dim});
+    }
+  }
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h
new file mode 100644
index 0000000000000000000000000000000000000000..7cc6919703680c359b89075777e97676f5253c57
--- /dev/null
+++ b/paddle/fluid/framework/data_feed.h
@@ -0,0 +1,240 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <fstream>
+#include <memory>
+#include <mutex>  // NOLINT
+#include <string>
+#include <thread>  // NOLINT
+#include <vector>
+
+#include "paddle/fluid/framework/data_feed.pb.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/reader.h"
+#include "paddle/fluid/framework/variable.h"
+#include "paddle/fluid/operators/reader/blocking_queue.h"
+
+namespace paddle {
+namespace framework {
+
+// DataFeed is the base virtual class for all ohther DataFeeds.
+// It is used to read files and parse the data for subsequent trainer.
+// Example:
+//   DataFeed* reader =
+//   paddle::framework::DataFeedFactory::CreateDataFeed(data_feed_name);
+//   reader->Init(data_feed_desc); // data_feed_desc is a protobuf object
+//   reader->SetFileList(filelist);
+//   const std::vector<std::string> & use_slot_alias =
+//   reader->GetUseSlotAlias();
+//   for (auto name: use_slot_alias){ // for binding memory
+//     reader->AddFeedVar(scope->Var(name), name);
+//   }
+//   reader->Start();
+//   while (reader->Next()) {
+//      // trainer do something
+//   }
+class DataFeed {
+ public:
+  DataFeed() {}
+  virtual ~DataFeed() {}
+  virtual void Init(const paddle::framework::DataFeedDesc& data_feed_desc) = 0;
+  virtual bool CheckFile(const char* filename) {
+    PADDLE_THROW("This function(CheckFile) is not implemented.");
+  }
+  // Set filelist for DataFeed.
+  // Pay attention that it must init all readers before call this function.
+  // Otherwise, Init() function will init finish_set_filelist_ flag.
+  virtual bool SetFileList(const std::vector<std::string>& files);
+  virtual bool Start() = 0;
+  // The trainer calls the Next() function, and the DataFeed will load a new
+  // batch to the feed_vec. The return value of this function is the batch
+  // size of the current batch.
+  virtual int Next() = 0;
+  // Get all slots' alias which defined in protofile
+  virtual const std::vector<std::string>& GetAllSlotAlias() {
+    return all_slots_;
+  }
+  // Get used slots' alias which defined in protofile
+  virtual const std::vector<std::string>& GetUseSlotAlias() {
+    return use_slots_;
+  }
+  // This function is used for binding feed_vec memory
+  virtual void AddFeedVar(Variable* var, const std::string& name);
+
+ protected:
+  // The following three functions are used to check if it is executed in this
+  // order:
+  //   Init() -> SetFileList() -> Start() -> Next()
+  virtual void CheckInit();
+  virtual void CheckSetFileList();
+  virtual void CheckStart();
+  virtual void SetBatchSize(
+      int batch);  // batch size will be set in Init() function
+  // This function is used to pick one file from the global filelist(thread
+  // safe).
+  virtual bool PickOneFile(std::string* filename);
+
+  static std::vector<std::string> filelist_;
+  static size_t file_idx_;
+  static std::mutex mutex_for_pick_file_;
+
+  // the alias of used slots, and its order is determined by
+  // data_feed_desc(proto object)
+  std::vector<std::string> use_slots_;
+  std::vector<bool> use_slots_is_dense_;
+
+  // the alias of all slots, and its order is determined by data_feed_desc(proto
+  // object)
+  std::vector<std::string> all_slots_;
+  std::vector<std::string> all_slots_type_;
+  std::vector<int>
+      use_slots_index_;  // -1: not used; >=0: the index of use_slots_
+
+  // The data read by DataFeed will be stored here
+  std::vector<LoDTensor*> feed_vec_;
+
+  // the batch size defined by user
+  int default_batch_size_;
+  // current batch size
+  int batch_size_;
+
+  bool finish_init_;
+  static bool finish_set_filelist_;
+  bool finish_start_;
+};
+
+// PrivateQueueDataFeed is the base virtual class for ohther DataFeeds.
+// It use a read-thread to read file and parse data to a private-queue
+// (thread level), and get data from this queue when trainer call Next().
+template <typename T>
+class PrivateQueueDataFeed : public DataFeed {
+ public:
+  PrivateQueueDataFeed() {}
+  virtual ~PrivateQueueDataFeed() {}
+  virtual void Init(const paddle::framework::DataFeedDesc& data_feed_desc) = 0;
+  virtual bool Start();
+  virtual int Next();
+
+ protected:
+  // The thread implementation function for reading file and parse.
+  virtual void ReadThread();
+  // This function is used to set private-queue size, and the most
+  // efficient when the queue size is close to the batch size.
+  virtual void SetQueueSize(int queue_size);
+  // The reading and parsing method called in the ReadThread.
+  virtual bool ParseOneInstance(T* instance) = 0;
+  // This function is used to put instance to vec_ins
+  virtual void AddInstanceToInsVec(T* vec_ins, const T& instance,
+                                   int index) = 0;
+  // This function is used to put ins_vec to feed_vec
+  virtual void PutToFeedVec(const T& ins_vec) = 0;
+
+  // The thread for read files
+  std::thread read_thread_;
+  // using ifstream one line and one line parse is faster
+  // than using fread one buffer and one buffer parse.
+  //   for a 601M real data:
+  //     ifstream one line and one line parse: 6034 ms
+  //     fread one buffer and one buffer parse: 7097 ms
+  std::ifstream file_;
+  size_t queue_size_;
+  // The queue for store parsed data
+  std::unique_ptr<paddle::operators::reader::BlockingQueue<T>> queue_;
+};
+
+// This class define the data type of instance(ins_vec) in MultiSlotDataFeed
+class MultiSlotType {
+ public:
+  MultiSlotType() {}
+  ~MultiSlotType() {}
+  void Init(const std::string& type) {
+    CheckType(type);
+    if (type_[0] == 'f') {
+      float_feasign_.clear();
+    } else if (type_[0] == 'u') {
+      uint64_feasign_.clear();
+    }
+    type_ = type;
+  }
+  void InitOffset() {
+    offset_.resize(1);
+    // LoDTensor' lod is counted from 0, the size of lod
+    // is one size larger than the size of data.
+    offset_[0] = 0;
+  }
+  const std::vector<size_t>& GetOffset() const { return offset_; }
+  void AddValue(const float v) {
+    CheckFloat();
+    float_feasign_.push_back(v);
+  }
+  void AddValue(const uint64_t v) {
+    CheckUint64();
+    uint64_feasign_.push_back(v);
+  }
+  void AddIns(const MultiSlotType& ins) {
+    if (ins.GetType()[0] == 'f') {  // float
+      CheckFloat();
+      auto& vec = ins.GetFloatData();
+      offset_.push_back(offset_.back() + vec.size());
+      float_feasign_.insert(float_feasign_.end(), vec.begin(), vec.end());
+    } else if (ins.GetType()[0] == 'u') {  // uint64
+      CheckUint64();
+      auto& vec = ins.GetUint64Data();
+      offset_.push_back(offset_.back() + vec.size());
+      uint64_feasign_.insert(uint64_feasign_.end(), vec.begin(), vec.end());
+    }
+  }
+  const std::vector<float>& GetFloatData() const { return float_feasign_; }
+  const std::vector<uint64_t>& GetUint64Data() const { return uint64_feasign_; }
+  const std::string& GetType() const { return type_; }
+
+ private:
+  void CheckType(const std::string& type) const {
+    PADDLE_ENFORCE((type == "uint64") || (type == "float"),
+                   "There is no this type<%s>.", type);
+  }
+  void CheckFloat() const {
+    PADDLE_ENFORCE(type_[0] == 'f', "Add %s value to float slot.", type_);
+  }
+  void CheckUint64() const {
+    PADDLE_ENFORCE(type_[0] == 'u', "Add %s value to uint64 slot.", type_);
+  }
+  std::vector<float> float_feasign_;
+  std::vector<uint64_t> uint64_feasign_;
+  std::string type_;
+  std::vector<size_t> offset_;
+};
+
+// This DataFeed is used to feed multi-slot type data.
+// The format of multi-slot type data:
+//   [n feasign_0 feasign_1 ... feasign_n]*
+class MultiSlotDataFeed
+    : public PrivateQueueDataFeed<std::vector<MultiSlotType>> {
+ public:
+  MultiSlotDataFeed() {}
+  virtual ~MultiSlotDataFeed() {}
+  virtual void Init(const paddle::framework::DataFeedDesc& data_feed_desc);
+  virtual bool CheckFile(const char* filename);
+
+ protected:
+  virtual void AddInstanceToInsVec(std::vector<MultiSlotType>* vec_ins,
+                                   const std::vector<MultiSlotType>& instance,
+                                   int index);
+  virtual bool ParseOneInstance(std::vector<MultiSlotType>* instance);
+  virtual void PutToFeedVec(const std::vector<MultiSlotType>& ins_vec);
+};
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/data_feed.proto b/paddle/fluid/framework/data_feed.proto
new file mode 100644
index 0000000000000000000000000000000000000000..489fec08d86ccf61ece29bbba6d0204f25530b0f
--- /dev/null
+++ b/paddle/fluid/framework/data_feed.proto
@@ -0,0 +1,30 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+syntax = "proto2";
+package paddle.framework;
+
+message Slot {
+  required string name = 1;
+  required string type = 2;
+  optional bool is_dense = 3 [ default = false ];
+  optional bool is_used = 4 [ default = false ];
+}
+
+message MultiSlotDesc { repeated Slot slots = 1; }
+
+message DataFeedDesc {
+  optional string name = 1;
+  optional int32 batch_size = 2 [ default = 32 ];
+  optional MultiSlotDesc multi_slot_desc = 3;
+}
diff --git a/paddle/fluid/framework/data_feed_factory.cc b/paddle/fluid/framework/data_feed_factory.cc
new file mode 100644
index 0000000000000000000000000000000000000000..72148b9f7d343e19d60bb2be44d8270ad78d1412
--- /dev/null
+++ b/paddle/fluid/framework/data_feed_factory.cc
@@ -0,0 +1,64 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/data_feed_factory.h"
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+#include "paddle/fluid/framework/data_feed.h"
+
+namespace paddle {
+namespace framework {
+typedef std::shared_ptr<DataFeed> (*Createdata_feedFunction)();
+typedef std::unordered_map<std::string, Createdata_feedFunction> data_feedMap;
+data_feedMap g_data_feed_map;
+
+#define REGISTER_DATAFEED_CLASS(data_feed_class)                      \
+  namespace {                                                         \
+  std::shared_ptr<DataFeed> Creator_##data_feed_class() {             \
+    return std::shared_ptr<DataFeed>(new data_feed_class);            \
+  }                                                                   \
+  class __Registerer_##data_feed_class {                              \
+   public:                                                            \
+    __Registerer_##data_feed_class() {                                \
+      g_data_feed_map[#data_feed_class] = &Creator_##data_feed_class; \
+    }                                                                 \
+  };                                                                  \
+  __Registerer_##data_feed_class g_registerer_##data_feed_class;      \
+  }  // namespace
+
+std::string DataFeedFactory::DataFeedTypeList() {
+  std::string data_feed_types;
+  for (auto iter = g_data_feed_map.begin(); iter != g_data_feed_map.end();
+       ++iter) {
+    if (iter != g_data_feed_map.begin()) {
+      data_feed_types += ", ";
+    }
+    data_feed_types += iter->first;
+  }
+  return data_feed_types;
+}
+
+std::shared_ptr<DataFeed> DataFeedFactory::CreateDataFeed(
+    std::string data_feed_class) {
+  if (g_data_feed_map.count(data_feed_class) < 1) {
+    exit(-1);
+  }
+  return g_data_feed_map[data_feed_class]();
+}
+
+REGISTER_DATAFEED_CLASS(MultiSlotDataFeed);
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/data_feed_factory.h b/paddle/fluid/framework/data_feed_factory.h
new file mode 100644
index 0000000000000000000000000000000000000000..13678edb0b8d084a0b3016d93f6e1bc32ce0169a
--- /dev/null
+++ b/paddle/fluid/framework/data_feed_factory.h
@@ -0,0 +1,29 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include "paddle/fluid/framework/data_feed.h"
+
+namespace paddle {
+namespace framework {
+class DataFeedFactory {
+ public:
+  static std::string DataFeedTypeList();
+  static std::shared_ptr<DataFeed> CreateDataFeed(std::string data_feed_class);
+};
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/data_feed_test.cc b/paddle/fluid/framework/data_feed_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b3e969871592394a7ac2fdeab8495677e7bba070
--- /dev/null
+++ b/paddle/fluid/framework/data_feed_test.cc
@@ -0,0 +1,330 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/data_feed.h"
+#include <fcntl.h>
+#include <chrono>  // NOLINT
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <mutex>  // NOLINT
+#include <set>
+#include <thread>  // NOLINT
+#include <utility>
+#include <vector>
+#include "google/protobuf/io/zero_copy_stream_impl.h"
+#include "google/protobuf/text_format.h"
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/data_feed_factory.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+
+paddle::framework::DataFeedDesc load_datafeed_param_from_file(
+    const char* filename) {
+  paddle::framework::DataFeedDesc data_feed_desc;
+  int file_descriptor = open(filename, O_RDONLY);
+  PADDLE_ENFORCE(file_descriptor != -1, "Can not open %s.", filename);
+  google::protobuf::io::FileInputStream fileInput(file_descriptor);
+  google::protobuf::TextFormat::Parse(&fileInput, &data_feed_desc);
+  close(file_descriptor);
+  return data_feed_desc;
+}
+
+const std::vector<std::string> load_filelist_from_file(const char* filename) {
+  std::vector<std::string> filelist;
+  std::ifstream fin(filename);
+  PADDLE_ENFORCE(fin.good(), "Can not open %s.", filename);
+  std::string line;
+  while (getline(fin, line)) {
+    filelist.push_back(line);
+  }
+  fin.close();
+  return filelist;
+}
+
+void GenerateFileForTest(const char* protofile, const char* filelist) {
+  std::ofstream w_protofile(protofile);
+  w_protofile << "name: \"MultiSlotDataFeed\"\n"
+                 "batch_size: 2\n"
+                 "multi_slot_desc {\n"
+                 "    slots {\n"
+                 "        name: \"uint64_sparse_slot\"\n"
+                 "        type: \"uint64\"\n"
+                 "        is_dense: false\n"
+                 "        is_used: true\n"
+                 "    }\n"
+                 "    slots {\n"
+                 "        name: \"float_sparse_slot\"\n"
+                 "        type: \"float\"\n"
+                 "        is_dense: false\n"
+                 "        is_used: true\n"
+                 "    }\n"
+                 "    slots {\n"
+                 "        name: \"uint64_dense_slot\"\n"
+                 "        type: \"uint64\"\n"
+                 "        is_dense: true\n"
+                 "        is_used: true\n"
+                 "    }\n"
+                 "    slots {\n"
+                 "        name: \"float_dense_slot\"\n"
+                 "        type: \"float\"\n"
+                 "        is_dense: true\n"
+                 "        is_used: true\n"
+                 "    }\n"
+                 "    slots {\n"
+                 "        name: \"not_used_slot\"\n"
+                 "        type: \"uint64\"\n"
+                 "        is_dense: false\n"
+                 "        is_used: false\n"
+                 "    }\n"
+                 "}";
+  w_protofile.close();
+  std::ofstream w_filelist(filelist);
+  int total_file = 4;
+  for (int i = 0; i < total_file; ++i) {
+    std::string filename = "TestMultiSlotDataFeed.data." + std::to_string(i);
+    w_filelist << filename;
+    if (i + 1 != total_file) {
+      w_filelist << std::endl;
+    }
+    std::ofstream w_datafile(filename.c_str());
+    w_datafile << "3 3978 620 82 1 1926.08 1 1926 1 6.02 1 1996\n"
+                  "2 1300 2983353 1 985.211 1 8 1 0.618 1 12\n"
+                  "1 19260827 2 3.14 2.718 1 27 1 2.236 1 28\n";
+    w_datafile.close();
+  }
+  w_filelist.close();
+}
+
+class MultiTypeSet {
+ public:
+  MultiTypeSet() {
+    uint64_set_.clear();
+    float_set_.clear();
+  }
+  ~MultiTypeSet() {}
+  void AddValue(uint64_t v) { uint64_set_.insert(v); }
+  void AddValue(float v) { float_set_.insert(v); }
+  const std::set<uint64_t>& GetUint64Set() const { return uint64_set_; }
+  const std::set<float>& GetFloatSet() const { return float_set_; }
+
+ private:
+  std::set<uint64_t> uint64_set_;
+  std::set<float> float_set_;
+};
+
+void GetElemSetFromReader(std::vector<MultiTypeSet>* reader_elem_set,
+                          const paddle::framework::DataFeedDesc& data_feed_desc,
+                          const std::vector<std::string>& filelist,
+                          const int thread_num) {
+  int used_slot_num = 0;
+  for (auto i = 0; i < data_feed_desc.multi_slot_desc().slots_size(); ++i) {
+    if (data_feed_desc.multi_slot_desc().slots(i).is_used()) {
+      ++used_slot_num;
+    }
+  }
+  reader_elem_set->resize(used_slot_num);
+  std::vector<std::thread> threads;
+  std::vector<std::shared_ptr<paddle::framework::DataFeed>> readers;
+  readers.resize(thread_num);
+  for (int i = 0; i < thread_num; ++i) {
+    readers[i] = paddle::framework::DataFeedFactory::CreateDataFeed(
+        data_feed_desc.name());
+    readers[i]->Init(data_feed_desc);
+  }
+  readers[0]->SetFileList(filelist);
+  std::mutex mu;
+  for (int idx = 0; idx < thread_num; ++idx) {
+    threads.emplace_back(std::thread([&, idx] {
+      std::unique_ptr<paddle::framework::Scope> scope(
+          new paddle::framework::Scope());
+      const auto& multi_slot_desc = data_feed_desc.multi_slot_desc();
+      std::map<std::string, const paddle::framework::LoDTensor*>
+          lodtensor_targets;
+      for (int i = 0; i < multi_slot_desc.slots_size(); ++i) {
+        const auto& slot = multi_slot_desc.slots(i);
+        if (slot.is_used()) {
+          const auto& name = slot.name();
+          readers[idx]->AddFeedVar(scope->Var(name), name);
+          lodtensor_targets[name] =
+              &scope->FindVar(name)->Get<paddle::framework::LoDTensor>();
+        }
+      }
+      readers[idx]->Start();
+      while (readers[idx]->Next()) {
+        int index = 0;
+        for (int k = 0; k < multi_slot_desc.slots_size(); ++k) {
+          const auto& slot = multi_slot_desc.slots(k);
+          if (!slot.is_used()) {
+            continue;
+          }
+          const paddle::framework::LoDTensor* tens =
+              lodtensor_targets[slot.name()];
+          if (slot.is_dense()) {  // dense branch
+            if (slot.type() == "uint64") {
+              const int64_t* data = tens->data<int64_t>();
+              int batch_size = tens->dims()[0];
+              int dim = tens->dims()[1];
+              for (int i = 0; i < batch_size; ++i) {
+                for (int j = 0; j < dim; ++j) {
+                  std::lock_guard<std::mutex> lock(mu);
+                  (*reader_elem_set)[index].AddValue(
+                      (uint64_t)data[i * dim + j]);
+                }
+              }
+            } else if (slot.type() == "float") {
+              const float* data = tens->data<float>();
+              int batch_size = tens->dims()[0];
+              int dim = tens->dims()[1];
+              for (int i = 0; i < batch_size; ++i) {
+                for (int j = 0; j < dim; ++j) {
+                  std::lock_guard<std::mutex> lock(mu);
+                  (*reader_elem_set)[index].AddValue(data[i * dim + j]);
+                }
+              }
+            } else {
+              PADDLE_THROW("Error type in proto file.");
+            }
+          } else {  // sparse branch
+            if (slot.type() == "uint64") {
+              const int64_t* data = tens->data<int64_t>();
+              for (size_t i = 0; i < tens->NumElements(); ++i) {
+                std::pair<size_t, size_t> element = tens->lod_element(0, i);
+                for (size_t j = element.first; j < element.second; ++j) {
+                  std::lock_guard<std::mutex> lock(mu);
+                  (*reader_elem_set)[index].AddValue((uint64_t)data[j]);
+                }
+              }
+            } else if (slot.type() == "float") {
+              const float* data = tens->data<float>();
+              for (size_t i = 0; i < tens->NumElements(); ++i) {
+                std::pair<size_t, size_t> element = tens->lod_element(0, i);
+                for (size_t j = element.first; j < element.second; ++j) {
+                  std::lock_guard<std::mutex> lock(mu);
+                  (*reader_elem_set)[index].AddValue(data[j]);
+                }
+              }
+            } else {
+              PADDLE_THROW("Error type in proto file.");
+            }
+          }  // end sparse branch
+          ++index;
+        }  // end slots loop
+      }    // end while Next()
+    }));   // end anonymous function
+  }
+  for (auto& th : threads) {
+    th.join();
+  }
+}
+
+void CheckIsUnorderedSame(const std::vector<MultiTypeSet>& s1,
+                          const std::vector<MultiTypeSet>& s2) {
+  EXPECT_EQ(s1.size(), s2.size());
+  for (size_t i = 0; i < s1.size(); ++i) {
+    // check for uint64
+    const std::set<uint64_t>& uint64_s1 = s1[i].GetUint64Set();
+    const std::set<uint64_t>& uint64_s2 = s2[i].GetUint64Set();
+    EXPECT_EQ(uint64_s1.size(), uint64_s2.size());
+    auto uint64_it1 = uint64_s1.begin();
+    auto uint64_it2 = uint64_s2.begin();
+    while (uint64_it1 != uint64_s1.end()) {
+      EXPECT_EQ(*uint64_it1, *uint64_it2);
+      ++uint64_it1;
+      ++uint64_it2;
+    }
+    // check for float
+    const std::set<float>& float_s1 = s1[i].GetFloatSet();
+    const std::set<float>& float_s2 = s2[i].GetFloatSet();
+    EXPECT_EQ(float_s1.size(), float_s2.size());
+    auto float_it1 = float_s1.begin();
+    auto float_it2 = float_s2.begin();
+    while (float_it1 != float_s1.end()) {
+      EXPECT_EQ(*float_it1, *float_it2);
+      ++float_it1;
+      ++float_it2;
+    }
+  }
+}
+
+void GetElemSetFromFile(std::vector<MultiTypeSet>* file_elem_set,
+                        const paddle::framework::DataFeedDesc& data_feed_desc,
+                        const std::vector<std::string>& filelist) {
+  int used_slot_num = 0;
+  for (auto i = 0; i < data_feed_desc.multi_slot_desc().slots_size(); ++i) {
+    if (data_feed_desc.multi_slot_desc().slots(i).is_used()) {
+      ++used_slot_num;
+    }
+  }
+  file_elem_set->resize(used_slot_num);
+  for (const auto& file : filelist) {
+    std::ifstream fin(file.c_str());
+    PADDLE_ENFORCE(fin.good(), "Can not open %s.", file.c_str());
+    while (1) {
+      bool end_flag = false;
+      int index = 0;
+      for (auto i = 0; i < data_feed_desc.multi_slot_desc().slots_size(); ++i) {
+        int num;
+        if (fin >> num) {
+          auto slot = data_feed_desc.multi_slot_desc().slots(i);
+          auto type = slot.type();
+          if (type == "uint64") {
+            while (num--) {
+              uint64_t feasign;
+              fin >> feasign;
+              if (slot.is_used()) {
+                (*file_elem_set)[index].AddValue(feasign);
+              }
+            }
+          } else if (type == "float") {
+            while (num--) {
+              float feasign;
+              fin >> feasign;
+              if (slot.is_used()) {
+                (*file_elem_set)[index].AddValue(feasign);
+              }
+            }
+          } else {
+            PADDLE_THROW("Error type in proto file.");
+          }
+          if (slot.is_used()) {
+            ++index;
+          }
+        } else {
+          end_flag = true;
+          break;
+        }
+      }
+      if (end_flag) {
+        break;
+      }
+    }
+    fin.close();
+  }
+}
+
+TEST(DataFeed, MultiSlotUnitTest) {
+  const char* protofile = "data_feed_desc.prototxt";
+  const char* filelist_name = "filelist.txt";
+  GenerateFileForTest(protofile, filelist_name);
+  const std::vector<std::string> filelist =
+      load_filelist_from_file(filelist_name);
+  paddle::framework::DataFeedDesc data_feed_desc =
+      load_datafeed_param_from_file(protofile);
+  std::vector<MultiTypeSet> reader_elem_set;
+  std::vector<MultiTypeSet> file_elem_set;
+  GetElemSetFromReader(&reader_elem_set, data_feed_desc, filelist, 4);
+  GetElemSetFromFile(&file_elem_set, data_feed_desc, filelist);
+  CheckIsUnorderedSame(reader_elem_set, file_elem_set);
+}
diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc
index c9e3a8ac1d1e5228725bff49ecc6d91e640dfe57..5467f6d1b23c0058f06387e3da97c4193dd5ca6c 100644
--- a/paddle/fluid/framework/data_layout_transform.cc
+++ b/paddle/fluid/framework/data_layout_transform.cc
@@ -151,19 +151,22 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
   auto out_format =
       platform::MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout));
 
-  void* in_data = GetDataFromTensor(in, in_type);
-
   // output tensor has the same dims as input. Reorder don't change dims
   out->Resize(in.dims());
 
-  auto out_data = out->mutable_data(expected_kernel_type.place_, in.type());
-
-  auto in_memory = memory({{{in_tz}, in_type, in_format}, cpu_engine}, in_data);
-  auto out_memory =
-      memory({{{out_tz}, out_type, out_format}, cpu_engine}, out_data);
+  if (in_format != out_format) {
+    void* in_data = GetDataFromTensor(in, in_type);
+    auto out_data = out->mutable_data(expected_kernel_type.place_, in.type());
 
-  platform::Reorder(in_memory, out_memory);
+    auto in_memory =
+        memory({{{in_tz}, in_type, in_format}, cpu_engine}, in_data);
+    auto out_memory =
+        memory({{{out_tz}, out_type, out_format}, cpu_engine}, out_data);
 
+    platform::Reorder(in_memory, out_memory);
+  } else {
+    out->ShareDataWith(in);
+  }
   out->set_layout(out_layout);
   // reset format since the out tensor will be feed to non-MKLDNN OPkernel
   out->set_format(memory::format::format_undef);
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index 93288936fea1fae897dc26e6d8850da612960333..a927a3afcddb52f571543462e485b682aac163ae 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -15,14 +15,26 @@ cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_ro
 if(WITH_GPU)
     nv_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
             dynload_cuda variable_visitor)
-    nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim dynload_cuda)
+    if(WITH_DISTRIBUTE)
+        nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
+            ddim dynload_cuda selected_rows_functor sendrecvop_grpc)
+    else()
+        nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
+            ddim dynload_cuda selected_rows_functor)
+    endif()
     nv_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda)
     nv_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle)
 
 else()
     cc_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
              variable_visitor)
-    cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim)
+    if(WITH_DISTRIBUTE)
+        cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
+            ddim selected_rows_functor sendrecvop_grpc)
+    else()
+        cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
+            ddim selected_rows_functor)
+    endif()
     cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
     cc_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle)
 endif()
@@ -33,10 +45,10 @@ cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base s
 
 cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper)
 
-if (WITH_GPU)
-  cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle scale_loss_grad_op_handle rpc_op_handle
-          all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass)
-endif()
+cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle)
+cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper)
+cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass)
+cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass op_graph_view reference_count_pass_helper)
 
 cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass)
 cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_helper pass)
@@ -44,10 +56,7 @@ cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_he
 cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
         scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle)
 
-set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass) 
-if (WITH_GPU)
-  list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass)
-endif()
+set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass) 
 
 cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS})
 
diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc
index a003995ae3f8e111881b4681554aa8eb17b60cc1..e8bf53e160e7382122c3c2f92a152fea058a032e 100644
--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -48,7 +48,14 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
 void AllReduceOpHandle::RunImpl() {
   platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second);
 
+// FIXME(typhoonzero): If scope0(global scope) have NCCL_ID_VAR,
+// this is a distributed or inter-process call, find a better way.
+#ifdef PADDLE_WITH_CUDA
+  if (NoDummyInputSize() == 1 &&
+      local_scopes_[0]->FindLocalVar(NCCL_ID_VARNAME) == nullptr) {
+#else
   if (NoDummyInputSize() == 1) {
+#endif
     return;  // No need to all reduce when GPU count = 1;
   } else {
     // Wait input done
diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index 523f9eadf2d7e2e08504c5920372fb7cdb0d7aba..d8526b3f2492992c5c0f6f5e0a85cffca7398700 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -58,10 +58,23 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
       }
     }
 
+    CollectiveContext *context = CollectiveContext::GetInstance();
+    context->endpoints_ = strategy_.trainers_endpoints_;
+    context->trainer_id_ = strategy_.trainer_id_;
+    PADDLE_ENFORCE(strategy_.trainer_id_ >= 0, "trainer_id_ >= 0");
+    if (strategy_.trainer_id_ > 0) {
+      PADDLE_ENFORCE((unsigned)(strategy_.trainer_id_) <
+                         strategy_.trainers_endpoints_.size(),
+                     "trainer_id_ < endpoints_ size");
+    }
+    VLOG(1) << "CollectiveContext:" << context->String();
+
     // Convert graph to run on multi-devices.
     auto multi_devices_pass = AppendPass("multi_devices_pass");
     multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy",
                                                          &strategy_);
+    multi_devices_pass->Set<int>("num_trainers",
+                                 new int(strategy_.num_trainers_));
 
     // Add a graph print pass to record a graph with device info.
     if (!strategy_.debug_graphviz_path_.empty()) {
@@ -133,16 +146,16 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
       pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
 #endif
     } else if (pass->Type() == "sequential_execution_pass") {
-      VLOG(1) << "set enable_sequential_execution:"
-              << enable_sequential_execution_;
+      LOG(INFO) << "set enable_sequential_execution:"
+                << enable_sequential_execution_;
 
       pass->Erase(kAllOpDescs);
       pass->Set<const std::vector<OpDesc *>>(
           kAllOpDescs,
           new std::vector<OpDesc *>(main_program.Block(0).AllOps()));
     } else if (pass->Type() == "all_reduce_deps_pass") {
-      VLOG(1) << "SeqOnlyAllReduceOps:" << SeqOnlyAllReduceOps(*this)
-              << ", num_trainers:" << num_trainers_;
+      LOG(INFO) << "SeqOnlyAllReduceOps:" << SeqOnlyAllReduceOps(*this)
+                << ", num_trainers:" << num_trainers_;
 
       pass->Erase(kAllOpDescs);
       pass->Set<const std::vector<OpDesc *>>(
diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h
index 9f0a25912886cea7a1f287125cfe8612e4b336eb..c97be169575f578dfd18a6290230d1b3f3bd7596 100644
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -74,6 +74,8 @@ struct BuildStrategy {
   bool fuse_broadcast_op_{false};
 
   int num_trainers_{1};
+  int trainer_id_{0};
+  std::vector<std::string> trainers_endpoints_;
   bool remove_unnecessary_lock_{false};
 
   // NOTE:
diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc
index 7ad1e40c600c6e70cea822fac777ff20163078e6..7beb8c8de9fc49aebc66ca44de8736240aabbc30 100644
--- a/paddle/fluid/framework/details/computation_op_handle.cc
+++ b/paddle/fluid/framework/details/computation_op_handle.cc
@@ -20,11 +20,13 @@ namespace paddle {
 namespace framework {
 namespace details {
 ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope,
-                                         platform::Place place)
+                                         platform::Place place,
+                                         size_t scope_idx)
     : OpHandleBase(node),
       op_(framework::OpRegistry::CreateOp(*node->Op())),
       scope_(scope),
-      place_(place) {}
+      place_(place),
+      scope_idx_(scope_idx) {}
 
 void ComputationOpHandle::RunImpl() {
   WaitInputVarGenerated(place_);
diff --git a/paddle/fluid/framework/details/computation_op_handle.h b/paddle/fluid/framework/details/computation_op_handle.h
index 662a91d6b4dfcfed563fdf2e46c22f83f90b40af..601ae4f8c6de11b0bf25d4f9a92ef8eada67be3d 100644
--- a/paddle/fluid/framework/details/computation_op_handle.h
+++ b/paddle/fluid/framework/details/computation_op_handle.h
@@ -28,7 +28,8 @@ namespace framework {
 namespace details {
 struct ComputationOpHandle : public OpHandleBase {
  public:
-  ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place);
+  ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place,
+                      size_t scope_idx);
 
   std::string Name() const override;
 
@@ -38,6 +39,8 @@ struct ComputationOpHandle : public OpHandleBase {
 
   void SetLockAndRecordEventFree(bool b) { is_lock_and_record_event_free_ = b; }
 
+  size_t GetScopeIdx() const { return scope_idx_; }
+
  protected:
   void RunImpl() override;
 
@@ -47,6 +50,7 @@ struct ComputationOpHandle : public OpHandleBase {
   std::unique_ptr<OperatorBase> op_;
   Scope *scope_;
   platform::Place place_;
+  size_t scope_idx_;
   bool is_lock_and_record_event_free_{false};
 };
 }  // namespace details
diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc
new file mode 100644
index 0000000000000000000000000000000000000000..abacb11e3b018308c20a67630e3ff34cca7d3387
--- /dev/null
+++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc
@@ -0,0 +1,122 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
+#include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/cuda_device_guard.h"
+#endif
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+EagerDeletionOpHandle::EagerDeletionOpHandle(
+    ir::Node *node, const Scope *scope, const platform::Place &place,
+    const std::unordered_set<std::string> &var_names, GarbageCollector *gc,
+    AtomicReferenceCountMap *ref_cnts)
+    : OpHandleBase(node),
+      scope_(scope),
+      var_names_(var_names),
+      gc_(gc),
+      ref_cnts_(ref_cnts) {
+#ifdef PADDLE_WITH_CUDA
+  if (platform::is_gpu_place(place)) {
+    dev_ctx_ = reinterpret_cast<platform::CUDADeviceContext *>(
+        platform::DeviceContextPool::Instance().Get(place));
+    if (dynamic_cast<StreamGarbageCollector *>(gc_)) {
+      platform::CUDADeviceGuard guard(
+          boost::get<platform::CUDAPlace>(place).device);
+      PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
+      PADDLE_ENFORCE_NOT_NULL(event_);
+    }
+  }
+#endif
+}
+
+EagerDeletionOpHandle::~EagerDeletionOpHandle() {
+#ifdef PADDLE_WITH_CUDA
+  if (event_) {
+    auto gpu_place = boost::get<platform::CUDAPlace>(dev_ctx_->GetPlace());
+    platform::CUDADeviceGuard guard(gpu_place.device);
+    PADDLE_ENFORCE(cudaEventDestroy(event_));
+  }
+#endif
+}
+
+std::string EagerDeletionOpHandle::Name() const { return "eager_deletion"; }
+
+void EagerDeletionOpHandle::RunImpl() {
+  auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
+  std::deque<std::shared_ptr<memory::Allocation>> garbages;
+  for (auto &name : var_names_) {
+    auto it = ref_cnts_->find(name);
+    // Var not found, not reference count has not decreased to 0
+    if (it == ref_cnts_->end() || it->second.fetch_sub(1) != 1) {
+      continue;
+    }
+
+    auto *var = exec_scope->FindVar(name);
+    if (var == nullptr) {
+      continue;
+    }
+
+    VLOG(2) << "Erase variable " << name;
+
+    if (var->IsType<LoDTensor>()) {
+      garbages.emplace_back(var->GetMutable<LoDTensor>()->MoveMemoryHolder());
+    } else if (var->IsType<SelectedRows>()) {
+      garbages.emplace_back(
+          var->GetMutable<SelectedRows>()->mutable_value()->MoveMemoryHolder());
+    } else if (var->IsType<LoDTensorArray>()) {
+      auto *tensor_arr = var->GetMutable<LoDTensorArray>();
+      for (auto &t : *tensor_arr) {
+        garbages.emplace_back(t.MoveMemoryHolder());
+      }
+    } else {
+      PADDLE_THROW("Type %s of %s is not supported eager deletion",
+                   var->Type().name(), name);
+    }
+  }
+
+  if (!garbages.empty()) {
+    ClearGarbages(&garbages);
+  }
+}
+
+void EagerDeletionOpHandle::ClearGarbages(
+    std::deque<std::shared_ptr<memory::Allocation>> *garbages) {
+#ifdef PADDLE_WITH_CUDA
+  if (event_) {
+    auto compute_stream = dev_ctx_->stream();
+    auto callback_stream =
+        reinterpret_cast<StreamGarbageCollector *>(gc_)->stream();
+    auto callback_func = [=]() {
+      PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream));
+      PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0));
+    };
+    gc_->Add(std::move(*garbages), callback_func);
+  } else {
+#endif
+    gc_->Add(std::move(*garbages));
+#ifdef PADDLE_WITH_CUDA
+  }
+#endif
+}
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.h b/paddle/fluid/framework/details/eager_deletion_op_handle.h
new file mode 100644
index 0000000000000000000000000000000000000000..64867afad5b70a2ba31e5cb315daffcf433b5935
--- /dev/null
+++ b/paddle/fluid/framework/details/eager_deletion_op_handle.h
@@ -0,0 +1,58 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <deque>
+#include <string>
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/details/reference_count_pass_helper.h"
+
+namespace paddle {
+namespace framework {
+class Scope;
+
+namespace details {
+
+class EagerDeletionOpHandle : public OpHandleBase {
+ public:
+  EagerDeletionOpHandle(ir::Node *node, const Scope *scope,
+                        const platform::Place &place,
+                        const std::unordered_set<std::string> &var_names,
+                        GarbageCollector *gc,
+                        AtomicReferenceCountMap *ref_cnts);
+
+  ~EagerDeletionOpHandle();
+
+  std::string Name() const override;
+
+ protected:
+  void RunImpl() override;
+
+ private:
+  void ClearGarbages(std::deque<std::shared_ptr<memory::Allocation>> *garbages);
+
+  const Scope *scope_;
+  std::unordered_set<std::string> var_names_;
+  GarbageCollector *gc_;               // not own
+  AtomicReferenceCountMap *ref_cnts_;  // not own
+#ifdef PADDLE_WITH_CUDA
+  platform::CUDADeviceContext *dev_ctx_{nullptr};
+  cudaEvent_t event_{nullptr};
+#endif
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/eager_deletion_pass.cc b/paddle/fluid/framework/details/eager_deletion_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4e42d0b4972d567dd769cad6ff8b9d45380ab77a
--- /dev/null
+++ b/paddle/fluid/framework/details/eager_deletion_pass.cc
@@ -0,0 +1,101 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <queue>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/details/computation_op_handle.h"
+#include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
+#include "paddle/fluid/framework/details/eager_deletion_pass.h"
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  auto &ref_cnts =
+      Get<std::vector<AtomicReferenceCountMap>>(kRuntimeReferenceCount);
+  PADDLE_ENFORCE(ref_cnts.empty(),
+                 "kRuntimeReferenceCount should be initialized here!");
+
+  const auto &vars = graph->Get<GraphVars>(kGraphVars);
+  ref_cnts.resize(vars.size());
+
+  const auto &last_live_ops =
+      Get<std::vector<LastLiveOpsOfVars>>(kLastLiveOpsOfVars);
+  const auto &gcs = Get<GarbageCollectorMap>(kGarbageCollector);
+  const auto &places = Get<std::vector<platform::Place>>(kAllPlaces);
+
+  // a reverse map of last_live_ops
+  //   i.e., last op --> variable names which can be deleted.
+  std::unordered_map<ComputationOpHandle *, std::unordered_set<std::string>>
+      op_vars_map;
+
+  for (auto &var_ops_map : last_live_ops) {
+    for (auto &var_ops_pair : var_ops_map) {
+      const std::string &var_name = var_ops_pair.first;
+      for (auto *op : var_ops_pair.second) {
+        op_vars_map[op].insert(var_name);
+      }
+    }
+  }
+
+  for (auto &pair : op_vars_map) {
+    auto *op = pair.first;
+    auto &var_names = pair.second;
+
+    auto *eager_deletion_node =
+        graph->CreateEmptyNode("eager_deletion", ir::Node::Type::kOperation);
+    auto *eager_deletion_op = new EagerDeletionOpHandle(
+        eager_deletion_node, op->GetScope(), op->GetPlace(), var_names,
+        gcs.at(places[op->GetScopeIdx()]).get(),
+        &(ref_cnts[op->GetScopeIdx()]));
+
+    auto it = std::find_if(
+        op->Outputs().begin(), op->Outputs().end(), [](VarHandleBase *var) {
+          return dynamic_cast<DummyVarHandle *>(var) != nullptr;
+        });
+
+    if (it != op->Outputs().end()) {
+      eager_deletion_op->AddInput(*it);
+    } else {
+      auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar());
+      graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
+      op->AddOutput(dep_var);
+      eager_deletion_op->AddInput(dep_var);
+    }
+
+    auto *dummy_leaf = new DummyVarHandle(graph->CreateControlDepVar());
+    graph->Get<GraphDepVars>(kGraphDepVars).emplace(dummy_leaf);
+    eager_deletion_op->AddOutput(dummy_leaf);
+  }
+
+  VLOG(10) << "Create " << op_vars_map.size() << " EagerDeletionOpHandle(s)";
+  return graph;
+}
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(eager_deletion_pass,
+              paddle::framework::details::EagerDeletionPass)
+    .RequirePassAttr(paddle::framework::details::kRuntimeReferenceCount)
+    .RequirePassAttr(paddle::framework::details::kLastLiveOpsOfVars)
+    .RequirePassAttr(paddle::framework::details::kAllPlaces)
+    .RequirePassAttr(paddle::framework::details::kGarbageCollector);
diff --git a/paddle/fluid/framework/details/eager_deletion_pass.h b/paddle/fluid/framework/details/eager_deletion_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..d7a7a9709d970841060778806451bc21cb2c7571
--- /dev/null
+++ b/paddle/fluid/framework/details/eager_deletion_pass.h
@@ -0,0 +1,32 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+class EagerDeletionPass : public ir::Pass {
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
index a36ad259265e01121f8fc0060058ed55406c9f97..8af1d62dea89343ff2d41dd7c6ac837459df7685 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -133,6 +133,7 @@ static const char kPlaces[] = "places";
 static const char kParams[] = "params";
 static const char kLocalScopes[] = "local_scopes";
 static const char kStrategy[] = "strategy";
+static const char kNumTrainers[] = "num_trainers";
 
 void MultiDevSSAGraphBuilder::Init() const {
   all_vars_.clear();
@@ -299,6 +300,8 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
   auto nodes = graph->ReleaseNodes();
   ir::Graph &result = *graph;
 
+  int num_trainers = Get<int>(kNumTrainers);
+
   for (auto &node : nodes) {
     if (node->IsVar() && node->Var()) {
       all_vars_.emplace(node->Name(), node->Var());
@@ -383,7 +386,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
           CreateComputationalOps(&result, node, places_.size());
         }
 
-        if (!is_forwarding && places_.size() > 1) {
+        if (!is_forwarding && (places_.size() > 1 || num_trainers > 1)) {
           // Currently, we assume that once gradient is generated, it can be
           // broadcast, and each gradient is only broadcast once.
           if (static_cast<bool>(boost::get<int>(node->Op()->GetAttr(
@@ -562,7 +565,7 @@ void MultiDevSSAGraphBuilder::CreateComputationalOp(ir::Graph *result,
                                                     int dev_id) const {
   result->Get<GraphOps>(kGraphOps).emplace_back(
       new ComputationOpHandle(result->CreateOpNode(node->Op()),
-                              local_scopes_[dev_id], places_[dev_id]));
+                              local_scopes_[dev_id], places_[dev_id], dev_id));
   CreateOpHandleIOs(result, node, dev_id);
 }
 
@@ -685,8 +688,8 @@ void MultiDevSSAGraphBuilder::CreateComputationalOps(ir::Graph *result,
   for (size_t scope_idx = 0; scope_idx < num_places; ++scope_idx) {
     auto p = places_[scope_idx];
     auto s = local_scopes_[scope_idx];
-    result->Get<GraphOps>(kGraphOps).emplace_back(
-        new ComputationOpHandle(result->CreateOpNode(node->Op()), s, p));
+    result->Get<GraphOps>(kGraphOps).emplace_back(new ComputationOpHandle(
+        result->CreateOpNode(node->Op()), s, p, scope_idx));
     CreateOpHandleIOs(result, node, scope_idx);
   }
 }
@@ -862,7 +865,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp(
       if (node->Op()->Type() == "fetch_barrier") {
         outvar_dev_id =
             GetVarDeviceID(*result, output->Name(), *sharded_var_device);
-        PADDLE_ENFORCE_NE(outvar_dev_id, -1);
+        PADDLE_ENFORCE_NE(outvar_dev_id, -1, "output name %s", output->Name());
       }
       p = places_[outvar_dev_id];
       ir::Node *new_node = nullptr;
@@ -895,4 +898,5 @@ REGISTER_PASS(multi_devices_pass,
     .RequirePassAttr(paddle::framework::details::kPlaces)
     .RequirePassAttr(paddle::framework::details::kParams)
     .RequirePassAttr(paddle::framework::details::kLocalScopes)
-    .RequirePassAttr(paddle::framework::details::kStrategy);
+    .RequirePassAttr(paddle::framework::details::kStrategy)
+    .RequirePassAttr(paddle::framework::details::kNumTrainers);
diff --git a/paddle/fluid/framework/details/op_graph_view.cc b/paddle/fluid/framework/details/op_graph_view.cc
index 4838c4198ff35ba3fb562f3a7c0563ee60179e3b..d3865c2c2919c2d43521e4f51013e5fa1b10416d 100644
--- a/paddle/fluid/framework/details/op_graph_view.cc
+++ b/paddle/fluid/framework/details/op_graph_view.cc
@@ -23,6 +23,8 @@ namespace details {
 OpGraphView::OpGraphView(const std::vector<OpHandleBase *> &ops) { Build(ops); }
 
 void OpGraphView::Build(const std::vector<OpHandleBase *> &ops) {
+  preceding_ops_.clear();
+  pending_ops_.clear();
   for (auto &op : ops) {
     preceding_ops_[op];
     pending_ops_[op];
@@ -40,6 +42,7 @@ void OpGraphView::Build(const std::vector<OpHandleBase *> &ops) {
 
 std::unordered_set<OpHandleBase *> OpGraphView::AllOps() const {
   std::unordered_set<OpHandleBase *> ret;
+  ret.reserve(preceding_ops_.size());
   for (auto &pair : preceding_ops_) {
     ret.insert(pair.first);
   }
diff --git a/paddle/fluid/framework/details/op_graph_view.h b/paddle/fluid/framework/details/op_graph_view.h
index afb3e8e59461eeba10d7027fc70b89cc170c1805..77aa02eba56acb3bb20a5c5a55c75af78a3c1c81 100644
--- a/paddle/fluid/framework/details/op_graph_view.h
+++ b/paddle/fluid/framework/details/op_graph_view.h
@@ -14,7 +14,7 @@
 
 #pragma once
 
-#include <memory>
+#include <queue>
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
@@ -34,6 +34,11 @@ class OpGraphView {
 
   bool HasOp(OpHandleBase *op) const;
 
+  // Use a visitor to visit all pending ops of op
+  // Stop when callback returns false
+  template <typename Callback>
+  bool VisitAllPendingOps(OpHandleBase *op, Callback &&callback) const;
+
  private:
   void Build(const std::vector<OpHandleBase *> &ops);
   void EnforceHasOp(OpHandleBase *op) const;
@@ -44,6 +49,28 @@ class OpGraphView {
       pending_ops_;
 };
 
+template <typename Callback>
+bool OpGraphView::VisitAllPendingOps(OpHandleBase *op,
+                                     Callback &&callback) const {
+  EnforceHasOp(op);
+  std::unordered_set<OpHandleBase *> visited;
+  std::queue<OpHandleBase *> q;
+  q.push(op);
+  do {
+    op = q.front();
+    q.pop();
+    for (auto &pending_op : pending_ops_.at(op)) {
+      if (visited.count(pending_op) == 0) {
+        visited.insert(pending_op);
+        if (!callback(pending_op)) {
+          return false;
+        }
+      }
+    }
+  } while (!q.empty());
+  return true;
+}
+
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/details/reduce_and_gather.h b/paddle/fluid/framework/details/reduce_and_gather.h
index bd6153c0c736f6e32378eebcbf6c4d7e402c9b42..2e5256fbd49a3f8c72840cd55dada4301cb04eb9 100644
--- a/paddle/fluid/framework/details/reduce_and_gather.h
+++ b/paddle/fluid/framework/details/reduce_and_gather.h
@@ -53,7 +53,7 @@ struct ReduceLoDTensor {
   }
 };
 
-inline void GatherSelectedRows(
+inline void GatherLocalSelectedRows(
     const std::vector<const SelectedRows *> &src_selecte_rows_,
     const std::vector<platform::Place> &in_places,
     const std::map<platform::Place, platform::DeviceContext *> &dev_ctxes,
diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc
index c9f1107aeab5a21d46e828308cfcb2dde827cba6..cb864848b938e249ecd9d09e2a02f683959ce413 100644
--- a/paddle/fluid/framework/details/reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle.cc
@@ -16,6 +16,12 @@
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/reduce_and_gather.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
+#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE
+#include "paddle/fluid/operators/distributed/collective_client.h"
+#include "paddle/fluid/operators/distributed/collective_server.h"
+#include "paddle/fluid/operators/distributed/request_handler.h"
+#endif
+#include "paddle/fluid/operators/math/selected_rows_functor.h"
 #include "paddle/fluid/platform/profiler.h"
 
 DEFINE_bool(
@@ -26,6 +32,112 @@ namespace paddle {
 namespace framework {
 namespace details {
 
+std::once_flag CollectiveContext::init_flag_;
+std::unique_ptr<CollectiveContext> CollectiveContext::context_;
+
+static inline std::string GetRemoteVarName(const std::string &var_name,
+                                           int trainer_id) {
+  return string::Sprintf("%s_merged_tmp@trainer_%d", var_name, trainer_id);
+}
+
+void ReduceOpHandle::Wait(
+    const std::map<platform::Place, platform::DeviceContext *> &dev_ctxes) {
+  // TODO(gongwb): use event wait?
+  for (auto &dev_ctx : dev_ctxes) {
+    dev_ctx.second->Wait();
+  }
+}
+
+#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE
+template <typename DevCtx, typename DataType>
+void ReduceOpHandle::GatherSelectedRows(
+    const std::vector<const SelectedRows *> &src_selected_rows,
+    const std::vector<platform::Place> &in_places,
+    const std::map<platform::Place, platform::DeviceContext *> &dev_ctxes,
+    VarHandle *out_var_handle, const platform::Place &out_place,
+    SelectedRows *dst_selected_rows) {
+  const CollectiveContext &collective_context =
+      *CollectiveContext::GetInstance();
+
+  // 1. gather local selected rows, merge them
+  std::string gathered_var_name = out_var_handle->name_ + "_gathered_tmp";
+  auto scope = local_scopes_.at(out_var_handle->scope_idx_);
+  auto gathered_var_mid = scope->Var(gathered_var_name);
+  auto gathered_select_rows =
+      gathered_var_mid->GetMutable<framework::SelectedRows>();
+  GatherLocalSelectedRows(src_selected_rows, in_places, dev_ctxes, out_place,
+                          gathered_select_rows);
+  // FIXME(gongwb): remove this Wait.
+  Wait(dev_ctxes);
+
+  // merge them
+  auto merged_dev_ctx = dynamic_cast<DevCtx *>(dev_ctxes.at(out_place));
+  std::string merged_var_name =
+      GetRemoteVarName(out_var_handle->name_, collective_context.trainer_id_);
+  auto merged_select_rows =
+      scope->Var(merged_var_name)->GetMutable<SelectedRows>();
+  operators::math::scatter::MergeAdd<DevCtx, DataType> merge_func;
+  merge_func(*merged_dev_ctx, *gathered_select_rows, merged_select_rows);
+
+  // 2. start collective server if it doesn't exist
+  operators::distributed::CollectiveServer *server =
+      operators::distributed::CollectiveServer::GetInstance(
+          collective_context.endpoints_[collective_context.trainer_id_],
+          collective_context.endpoints_.size() - 1);
+
+  auto rpc_server = server->GetRPCServer();
+  rpc_server->RegisterVar(merged_var_name,
+                          operators::distributed::kRequestGetMonomerVariable,
+                          scope, merged_dev_ctx);
+
+  // 3. gather them from all remote nodes.
+  std::vector<const SelectedRows *> remote;
+  operators::distributed::CollectiveClient *client =
+      operators::distributed::CollectiveClient::GetInstance();
+
+  std::vector<operators::distributed::RemoteVar> vars;
+  for (unsigned int i = 0; i < collective_context.endpoints_.size(); i++) {
+    if (i == (unsigned)collective_context.trainer_id_) continue;
+
+    operators::distributed::RemoteVar var;
+    var.trainer_id_ = i;
+    var.var_name_ = GetRemoteVarName(out_var_handle->name_, i);
+    var.ep_ = collective_context.endpoints_[i];
+
+    vars.push_back(var);
+    VLOG(4) << "gather from:" << var.String();
+  }
+
+  // erase gathered vars
+  merged_dev_ctx->Wait();
+  scope->EraseVars(std::vector<std::string>{gathered_var_name});
+
+  PADDLE_ENFORCE(client->Gather(vars, &remote, *merged_dev_ctx, scope));
+  PADDLE_ENFORCE(remote.size() == vars.size());
+
+  // 4. merged local selected rows.
+  std::vector<const SelectedRows *> all;
+  all.resize(collective_context.endpoints_.size());
+  for (auto v : vars) {
+    all[v.trainer_id_] =
+        scope->FindVar(v.var_name_)->GetMutable<SelectedRows>();
+  }
+  all[collective_context.trainer_id_] = merged_select_rows;
+
+  merge_func(*merged_dev_ctx, all, dst_selected_rows);
+
+  rpc_server->WaitVarBarrier(merged_var_name);
+  rpc_server->ClearVar(merged_var_name);
+
+  // 5. clear mid vars
+  std::vector<std::string> tmp_vars{merged_var_name};
+  for (auto r : vars) {
+    tmp_vars.push_back(r.var_name_);
+  }
+  scope->EraseVars(tmp_vars);
+}
+#endif
+
 void ReduceOpHandle::RunImpl() {
   platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second);
 
@@ -90,8 +202,36 @@ void ReduceOpHandle::RunImpl() {
     this->RunAndRecordEvent([&] {
       std::vector<const SelectedRows *> in_selected_rows =
           GetInputValues<SelectedRows>(in_var_handles, var_scopes);
-      GatherSelectedRows(in_selected_rows, in_places, dev_ctxes_, t_out_p,
-                         out_var->GetMutable<framework::SelectedRows>());
+
+      const CollectiveContext &collective_context =
+          *CollectiveContext::GetInstance();
+      VLOG(10) << "GatherSelectedRows CollectiveContext:"
+               << collective_context.String();
+
+      // TODO(gongwb): add cpu support
+      if (collective_context.endpoints_.size() <= 1 ||
+          is_cpu_place(in_places[0]) || is_cpu_place(t_out_p)) {
+        GatherLocalSelectedRows(in_selected_rows, in_places, dev_ctxes_,
+                                t_out_p,
+                                out_var->GetMutable<framework::SelectedRows>());
+        return;
+      }
+
+#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE
+      if (framework::IsType<const float>(in_selected_rows[0]->value().type())) {
+        GatherSelectedRows<platform::CUDADeviceContext, float>(
+            in_selected_rows, in_places, dev_ctxes_, out_var_handle, t_out_p,
+            out_var->GetMutable<framework::SelectedRows>());
+      } else if (framework::IsType<const double>(
+                     in_selected_rows[0]->value().type())) {
+        GatherSelectedRows<platform::CUDADeviceContext, double>(
+            in_selected_rows, in_places, dev_ctxes_, out_var_handle, t_out_p,
+            out_var->GetMutable<framework::SelectedRows>());
+      } else {
+        PADDLE_ENFORCE(false,
+                       "only support double or float when gahter SelectedRows");
+      }
+#endif
     });
   } else {
     std::vector<const LoDTensor *> lod_tensors =
diff --git a/paddle/fluid/framework/details/reduce_op_handle.h b/paddle/fluid/framework/details/reduce_op_handle.h
index 846839029ca65be1bdeac2f6ea497db07a01b6cf..5491f00f45e9d48c5eb7455396ac51801f2c40ab 100644
--- a/paddle/fluid/framework/details/reduce_op_handle.h
+++ b/paddle/fluid/framework/details/reduce_op_handle.h
@@ -30,6 +30,32 @@
 namespace paddle {
 namespace framework {
 namespace details {
+struct CollectiveContext {
+  std::vector<std::string> endpoints_;
+  int trainer_id_{0};
+
+  std::string String() const {
+    std::stringstream ss;
+    ss << "endpoints_:";
+    for (auto e : endpoints_) {
+      ss << e << ",";
+    }
+
+    ss << "trainer_id_:" << trainer_id_;
+
+    return ss.str();
+  }
+
+  static CollectiveContext *GetInstance() {
+    std::call_once(init_flag_,
+                   [&]() { context_.reset(new CollectiveContext()); });
+    return context_.get();
+  }
+
+ private:
+  static std::once_flag init_flag_;
+  static std::unique_ptr<CollectiveContext> context_;
+};
 
 struct ReduceOpHandle : public OpHandleBase {
   std::vector<Scope *> local_scopes_;
@@ -64,6 +90,19 @@ struct ReduceOpHandle : public OpHandleBase {
  protected:
   void RunImpl() override;
 
+#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE
+  template <typename DevCtx, typename DataType>
+  void GatherSelectedRows(
+      const std::vector<const SelectedRows *> &src_selecte_rows_,
+      const std::vector<platform::Place> &in_places,
+      const std::map<platform::Place, platform::DeviceContext *> &dev_ctxes,
+      VarHandle *out_var_handle, const platform::Place &out_place,
+      SelectedRows *dst_selecte_rows);
+#endif
+
+  void Wait(
+      const std::map<platform::Place, platform::DeviceContext *> &dev_ctxes);
+
   template <typename T>
   std::vector<const T *> GetInputValues(
       const std::vector<VarHandle *> &in_var_handles,
diff --git a/paddle/fluid/framework/details/reference_count_op_handle.h b/paddle/fluid/framework/details/reference_count_op_handle.h
deleted file mode 100644
index cc4ccfbdfc720284e683a8f3f59a4aa57a3a9eb1..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/reference_count_op_handle.h
+++ /dev/null
@@ -1,138 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <atomic>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "paddle/fluid/framework/details/op_handle_base.h"
-#include "paddle/fluid/framework/garbage_collector.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/framework/tensor.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-using ReferenceCountMap = std::unordered_map<std::string, int>;
-using AtomicReferenceCountMap =
-    std::unordered_map<std::string, std::atomic<int>>;
-using DeviceReferenceCountMap =
-    std::unordered_map<int, std::unique_ptr<ReferenceCountMap>>;
-using AtomicDeviceReferenceCountMap =
-    std::unordered_map<int, std::unique_ptr<AtomicReferenceCountMap>>;
-using DeviceGarbageCollectorMap =
-    std::unordered_map<int,
-                       std::unique_ptr<GarbageCollector<framework::Tensor>>>;
-
-class ReferenceCountOpHandle : public OpHandleBase {
- public:
-  ReferenceCountOpHandle(ir::Node *node, const Scope *scope,
-                         const platform::CUDAPlace &place,
-                         const std::vector<std::string> &var_names,
-                         GarbageCollector<Tensor> *gc,
-                         AtomicReferenceCountMap *ref_cnts)
-      : OpHandleBase(node), scope_(scope), gc_(gc), ref_cnts_(ref_cnts) {
-    dev_ctx_ = static_cast<platform::CUDADeviceContext *>(
-        platform::DeviceContextPool::Instance().Get(place));
-    if (IsStreamGarabageCollector()) {
-      platform::SetDeviceId(place.device);
-      PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
-    }
-
-    for (auto &name : var_names) AddVar(name);
-  }
-
-  ~ReferenceCountOpHandle() {
-    if (IsStreamGarabageCollector()) {
-      auto gpu_place = boost::get<platform::CUDAPlace>(dev_ctx_->GetPlace());
-      platform::SetDeviceId(gpu_place.device);
-      PADDLE_ENFORCE(cudaEventDestroy(event_));
-    }
-  }
-
-  std::string Name() const override { return "reference_count"; }
-
-  void AddVar(const std::string &name) {
-    auto it = var_names_.find(name);
-    if (it != var_names_.end())
-      ++(it->second);
-    else
-      var_names_[name] = 1;
-  }
-
- protected:
-  void RunImpl() override {
-    auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
-    std::vector<Tensor *> tensors;
-    for (auto &pair : var_names_) {
-      auto &name = pair.first;
-      auto it = ref_cnts_->find(name);
-      if (it == ref_cnts_->end()) continue;
-
-      auto *var = exec_scope->FindVar(name);
-      if (var == nullptr) continue;
-
-      if (var->IsType<LoDTensor>()) {
-        if (it->second.fetch_sub(pair.second) <= pair.second) {
-          tensors.emplace_back(var->GetMutable<LoDTensor>());
-        }
-      } else if (var->IsType<SelectedRows>()) {
-        if (it->second.fetch_sub(pair.second) <= pair.second) {
-          tensors.emplace_back(
-              var->GetMutable<SelectedRows>()->mutable_value());
-        }
-      }
-    }
-
-    if (!tensors.empty()) {
-      ClearTensors(tensors);
-    }
-  }
-
- private:
-  void ClearTensors(const std::vector<Tensor *> &tensors) {
-    auto *gc = dynamic_cast<StreamGarbageCollector<Tensor> *>(gc_);
-    if (gc != nullptr) {
-      auto compute_stream = dev_ctx_->stream();
-      auto callback_stream = gc->stream();
-      auto callback_func = [=]() {
-        PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream));
-        PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0));
-      };
-      gc_->Add(tensors, callback_func);
-    } else {
-      gc_->Add(tensors);
-    }
-  }
-
-  bool IsStreamGarabageCollector() const {
-    return dynamic_cast<const StreamGarbageCollector<Tensor> *>(gc_) != nullptr;
-  }
-
-  const Scope *scope_;
-  platform::CUDADeviceContext *dev_ctx_;
-  std::unordered_map<std::string, int> var_names_;
-  GarbageCollector<Tensor> *gc_;       // not own
-  AtomicReferenceCountMap *ref_cnts_;  // not own
-  cudaEvent_t event_;
-};
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc
index 08783fb5f8b18329c9167edb0dac39b7dd42a746..13a042d8e6ed7f18c76387b666d681df0eabd0b5 100644
--- a/paddle/fluid/framework/details/reference_count_pass.cc
+++ b/paddle/fluid/framework/details/reference_count_pass.cc
@@ -14,187 +14,240 @@
 
 #include <queue>
 #include <string>
+#include <type_traits>
 #include <vector>
 
 #include "paddle/fluid/framework/details/computation_op_handle.h"
+#include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
+#include "paddle/fluid/framework/details/op_graph_view.h"
 #include "paddle/fluid/framework/details/reference_count_pass.h"
+#include "paddle/fluid/framework/details/reference_count_pass_helper.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"
 
 namespace paddle {
 namespace framework {
 namespace details {
 
-static ComputationOpHandle *FindNextComputationOpHandle(VarHandle *var_in) {
-  std::queue<VarHandleBase *> queue;
-  queue.push(var_in);
-  do {
-    auto *var = queue.front();
-    queue.pop();
-    for (auto *op : var->PendingOps()) {
-      auto *compute_op = dynamic_cast<ComputationOpHandle *>(op);
-      if (compute_op != nullptr && compute_op->GetPlace() == var_in->place_) {
-        return compute_op;
+// A functor to shrink/remove operators who depend on other operators in a set
+class ShrinkDepsOpFunctor {
+ private:
+  enum RelationShip { kSame = 0, kNoDeps = 1, kBefore = 2, kAfter = 3 };
+
+ public:
+  explicit ShrinkDepsOpFunctor(const std::vector<OpHandleBase *> &all_ops)
+      : graph_(all_ops) {}
+
+  template <typename OpSet>
+  OpSet operator()(const OpSet &op_set) const {
+    using KeyType = typename OpSet::key_type;
+    static_assert(
+        std::is_base_of<OpHandleBase,
+                        typename std::remove_pointer<KeyType>::type>::value,
+        "Key type of OpSet must be OpHandleBase, or derived of OpHandleBase");
+
+    if (op_set.size() <= 1) return op_set;
+    std::vector<OpHandleBase *> ops(op_set.begin(), op_set.end());
+    OpSet ret;
+    auto rels = GetRelations(ops);
+    auto not_before = [](RelationShip r) { return r != kBefore; };
+    for (size_t i = 0; i < rels.size(); ++i) {
+      if (std::all_of(rels[i].begin(), rels[i].end(), not_before)) {
+        ret.emplace(static_cast<KeyType>(ops[i]));
       }
-      for (auto *out_var : op->Outputs()) {
-        queue.push(out_var);
+    }
+    return ret;
+  }
+
+ private:
+  std::vector<std::vector<RelationShip>> GetRelations(
+      const std::vector<OpHandleBase *> &ops) const {
+    std::unordered_map<OpHandleBase *, size_t> op_to_idx;
+    for (size_t i = 0; i < ops.size(); ++i) {
+      PADDLE_ENFORCE(graph_.HasOp(ops[i]), "Op does not exist in graph");
+      op_to_idx[ops[i]] = i;
+    }
+
+    PADDLE_ENFORCE(op_to_idx.size() == ops.size(), "Duplicate ops");
+
+    std::vector<std::vector<RelationShip>> ret(ops.size());
+    for (auto &e : ret) {
+      e.assign(ops.size(), kSame);
+    }
+
+    size_t found_num = ops.size();
+    size_t total_num = ops.size() * ops.size();
+    auto visitor = [&](OpHandleBase *op, size_t i) {
+      auto it = op_to_idx.find(op);
+      if (it != op_to_idx.end()) {
+        size_t j = it->second;
+        if (i != j && ret[i][j] == kSame) {
+          ret[i][j] = kBefore;
+          ret[j][i] = kAfter;
+          found_num += 2;
+          if (found_num == total_num) {
+            return false;
+          }
+        }
+      }
+      return true;
+    };
+
+    for (size_t i = 0; i < ops.size(); ++i) {
+      auto sub_visitor = [&, i](OpHandleBase *op) { return visitor(op, i); };
+      if (!graph_.VisitAllPendingOps(ops[i], sub_visitor)) {
+        break;
+      }
+    }
+
+    for (size_t i = 0; i < ops.size(); ++i) {
+      for (size_t j = i + 1; j < ops.size(); ++j) {
+        if (ret[i][j] != kSame) continue;
+        ret[i][j] = kNoDeps;
+        ret[j][i] = kNoDeps;
+      }
+    }
+
+    return ret;
+  }
+
+  const OpGraphView graph_;
+};
+
+/**
+ * Find the nearest downstream computation op handle. If the op is a
+ * computation op, just return itself.
+ */
+static ComputationOpHandle *FindNextComputationOpHandleOrReturnItself(
+    OpHandleBase *op, size_t scope_idx) {
+  std::queue<OpHandleBase *> q;
+  std::unordered_set<OpHandleBase *> visited;
+  q.push(op);
+  do {
+    auto *op = q.front();
+    q.pop();
+    auto *compute_op = dynamic_cast<ComputationOpHandle *>(op);
+    if (compute_op != nullptr && compute_op->GetScopeIdx() == scope_idx) {
+      return compute_op;
+    }
+    for (auto *out_var : op->Outputs()) {
+      for (auto *pending_op : out_var->PendingOps()) {
+        if (visited.count(pending_op)) continue;
+        visited.insert(pending_op);
       }
     }
-  } while (!queue.empty());
+  } while (!q.empty());
   return nullptr;
 }
 
-static void AddDependencyBetween(OpHandleBase *in, OpHandleBase *out,
-                                 ir::Graph *graph) {
-  auto it = std::find_if(
-      in->Outputs().begin(), in->Outputs().end(), [](VarHandleBase *var) {
-        return dynamic_cast<DummyVarHandle *>(var) != nullptr;
-      });
-
-  if (it != in->Outputs().end()) {
-    out->AddInput(*it);
-  } else {
-    auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar());
-    graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
-    in->AddOutput(dep_var);
-    out->AddInput(dep_var);
+static std::unordered_set<ComputationOpHandle *>
+ExtractComputationOpFromLastLivedVar(VarHandle *var, size_t scope_idx,
+                                     const ShrinkDepsOpFunctor &shrink_func,
+                                     bool *ok) {
+  // stage one. Get last op for variable.
+  std::unordered_set<OpHandleBase *> candidates;
+  {
+    if (var->PendingOps().empty() && var->GeneratedOp()) {
+      // No operator depends on this variable. So the last operator is the op
+      // who generates this variable.
+      candidates.emplace(var->GeneratedOp());
+    } else {
+      candidates = var->PendingOps();
+    }
+
+    // No pending ops or generated op is nullptr
+    if (candidates.empty()) {
+      *ok = false;
+      return {};
+    }
+  }
+
+  // stage two. Try to cast them to computation op.
+  // return (*ok=false) when failed.
+  //
+  // The reason why we cannot make any types of op handle to be the last lived
+  // op is:
+  //    some op handle may operate on many DeviceContext, however, our garbage
+  //    collector can only wait one DeviceContext for now. So currently, we wait
+  //    the nearest compute op.
+  std::unordered_set<ComputationOpHandle *> computation_op;
+  {
+    for (auto *op : candidates) {
+      auto *compute_op =
+          FindNextComputationOpHandleOrReturnItself(op, scope_idx);
+      if (compute_op == nullptr) {
+        *ok = false;
+        return {};
+      }
+      computation_op.emplace(compute_op);
+    }
   }
+
+  // stage three. Try to shrink computation op if they depend on each other.
+  // Get the smallest set of the most ops.
+  *ok = true;
+  return shrink_func(computation_op);
+}
+
+static VarDesc *TryGetLatestVarDesc(const std::vector<VarHandle *> &vars) {
+  VarDesc *var_desc = nullptr;
+  std::find_if(vars.rbegin(), vars.rend(), [&](VarHandle *var_handle) -> bool {
+    var_desc = var_handle->Node()->Var();
+    return var_desc != nullptr;
+  });
+  return var_desc;
 }
 
 std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
     std::unique_ptr<ir::Graph> graph) const {
-  auto &ref_cnts = Get<DeviceReferenceCountMap>(kGlobalReferenceCount);
-  auto &cur_ref_cnts = Get<AtomicDeviceReferenceCountMap>(kCurReferenceCount);
-  auto &gcs = Get<DeviceGarbageCollectorMap>(kGarbageCollector);
-
-  // It is not easy to find the right reference counts of varaibles in graph
-  // Step 1: Find all variables in computation ops
-  // Step 2: Find all variables in non-computation ops which refers to variables
-  // in computation ops
-  std::unordered_set<std::string> names;
-  std::unordered_map<OpHandleBase *, ReferenceCountOpHandle *>
-      compute_ref_cnt_map;
-
-  auto get_ref_cnts_from_compute_op = [&](
-      OpHandleBase *op, const std::vector<VarHandleBase *> &vars) {
-    std::vector<std::string> var_names_in_op;
-    auto *compute_op = dynamic_cast<ComputationOpHandle *>(op);
-    if (compute_op == nullptr ||
-        !platform::is_gpu_place(compute_op->GetPlace()))
-      return var_names_in_op;
-    auto place = boost::get<platform::CUDAPlace>(compute_op->GetPlace());
-    for (VarHandleBase *var_handle_base : vars) {
-      auto *var_handle = dynamic_cast<VarHandle *>(var_handle_base);
-      if (var_handle == nullptr || !var_handle->Node()->IsVar()) continue;
-
-      if (!platform::is_gpu_place(var_handle->place_) ||
-          boost::get<platform::CUDAPlace>(var_handle->place_) != place)
-        continue;
+  auto &ref_cnts = Get<std::vector<ReferenceCountMap>>(kGlobalReferenceCount);
+  auto &last_live_ops_of_vars =
+      Get<std::vector<LastLiveOpsOfVars>>(kLastLiveOpsOfVars);
+
+  PADDLE_ENFORCE(last_live_ops_of_vars.empty() && ref_cnts.empty(),
+                 "Last Live Ops and Reference Counts of vars should be "
+                 "initialized at here.");
 
-      VarDesc *var_desc = var_handle->Node()->Var();
-      auto var_name = var_handle->Node()->Name();
+  const auto &vars = graph->Get<GraphVars>(kGraphVars);
 
-      // This is weird but there is really some variables without var_desc
-      // in computation_op
-      if (var_desc == nullptr) {
-        var_desc = compute_op->Node()->Op()->Block()->FindVar(var_name);
-        if (var_desc == nullptr) continue;
+  last_live_ops_of_vars.resize(vars.size());
+  ref_cnts.resize(vars.size());
+
+  ShrinkDepsOpFunctor shrink_func(
+      ir::FilterByNodeWrapper<OpHandleBase>(*graph));
+
+  for (size_t i = 0; i < vars.size(); ++i) {
+    for (auto &name_var_pair : vars[i]) {
+      // Whether this variable can be reused or deleted? If not, we do not
+      // compute reference counts and dependencies.
+      VarDesc *var_desc = TryGetLatestVarDesc(name_var_pair.second);
+
+      if (var_desc == nullptr || var_desc->Persistable()) {
+        continue;
       }
 
-      if (var_desc->Persistable()) continue;
       auto var_type = var_desc->Proto()->type().type();
       if (var_type != proto::VarType::LOD_TENSOR &&
-          var_type != proto::VarType::SELECTED_ROWS) {
+          var_type != proto::VarType::SELECTED_ROWS &&
+          var_type != proto::VarType::LOD_TENSOR_ARRAY) {
+        // Var type cannot be deleted
         continue;
       }
 
-      // compute op only runs in one device
-      if (ref_cnts[place.device]->count(var_name))
-        ++(*ref_cnts[place.device])[var_name];
-      else
-        (*ref_cnts[place.device])[var_name] = 1;
+      bool ok;
+      auto result = ExtractComputationOpFromLastLivedVar(
+          name_var_pair.second.back(), i, shrink_func, &ok);
 
-      names.insert(var_name);
-      var_names_in_op.push_back(var_name);
-    }
-    return var_names_in_op;
-  };
-
-  auto update_ref_cnts_from_non_compute_op = [&](
-      OpHandleBase *op, const std::vector<VarHandleBase *> &vars) {
-    if (dynamic_cast<ComputationOpHandle *>(op) != nullptr) return;
-    for (VarHandleBase *var_handle_base : vars) {
-      auto *var_handle = dynamic_cast<VarHandle *>(var_handle_base);
-      if (var_handle == nullptr || !var_handle->Node()->IsVar()) continue;
-
-      auto var_name = var_handle->Node()->Name();
-      auto var_place = var_handle->place_;
-      if (!platform::is_gpu_place(var_place)) continue;
-      auto place = boost::get<platform::CUDAPlace>(var_place);
-      if (names.count(var_name) == 0) continue;
-      if (ref_cnts.count(place.device) &&
-          ref_cnts[place.device]->count(var_name)) {
-        ++(*ref_cnts[place.device])[var_name];
-
-        auto *next_compute_op = FindNextComputationOpHandle(var_handle);
-        if (next_compute_op != nullptr) {
-          if (compute_ref_cnt_map.count(next_compute_op)) {
-            compute_ref_cnt_map[next_compute_op]->AddVar(var_name);
-            VLOG(5) << "Add reference count of " << var_name << " to Operator "
-                    << next_compute_op->Name();
-          } else {
-            // Create new reference_count_op_handle
-            ir::Node *ref_cnt_node = graph->CreateEmptyNode(
-                "reference_count", ir::Node::Type::kOperation);
-            auto *ref_cnt_handle = new ReferenceCountOpHandle(
-                ref_cnt_node, next_compute_op->GetScope(), place, {var_name},
-                gcs[place.device].get(), cur_ref_cnts[place.device].get());
-            AddDependencyBetween(next_compute_op, ref_cnt_handle, graph.get());
-            compute_ref_cnt_map[next_compute_op] = ref_cnt_handle;
-          }
-        }
+      if (ok) {
+        auto &var_name = name_var_pair.first;
+        PADDLE_ENFORCE(!result.empty(), "Last living ops of %s cannot be empty",
+                       var_name);
+        ref_cnts[i].emplace(var_name, result.size());
+        last_live_ops_of_vars[i].emplace(var_name, std::move(result));
       }
     }
-  };
-
-  auto all_ops = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
-  for (auto &op : all_ops) {
-    auto in_var_names = get_ref_cnts_from_compute_op(op, op->Inputs());
-    auto out_var_names = get_ref_cnts_from_compute_op(op, op->Outputs());
-    if (in_var_names.empty() && out_var_names.empty()) continue;
-    in_var_names.insert(in_var_names.end(), out_var_names.begin(),
-                        out_var_names.end());
-    auto *compute_op = dynamic_cast<ComputationOpHandle *>(op);
-    auto place = boost::get<platform::CUDAPlace>(compute_op->GetPlace());
-    ir::Node *ref_cnt_node =
-        graph->CreateEmptyNode("reference_count", ir::Node::Type::kOperation);
-    auto *ref_cnt_handle = new ReferenceCountOpHandle(
-        ref_cnt_node, compute_op->GetScope(), place, in_var_names,
-        gcs[place.device].get(), cur_ref_cnts[place.device].get());
-    AddDependencyBetween(compute_op, ref_cnt_handle, graph.get());
-    compute_ref_cnt_map[compute_op] = ref_cnt_handle;
-  }
-
-  for (auto &op : all_ops) {
-    update_ref_cnts_from_non_compute_op(op, op->Inputs());
-    update_ref_cnts_from_non_compute_op(op, op->Outputs());
-  }
-
-  std::vector<OpHandleBase *> new_all_ops;
-  new_all_ops.reserve(compute_ref_cnt_map.size() + all_ops.size());
-  for (auto &op : all_ops) {
-    new_all_ops.emplace_back(std::move(op));
-    auto it = compute_ref_cnt_map.find(new_all_ops.back());
-    if (it != compute_ref_cnt_map.end()) {
-      // Add LeafNode to ReferenceCountOpHandle
-      auto *dummy_leaf = new DummyVarHandle(graph->CreateControlDepVar());
-      graph->Get<GraphDepVars>(kGraphDepVars).emplace(dummy_leaf);
-      it->second->AddOutput(dummy_leaf);
-      new_all_ops.emplace_back(std::move(it->second));
-    }
   }
 
-  all_ops.swap(new_all_ops);
   return graph;
 }
 
@@ -205,5 +258,4 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
 REGISTER_PASS(reference_count_pass,
               paddle::framework::details::ReferenceCountPass)
     .RequirePassAttr(paddle::framework::details::kGlobalReferenceCount)
-    .RequirePassAttr(paddle::framework::details::kCurReferenceCount)
-    .RequirePassAttr(paddle::framework::details::kGarbageCollector);
+    .RequirePassAttr(paddle::framework::details::kLastLiveOpsOfVars);
diff --git a/paddle/fluid/framework/details/reference_count_pass.h b/paddle/fluid/framework/details/reference_count_pass.h
index 7081280b0600b9c1985987d02d679c298ad4b8bd..bcbef027354ef5a5fcc7da28103a9565982c7631 100644
--- a/paddle/fluid/framework/details/reference_count_pass.h
+++ b/paddle/fluid/framework/details/reference_count_pass.h
@@ -14,7 +14,6 @@
 
 #pragma once
 
-#include "paddle/fluid/framework/details/reference_count_op_handle.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/pass.h"
 
@@ -22,10 +21,6 @@ namespace paddle {
 namespace framework {
 namespace details {
 
-constexpr char kGlobalReferenceCount[] = "reference_count";
-constexpr char kCurReferenceCount[] = "current_reference_count";
-constexpr char kGarbageCollector[] = "garbage_collector";
-
 class ReferenceCountPass : public ir::Pass {
  protected:
   std::unique_ptr<ir::Graph> ApplyImpl(
diff --git a/paddle/fluid/framework/details/reference_count_pass_helper.cc b/paddle/fluid/framework/details/reference_count_pass_helper.cc
new file mode 100644
index 0000000000000000000000000000000000000000..89bd08c2d041d795205b29bb29aba311d1dbd932
--- /dev/null
+++ b/paddle/fluid/framework/details/reference_count_pass_helper.cc
@@ -0,0 +1,21 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/reference_count_pass_helper.h"
+
+namespace paddle {
+namespace framework {
+namespace details {}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/reference_count_pass_helper.h b/paddle/fluid/framework/details/reference_count_pass_helper.h
new file mode 100644
index 0000000000000000000000000000000000000000..1c083dbf001b08e40a54cc89b21c3dea1f18f16a
--- /dev/null
+++ b/paddle/fluid/framework/details/reference_count_pass_helper.h
@@ -0,0 +1,51 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <atomic>
+#include <map>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "paddle/fluid/framework/garbage_collector.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+class ComputationOpHandle;
+
+using ReferenceCountMap = std::unordered_map<std::string, size_t>;
+
+using AtomicReferenceCountMap =
+    std::unordered_map<std::string, std::atomic<size_t>>;
+
+using GarbageCollectorMap =
+    std::map<platform::Place, std::unique_ptr<GarbageCollector>>;
+
+const char kGlobalReferenceCount[] = "global_reference_count";
+const char kRuntimeReferenceCount[] = "runtime_reference_count";
+const char kGarbageCollector[] = "garbage_collector";
+const char kAllPlaces[] = "all_places";
+
+using LastLiveOpsOfVars =
+    std::unordered_map<std::string, std::unordered_set<ComputationOpHandle*>>;
+const char kLastLiveOpsOfVars[] = "last_live_ops_of_var";
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
index e5b1eaa7318aecde1dbf89de8fe242a3008db97c..57f6fc66c57e2a53d9cf30d7761626a50bc379ea 100644
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
@@ -16,11 +16,8 @@
 #include <stdexcept>
 #include <string>
 #include <vector>
-#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/variable_helper.h"
 #include "paddle/fluid/platform/profiler.h"
-#ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/framework/details/reference_count_op_handle.h"
-#endif
 
 namespace paddle {
 namespace framework {
@@ -69,27 +66,12 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
   platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr);
   drop_scope_counter_ += 1;
 
-#ifdef PADDLE_WITH_CUDA
-  const std::string gc_name = "garbage_collector";
-  DeviceGarbageCollectorMap *gc =
-      Graph().Has(gc_name) ? &(Graph().Get<DeviceGarbageCollectorMap>(gc_name))
-                           : nullptr;
-#endif
-
   if (!fetch_tensors.empty() ||
       drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
     drop_scope_counter_ = 0;
     // Wait All computational streams
     for (auto p : places_) {
       platform::DeviceContextPool::Instance().Get(p)->Wait();
-#ifdef PADDLE_WITH_CUDA
-      if (gc != nullptr && platform::is_gpu_place(p)) {
-        auto gpu_place = boost::get<platform::CUDAPlace>(p);
-        auto &gc_at_place = gc->at(gpu_place.device);
-        gc_at_place->Wait();
-        gc_at_place->Reset();
-      }
-#endif
     }
     for (auto &scope : local_scopes_) {
       auto &local_scope =
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index 96132a2c18233ca10d7bad4e26dfabadd39d84db..0c4bd336c5b36eff7d93e4099c96cd3c5990ac17 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -13,18 +13,23 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/executor.h"
+#include <deque>
 
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/framework/lod_rank_table.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
-#include "paddle/fluid/framework/ngraph_operator.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/reader.h"
 #include "paddle/fluid/framework/transfer_scope_cache.h"
+#include "paddle/fluid/framework/variable_helper.h"
 #include "paddle/fluid/operators/detail/macros.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"
 
+#ifdef PADDLE_WITH_NGRAPH
+#include "paddle/fluid/framework/ngraph_operator.h"
+#endif
+
 DECLARE_bool(benchmark);
 DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run");
 DEFINE_bool(use_ngraph, false, "Use NGRAPH to run");
@@ -37,11 +42,43 @@ namespace {
 int kProgramId = -1;
 }  // namespace
 
+static std::unordered_map<std::string, size_t> GetNonPersistableReferenceCounts(
+    const BlockDesc& block, const std::vector<std::string>& skip_var_list) {
+  std::unordered_map<std::string, size_t> ref_cnts;
+  std::unordered_set<std::string> skip_vars(skip_var_list.begin(),
+                                            skip_var_list.end());
+
+  auto update_ref_cnts = [&](OpDesc* op_desc, const VariableNameMap& name_map) {
+    for (auto& name_pair : name_map) {
+      for (auto& name : name_pair.second) {
+        if (skip_vars.count(name)) continue;
+        auto* var_desc = block.FindVar(name);
+        if (var_desc == nullptr || var_desc->Persistable()) continue;
+        auto type = var_desc->Proto()->type().type();
+        if (type != proto::VarType::LOD_TENSOR &&
+            type != proto::VarType::SELECTED_ROWS &&
+            type != proto::VarType::LOD_TENSOR_ARRAY) {
+          continue;
+        }
+        ++ref_cnts[name];
+      }
+    }
+  };
+
+  for (auto op_desc : block.AllOps()) {
+    update_ref_cnts(op_desc, op_desc->Inputs());
+    update_ref_cnts(op_desc, op_desc->Outputs());
+  }
+  return ref_cnts;
+}
+
 ExecutorPrepareContext::ExecutorPrepareContext(
-    const framework::ProgramDesc& prog, size_t block_id)
+    const framework::ProgramDesc& prog, size_t block_id,
+    const std::vector<std::string>& skip_ref_cnt_vars)
     : prog_(prog), block_id_(block_id) {
   if (GetEagerDeletionThreshold() >= 0) {
-    ref_cnts_ = GetNonPersistableReferenceCount<int>(prog_, block_id_);
+    global_ref_cnts_ = GetNonPersistableReferenceCounts(prog.Block(block_id),
+                                                        skip_ref_cnt_vars);
   }
 }
 
@@ -49,28 +86,40 @@ ExecutorPrepareContext::~ExecutorPrepareContext() {
   VLOG(5) << "destroy ExecutorPrepareContext";
 }
 
-template <typename RefCntMap>
-static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op,
-                                GarbageCollector<Tensor>* gc,
-                                RefCntMap* ref_cnts) {
-  std::unordered_set<Tensor*> erase_tensors;
+static void DeleteUnusedTensors(
+    const Scope& scope, const OperatorBase* op, GarbageCollector* gc,
+    std::unordered_map<std::string, size_t>* ref_cnts) {
+  std::deque<std::shared_ptr<memory::Allocation>> garbages;
 
   auto handler = [&](const VariableNameMap& name_map) {
     for (auto& name_pair : name_map) {
       for (auto& name : name_pair.second) {
         auto it = ref_cnts->find(name);
         if (it == ref_cnts->end()) continue;
-        if ((it->second)-- == 1) {
-          auto* var = scope.FindVar(name);
-          if (var != nullptr) {
-            VLOG(10) << "Erase tensor \'" << name << "\'";
-            if (var->IsType<LoDTensor>()) {
-              erase_tensors.insert(var->GetMutable<LoDTensor>());
-            } else if (var->IsType<SelectedRows>()) {
-              erase_tensors.insert(
-                  var->GetMutable<SelectedRows>()->mutable_value());
-            }
+        if (--(it->second) != 0) {
+          continue;
+        }
+        auto* var = scope.FindVar(name);
+        if (var != nullptr) {
+          continue;
+        }
+
+        VLOG(2) << "Erase variable " << name;
+        if (var->IsType<LoDTensor>()) {
+          garbages.emplace_back(
+              var->GetMutable<LoDTensor>()->MoveMemoryHolder());
+        } else if (var->IsType<SelectedRows>()) {
+          garbages.emplace_back(var->GetMutable<SelectedRows>()
+                                    ->mutable_value()
+                                    ->MoveMemoryHolder());
+        } else if (var->IsType<LoDTensorArray>()) {
+          auto* lod_tensor_arr = var->GetMutable<LoDTensorArray>();
+          for (auto& t : *lod_tensor_arr) {
+            garbages.emplace_back(t.MoveMemoryHolder());
           }
+        } else {
+          PADDLE_THROW("Type %s of %s is not supported eager deletion",
+                       var->Type().name(), name);
         }
       }
     }
@@ -79,19 +128,19 @@ static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op,
   handler(op->Inputs());
   handler(op->Outputs());
 
-  if (!erase_tensors.empty()) {
-    gc->Add(erase_tensors);
+  if (!garbages.empty()) {
+    gc->Add(std::move(garbages));
   }
 }
 
 static void EnableFusedOp(ExecutorPrepareContext* ctx) {
 #ifdef PADDLE_WITH_NGRAPH
   VLOG(3) << "use_ngraph=True";
-  auto intervals = FusedOperator::FusedOpIntervals(&ctx->ops_);
+  auto intervals = NgraphOperator::NgraphOpIntervals(&ctx->ops_);
   for (auto& interval : intervals) {
-    auto* fused_op = new FusedOperator(ctx->prog_, ctx->block_id_,
-                                       interval.at(0), interval.at(1));
-    *interval[0] = std::unique_ptr<OperatorBase>(fused_op);
+    auto* ng_op = new NgraphOperator(ctx->prog_, ctx->block_id_, interval.at(0),
+                                     interval.at(1));
+    *interval[0] = std::unique_ptr<OperatorBase>(ng_op);
   }
   for (auto it = intervals.rbegin(); it != intervals.rend(); ++it) {
     ctx->ops_.erase(it->at(0) + 1, it->at(1));
@@ -114,36 +163,6 @@ void Executor::Close() {
 #endif
 }
 
-void InitializeVariable(Variable* var, proto::VarType::Type var_type) {
-  if (var_type == proto::VarType::LOD_TENSOR) {
-    var->GetMutable<LoDTensor>();
-  } else if (var_type == proto::VarType::SELECTED_ROWS) {
-    var->GetMutable<SelectedRows>();
-  } else if (var_type == proto::VarType::FEED_MINIBATCH) {
-    var->GetMutable<FeedFetchList>();
-  } else if (var_type == proto::VarType::FETCH_LIST) {
-    var->GetMutable<FeedFetchList>();
-  } else if (var_type == proto::VarType::STEP_SCOPES) {
-    var->GetMutable<std::vector<framework::Scope*>>();
-  } else if (var_type == proto::VarType::LOD_RANK_TABLE) {
-    var->GetMutable<LoDRankTable>();
-  } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) {
-    var->GetMutable<LoDTensorArray>();
-  } else if (var_type == proto::VarType::PLACE_LIST) {
-    var->GetMutable<platform::PlaceList>();
-  } else if (var_type == proto::VarType::READER) {
-    var->GetMutable<ReaderHolder>();
-  } else if (var_type == proto::VarType::RAW) {
-    // GetMutable will be called in operator
-  } else {
-    PADDLE_THROW(
-        "Variable type %d is not in "
-        "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, "
-        "LOD_RANK_TABLE, PLACE_LIST, READER, RAW]",
-        var_type);
-  }
-}
-
 void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope,
                                int block_id) {
   auto& global_block = pdesc.Block(block_id);
@@ -351,9 +370,10 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
 }
 
 std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
-    const ProgramDesc& program, int block_id) {
+    const ProgramDesc& program, int block_id,
+    const std::vector<std::string>& skip_ref_cnt_vars) {
   std::unique_ptr<ExecutorPrepareContext> ctx(
-      new ExecutorPrepareContext(program, block_id));
+      new ExecutorPrepareContext(program, block_id, skip_ref_cnt_vars));
   PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), program.Size());
   auto& block = program.Block(block_id);
   for (auto& op_desc : block.AllOps()) {
@@ -364,16 +384,28 @@ std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
 }
 
 std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
-    const ProgramDesc& program, const std::vector<int>& block_ids) {
+    const ProgramDesc& program, const std::vector<int>& block_ids,
+    const std::vector<std::vector<std::string>>& skip_ref_cnt_vars) {
+  PADDLE_ENFORCE(
+      skip_ref_cnt_vars.empty() || skip_ref_cnt_vars.size() == block_ids.size(),
+      "skip_ref_cnt_vars should be either empty or equals to block number %d",
+      block_ids.size());
   std::vector<std::shared_ptr<ExecutorPrepareContext>> result;
+  size_t idx = 0;
   for (auto& bid : block_ids) {
-    auto* ctx = new ExecutorPrepareContext(program, bid);
+    ExecutorPrepareContext* ctx;
+    if (skip_ref_cnt_vars.empty()) {
+      ctx = new ExecutorPrepareContext(program, bid);
+    } else {
+      ctx = new ExecutorPrepareContext(program, bid, skip_ref_cnt_vars[idx]);
+    }
     PADDLE_ENFORCE_LT(static_cast<size_t>(bid), program.Size());
     auto& block = program.Block(bid);
     for (auto& op_desc : block.AllOps()) {
       ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
     }
     result.push_back(std::shared_ptr<ExecutorPrepareContext>(ctx));
+    ++idx;
   }
   return result;
 }
@@ -391,22 +423,23 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
   }
 
   int64_t max_memory_size = GetEagerDeletionThreshold();
-  std::unique_ptr<GarbageCollector<Tensor>> gc;
-  // WhileOp would set keep_kids to true,
-  // because WhileGradOp needs the scopes created in WhileOp.
-  // Perhaps, we should not perform eager deletion in WhileOp
-  // The scopes and variables created by WhileOp would be deleted
-  // in WhileGradOp.
+  std::unique_ptr<GarbageCollector> gc;
+  // skip while_op and while_grad_op temporarily
   if (max_memory_size >= 0 && !keep_kids) {
     ctx->ResetReferenceCount();
 #ifdef PADDLE_WITH_CUDA
     if (platform::is_gpu_place(place_)) {
-      gc.reset(new DefaultStreamGarbageCollector<Tensor>(
-          boost::get<platform::CUDAPlace>(place_), max_memory_size));
-    } else {
+      if (IsFastEagerDeletionModeEnabled()) {
+        gc.reset(new UnsafeFastGPUGarbageCollector(
+            boost::get<platform::CUDAPlace>(place_), max_memory_size));
+      } else {
+        gc.reset(new DefaultStreamGarbageCollector(
+            boost::get<platform::CUDAPlace>(place_), max_memory_size));
+      }
+    } else if (platform::is_cpu_place(place_)) {
 #endif
-      gc.reset(new CPUGarbageCollector<Tensor>(
-          boost::get<platform::CPUPlace>(place_), max_memory_size));
+      gc.reset(new CPUGarbageCollector(boost::get<platform::CPUPlace>(place_),
+                                       max_memory_size));
 #ifdef PADDLE_WITH_CUDA
     }
 #endif
@@ -415,17 +448,13 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
   for (auto& op : ctx->ops_) {
     op->Run(*local_scope, place_);
 
-    if (gc != nullptr) {
+    if (gc) {
       DeleteUnusedTensors(*local_scope, op.get(), gc.get(),
-                          &(ctx->cur_ref_cnts_));
+                          &(ctx->runtime_ref_cnts_));
     }
   }
 
-  if (gc != nullptr) {
-    gc->Wait();
-  } else {
-    platform::DeviceContextPool::Instance().Get(place_)->Wait();
-  }
+  platform::DeviceContextPool::Instance().Get(place_)->Wait();
 
   if (local_scope != scope) {
     scope->DeleteScope(local_scope);
diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h
index 36b36d49c2728dbef93042158dffa26d8f56d529..5a040ac641588ad4d89d1f6e4c0d6c296eff38eb 100644
--- a/paddle/fluid/framework/executor.h
+++ b/paddle/fluid/framework/executor.h
@@ -26,54 +26,22 @@ limitations under the License. */
 
 namespace paddle {
 namespace framework {
-extern void InitializeVariable(Variable* var, proto::VarType::Type var_type);
-
-template <typename T>
-std::unordered_map<std::string, T> GetNonPersistableReferenceCount(
-    const ProgramDesc& prog, size_t block_id) {
-  auto& block = prog.Block(block_id);
-  std::unordered_map<std::string, T> ref_cnts;
-
-  auto update_ref_cnts = [&](OpDesc* op_desc, const VariableNameMap& name_map) {
-    for (auto& name_pair : name_map) {
-      for (auto& name : name_pair.second) {
-        auto* var_desc = block.FindVar(name);
-        if (var_desc == nullptr || var_desc->Persistable()) continue;
-        auto type = var_desc->Proto()->type().type();
-        if (type != proto::VarType::LOD_TENSOR &&
-            type != proto::VarType::SELECTED_ROWS) {
-          continue;
-        }
-
-        auto it = ref_cnts.find(name);
-        if (it != ref_cnts.end()) {
-          ++it->second;
-        } else {
-          ref_cnts[name] = 1;
-        }
-      }
-    }
-  };
-
-  for (auto op_desc : block.AllOps()) {
-    update_ref_cnts(op_desc, op_desc->Inputs());
-    update_ref_cnts(op_desc, op_desc->Outputs());
-  }
-  return ref_cnts;
-}
 
 struct ExecutorPrepareContext {
-  ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id);
+  ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id,
+                         const std::vector<std::string>& skip_ref_cnt_vars =
+                             std::vector<std::string>());
+
   ~ExecutorPrepareContext();
 
-  void ResetReferenceCount() { cur_ref_cnts_ = ref_cnts_; }
+  void ResetReferenceCount() { runtime_ref_cnts_ = global_ref_cnts_; }
 
   const framework::ProgramDesc& prog_;
   size_t block_id_;
   std::vector<std::unique_ptr<OperatorBase>> ops_;
 
-  std::unordered_map<std::string, int> ref_cnts_;
-  std::unordered_map<std::string, int> cur_ref_cnts_;
+  std::unordered_map<std::string, size_t> global_ref_cnts_;
+  std::unordered_map<std::string, size_t> runtime_ref_cnts_;
 };
 
 class Executor {
@@ -109,10 +77,14 @@ class Executor {
            const std::string& fetch_holder_name = "fetch");
 
   static std::unique_ptr<ExecutorPrepareContext> Prepare(
-      const ProgramDesc& program, int block_id);
+      const ProgramDesc& program, int block_id,
+      const std::vector<std::string>& skip_ref_cnt_vars =
+          std::vector<std::string>());
 
   static std::vector<std::shared_ptr<ExecutorPrepareContext>> Prepare(
-      const ProgramDesc& program, const std::vector<int>& block_ids);
+      const ProgramDesc& program, const std::vector<int>& block_ids,
+      const std::vector<std::vector<std::string>>& skip_ref_cnt_vars =
+          std::vector<std::vector<std::string>>());
 
   void CreateVariables(const ProgramDesc& pdesc, Scope* scope, int block_id);
 
diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5fc5aeb662ae4af37321db88661cfc8a4dabe4d3
--- /dev/null
+++ b/paddle/fluid/framework/executor_thread_worker.cc
@@ -0,0 +1,226 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/executor_thread_worker.h"
+#include "google/protobuf/io/zero_copy_stream_impl.h"
+#include "google/protobuf/message.h"
+#include "google/protobuf/text_format.h"
+
+#include "gflags/gflags.h"
+#include "paddle/fluid/framework/feed_fetch_method.h"
+#include "paddle/fluid/framework/feed_fetch_type.h"
+#include "paddle/fluid/framework/lod_rank_table.h"
+#include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/reader.h"
+#include "paddle/fluid/framework/variable_helper.h"
+#include "paddle/fluid/inference/io.h"
+#include "paddle/fluid/platform/cpu_helper.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/pybind/pybind.h"
+namespace paddle {
+namespace framework {
+
+void ExecutorThreadWorker::CreateThreadOperators(const ProgramDesc& program) {
+  auto& block = program.Block(0);
+  op_names_.clear();
+  for (auto& op_desc : block.AllOps()) {
+    std::unique_ptr<OperatorBase> local_op = OpRegistry::CreateOp(*op_desc);
+    op_names_.push_back(op_desc->Type());
+    OperatorBase* local_op_ptr = local_op.release();
+    ops_.push_back(local_op_ptr);
+    continue;
+  }
+}
+
+void ExecutorThreadWorker::CreateThreadResource(
+    const framework::ProgramDesc& program,
+    const paddle::platform::Place& place) {
+  CreateThreadScope(program);
+  CreateThreadOperators(program);
+  SetMainProgram(program);
+  SetPlace(place);
+}
+
+void ExecutorThreadWorker::CreateThreadScope(const ProgramDesc& program) {
+  auto& block = program.Block(0);
+
+  PADDLE_ENFORCE_NOT_NULL(
+      root_scope_, "root_scope should be set before creating thread scope");
+
+  thread_scope_ = &root_scope_->NewScope();
+  for (auto& var : block.AllVars()) {
+    if (var->Persistable()) {
+      auto* ptr = root_scope_->Var(var->Name());
+      InitializeVariable(ptr, var->GetType());
+    } else {
+      auto* ptr = thread_scope_->Var(var->Name());
+      InitializeVariable(ptr, var->GetType());
+    }
+  }
+}
+
+void ExecutorThreadWorker::SetDataFeed(
+    const std::shared_ptr<DataFeed>& datafeed) {
+  thread_reader_ = datafeed;
+}
+
+void ExecutorThreadWorker::BindingDataFeedMemory() {
+  const std::vector<std::string>& input_feed =
+      thread_reader_->GetUseSlotAlias();
+  for (auto name : input_feed) {
+    thread_reader_->AddFeedVar(thread_scope_->Var(name), name);
+  }
+}
+
+void ExecutorThreadWorker::SetFetchVarNames(
+    const std::vector<std::string>& fetch_var_names) {
+  fetch_var_names_.clear();
+  fetch_var_names_.insert(fetch_var_names_.end(), fetch_var_names.begin(),
+                          fetch_var_names.end());
+}
+
+void ExecutorThreadWorker::SetDevice() {
+#if defined _WIN32 || defined __APPLE__
+  return;
+#else
+  static unsigned concurrency_cap = std::thread::hardware_concurrency();
+  int thread_id = this->thread_id_;
+
+  if (static_cast<unsigned>(thread_id) < concurrency_cap) {
+    unsigned proc = thread_id;
+
+    cpu_set_t mask;
+    CPU_ZERO(&mask);
+    CPU_SET(proc, &mask);
+
+    if (-1 == sched_setaffinity(0, sizeof(mask), &mask)) {
+      VLOG(1) << "WARNING: Failed to set thread affinity for thread "
+              << thread_id;
+    } else {
+      CPU_ZERO(&mask);
+      if ((0 != sched_getaffinity(0, sizeof(mask), &mask)) ||
+          (CPU_ISSET(proc, &mask) == 0)) {
+        VLOG(3) << "WARNING: Failed to set thread affinity for thread "
+                << thread_id;
+      }
+    }
+  } else {
+    VLOG(1) << "WARNING: Failed to set thread affinity for thread "
+            << thread_id;
+  }
+#endif
+}
+
+template <typename T>
+void print_lod_tensor(std::string var_name, const LoDTensor& lod_tensor) {
+  auto inspect = lod_tensor.data<T>();
+  auto element_num = lod_tensor.numel();
+
+  std::ostringstream sstream;
+  sstream << var_name << " (element num " << element_num << "): [";
+  sstream << inspect[0];
+  for (int j = 1; j < element_num; ++j) {
+    sstream << " " << inspect[j];
+  }
+  sstream << "]";
+
+  std::cout << sstream.str() << std::endl;
+}
+
+void print_fetch_var(Scope* scope, std::string var_name) {
+  const LoDTensor& tensor = scope->FindVar(var_name)->Get<LoDTensor>();
+
+  if (std::type_index(tensor.type()) ==
+      std::type_index(typeid(platform::float16))) {
+    print_lod_tensor<platform::float16>(var_name, tensor);
+  } else if (std::type_index(tensor.type()) == std::type_index(typeid(float))) {
+    print_lod_tensor<float>(var_name, tensor);
+  } else if (std::type_index(tensor.type()) ==
+             std::type_index(typeid(double))) {
+    print_lod_tensor<double>(var_name, tensor);
+  } else if (std::type_index(tensor.type()) == std::type_index(typeid(int))) {
+    print_lod_tensor<int>(var_name, tensor);
+  } else if (std::type_index(tensor.type()) ==
+             std::type_index(typeid(int64_t))) {
+    print_lod_tensor<int64_t>(var_name, tensor);
+  } else if (std::type_index(tensor.type()) == std::type_index(typeid(bool))) {
+    print_lod_tensor<bool>(var_name, tensor);
+  } else if (std::type_index(tensor.type()) ==
+             std::type_index(typeid(uint8_t))) {
+    print_lod_tensor<uint8_t>(var_name, tensor);
+  } else if (std::type_index(tensor.type()) ==
+             std::type_index(typeid(int16_t))) {
+    print_lod_tensor<int16_t>(var_name, tensor);
+  } else if (std::type_index(tensor.type()) ==
+             std::type_index(typeid(int8_t))) {
+    print_lod_tensor<int8_t>(var_name, tensor);
+  } else {
+    VLOG(1) << "print_fetch_var: unrecognized data type:"
+            << tensor.type().name();
+  }
+
+  return;
+}
+
+void ExecutorThreadWorker::TrainFiles() {
+  platform::SetNumThreads(1);
+
+  // todo: configurable
+  SetDevice();
+
+  int fetch_var_num = fetch_var_names_.size();
+  fetch_values_.clear();
+  fetch_values_.resize(fetch_var_num);
+
+  thread_reader_->Start();
+
+  int cur_batch;
+  int batch_cnt = 0;
+  while ((cur_batch = thread_reader_->Next()) > 0) {
+    // executor run here
+    for (auto& op : ops_) {
+      op->Run(*thread_scope_, place_);
+    }
+
+    ++batch_cnt;
+    thread_scope_->DropKids();
+
+    if (debug_ == false || thread_id_ != 0) {
+      continue;
+    }
+
+    for (int i = 0; i < fetch_var_num; ++i) {
+      print_fetch_var(thread_scope_, fetch_var_names_[i]);
+    }  // end for (int i = 0...)
+  }    // end while ()
+}
+
+void ExecutorThreadWorker::SetThreadId(int tid) { thread_id_ = tid; }
+
+void ExecutorThreadWorker::SetPlace(const platform::Place& place) {
+  place_ = place;
+}
+
+void ExecutorThreadWorker::SetMainProgram(
+    const ProgramDesc& main_program_desc) {
+  main_program_.reset(new ProgramDesc(main_program_desc));
+}
+
+void ExecutorThreadWorker::SetRootScope(Scope* g_scope) {
+  root_scope_ = g_scope;
+}
+
+}  // einit_modelnd namespace framework
+}  // end namespace paddle
diff --git a/paddle/fluid/framework/executor_thread_worker.h b/paddle/fluid/framework/executor_thread_worker.h
new file mode 100644
index 0000000000000000000000000000000000000000..13ec2442c46459116320236bf98f23c91340f389
--- /dev/null
+++ b/paddle/fluid/framework/executor_thread_worker.h
@@ -0,0 +1,88 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <mutex>  // NOLINT
+#include <set>
+#include <string>
+#include <thread>  // NOLINT
+#include <vector>
+#include "paddle/fluid/framework/data_feed.h"
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/scope.h"
+
+namespace paddle {
+namespace framework {
+void CreateTensor(Variable* var, proto::VarType::Type var_type);
+
+class ExecutorThreadWorker {
+ public:
+  ExecutorThreadWorker()
+      : thread_id_(-1), root_scope_(NULL), thread_scope_(NULL), debug_(false) {}
+  ~ExecutorThreadWorker() {}
+
+  void CreateThreadResource(const framework::ProgramDesc& program,
+                            const paddle::platform::Place& place);
+  void SetThreadId(int tid);
+  void SetDebug(const bool debug) { debug_ = debug; }
+  void SetRootScope(Scope* g_scope);
+  // set cpu device in this function
+  // cpu binding is used by default
+  void SetDevice();
+  // since we read data into memory that can not be accessed by program
+  // we need to bind memory of data with corresponding variables in program
+  // this function should be called after data feed is set
+  void BindingDataFeedMemory();
+  // set data feed declared in executor
+  void SetDataFeed(const std::shared_ptr<DataFeed>& datafeed);
+  // A multi-thread training function
+  void TrainFiles();
+  // set fetch variable names from python interface assigned by users
+  void SetFetchVarNames(const std::vector<std::string>& fetch_var_names);
+
+ private:
+  void CreateThreadScope(const framework::ProgramDesc& program);
+  void CreateThreadOperators(const framework::ProgramDesc& program);
+  void SetMainProgram(const ProgramDesc& main_program_desc);
+  void SetPlace(const paddle::platform::Place& place);
+
+ protected:
+  // thread index
+  std::shared_ptr<DataFeed> thread_reader_;  // shared queue, thread buffer
+  int thread_id_;
+  // operator name
+  std::vector<std::string> op_names_;
+  // thread level, local operators for forward and backward
+  std::vector<OperatorBase*> ops_;
+  // main program for training
+  std::unique_ptr<framework::ProgramDesc> main_program_;
+  // execution place
+  platform::Place place_;
+  // root scope for model parameters
+  Scope* root_scope_;
+  // a thread scope, father scope is global score which is shared
+  Scope* thread_scope_;
+
+ private:
+  std::vector<std::string> fetch_var_names_;
+  std::vector<std::vector<float>> fetch_values_;
+  bool debug_;
+};
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc
index 3e9353f5cf67d8de62c5551f12ea786e49190549..6338be75a4b1d3c4caf7a6f7add4d05fec690340 100644
--- a/paddle/fluid/framework/feed_fetch_method.cc
+++ b/paddle/fluid/framework/feed_fetch_method.cc
@@ -16,7 +16,9 @@ limitations under the License. */
 #include <string>
 #include <vector>
 #include "glog/logging.h"
+#include "paddle/fluid/framework/var_type.h"
 #include "paddle/fluid/framework/variable.h"
+#include "paddle/fluid/platform/place.h"
 
 namespace paddle {
 namespace framework {
@@ -53,5 +55,12 @@ LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
   return tensor;
 }
 
+LoDTensor& GetVariableTensor(const Scope& scope, const std::string& var_name) {
+  Variable* var = scope.FindVar(var_name);
+  PADDLE_ENFORCE(var, "%s no in scope", var_name);
+  PADDLE_ENFORCE(var->IsType<LoDTensor>(), "Only support lod tensor now.");
+  return *var->GetMutable<LoDTensor>();
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/feed_fetch_method.h b/paddle/fluid/framework/feed_fetch_method.h
index 7f504bfd232862c014cb59b6e8301eec74e0351f..031f8e01aa6128b803dcbfb990778e87d4fafc13 100644
--- a/paddle/fluid/framework/feed_fetch_method.h
+++ b/paddle/fluid/framework/feed_fetch_method.h
@@ -27,5 +27,7 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input,
 LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
                             size_t index);
 
+LoDTensor& GetVariableTensor(const Scope& scope, const std::string& var_name);
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc
new file mode 100644
index 0000000000000000000000000000000000000000..54d9d0dc018b08decb2ff8965659bab98e81f3ab
--- /dev/null
+++ b/paddle/fluid/framework/garbage_collector.cc
@@ -0,0 +1,89 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/cuda_device_guard.h"
+#endif
+#include "paddle/fluid/framework/garbage_collector.h"
+
+namespace paddle {
+namespace framework {
+
+GarbageCollector::GarbageCollector(const platform::Place &place,
+                                   size_t max_memory_size)
+    : max_memory_size_((std::max)(max_memory_size, static_cast<size_t>(1))) {
+  garbages_.reset(new GarbageQueue());
+  dev_ctx_ = platform::DeviceContextPool::Instance().Get(place);
+}
+
+CPUGarbageCollector::CPUGarbageCollector(const platform::CPUPlace &place,
+                                         size_t max_memory_size)
+    : GarbageCollector(place, max_memory_size) {}
+
+void CPUGarbageCollector::ClearCallback(const std::function<void()> &callback) {
+  callback();
+}
+
+#ifdef PADDLE_WITH_CUDA
+UnsafeFastGPUGarbageCollector::UnsafeFastGPUGarbageCollector(
+    const platform::CUDAPlace &place, size_t max_memory_size)
+    : GarbageCollector(place, max_memory_size) {}
+
+void UnsafeFastGPUGarbageCollector::ClearCallback(
+    const std::function<void()> &callback) {
+  callback();
+}
+
+DefaultStreamGarbageCollector::DefaultStreamGarbageCollector(
+    const platform::CUDAPlace &place, size_t max_memory_size)
+    : GarbageCollector(place, max_memory_size) {}
+
+void DefaultStreamGarbageCollector::Wait() const {
+  static_cast<platform::CUDADeviceContext *>(this->dev_ctx_)
+      ->WaitStreamCallback();
+}
+
+void DefaultStreamGarbageCollector::ClearCallback(
+    const std::function<void()> &callback) {
+  static_cast<platform::CUDADeviceContext *>(this->dev_ctx_)
+      ->AddStreamCallback(callback);
+}
+
+StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place,
+                                               size_t max_memory_size)
+    : GarbageCollector(place, max_memory_size) {
+  platform::CUDADeviceGuard guard(place.device);
+  PADDLE_ENFORCE(cudaStreamCreate(&stream_));
+  callback_manager_.reset(new platform::StreamCallbackManager(stream_));
+}
+
+StreamGarbageCollector::~StreamGarbageCollector() {
+  auto place = boost::get<platform::CUDAPlace>(this->dev_ctx_->GetPlace());
+  platform::CUDADeviceGuard guard(place.device);
+  PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
+  PADDLE_ENFORCE(cudaStreamDestroy(stream_));
+}
+
+cudaStream_t StreamGarbageCollector::stream() const { return stream_; }
+
+void StreamGarbageCollector::Wait() const { callback_manager_->Wait(); }
+
+void StreamGarbageCollector::ClearCallback(
+    const std::function<void()> &callback) {
+  callback_manager_->AddCallback(callback);
+}
+#endif
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h
index 818b3334ea4171fd7a9cbaa896ee1672e8ecca51..2768671029c06562aa0d2e5eea3d3ff61d900ab5 100644
--- a/paddle/fluid/framework/garbage_collector.h
+++ b/paddle/fluid/framework/garbage_collector.h
@@ -14,7 +14,6 @@
 
 #pragma once
 
-#include <algorithm>
 #include <deque>
 #include <functional>
 #include <memory>
@@ -24,134 +23,74 @@
 namespace paddle {
 namespace framework {
 
-// T should have memory_size() and clear() method
-template <typename T>
 class GarbageCollector {
  public:
-  GarbageCollector(const platform::Place &place, size_t max_memory_size)
-      : max_memory_size_((std::max)(max_memory_size, static_cast<size_t>(1))) {
-    garbages_.reset(new std::deque<T *>());
-    dev_ctx_ = platform::DeviceContextPool::Instance().Get(place);
-  }
+  using GarbageQueue = std::deque<std::shared_ptr<memory::Allocation>>;
 
-  virtual ~GarbageCollector() {}
+  GarbageCollector(const platform::Place &place, size_t max_memory_size);
 
-  void Reset() {
-    std::lock_guard<std::mutex> guard(mutex_);
-    garbages_.reset(new std::deque<T *>());
-    cur_memory_size_ = 0;
-  }
+  virtual ~GarbageCollector() = default;
+
+  virtual void Wait() const {}
 
   template <typename Container>
-  void Add(const Container &objs) {
-    Add(objs, []() {});
-  }
+  void Add(Container &&objs);
 
   template <typename Container, typename Callback>
-  void Add(const Container &objs, Callback &&callback) {
-    std::shared_ptr<std::deque<T *>> clear_deque;
-    {
-      std::lock_guard<std::mutex> guard(mutex_);
-      for (auto *obj : objs) {
-        garbages_->push_back(obj);
-        cur_memory_size_ += obj->memory_size();
-      }
-      if (cur_memory_size_ >= max_memory_size_) {
-        cur_memory_size_ = 0;
-        clear_deque = garbages_;
-        garbages_.reset(new std::deque<T *>());
-      }
-    }
-
-    if (clear_deque != nullptr) {
-      callback();
-      ClearCallback([=]() {
-        for (auto *obj : *clear_deque) obj->clear();
-      });
-    }
-  }
-
-  virtual void Wait() const {}
+  void Add(Container &&objs, Callback &&callback);
 
  protected:
   virtual void ClearCallback(const std::function<void()> &callback) = 0;
 
   platform::DeviceContext *dev_ctx_;
-  std::shared_ptr<std::deque<T *>> garbages_;
+  std::unique_ptr<GarbageQueue> garbages_;
   mutable std::mutex mutex_;
   const size_t max_memory_size_;
-  size_t cur_memory_size_ = 0;
+  size_t cur_memory_size_{0};
 };
 
-template <typename T>
-class CPUGarbageCollector : public GarbageCollector<T> {
+class CPUGarbageCollector : public GarbageCollector {
  public:
-  CPUGarbageCollector(const platform::CPUPlace &place, size_t max_memory_size)
-      : GarbageCollector<T>(place, max_memory_size) {}
+  CPUGarbageCollector(const platform::CPUPlace &place, size_t max_memory_size);
 
  protected:
-  void ClearCallback(const std::function<void()> &callback) override {
-    callback();
-  }
+  void ClearCallback(const std::function<void()> &callback) override;
 };
 
 #ifdef PADDLE_WITH_CUDA
-template <typename T>
-class DefaultStreamGarbageCollector : public GarbageCollector<T> {
+class UnsafeFastGPUGarbageCollector : public GarbageCollector {
  public:
-  DefaultStreamGarbageCollector(const platform::CUDAPlace &place,
-                                size_t max_memory_size)
-      : GarbageCollector<T>(place, max_memory_size) {}
+  UnsafeFastGPUGarbageCollector(const platform::CUDAPlace &place,
+                                size_t max_memory_size);
 
-  cudaStream_t stream() const {
-    return static_cast<const platform::CUDADeviceContext *>(this->dev_ctx_)
-        ->stream();
-  }
+ protected:
+  void ClearCallback(const std::function<void()> &callback) override;
+};
 
-  void Wait() const override {
-    this->dev_ctx_->Wait();
-    static_cast<const platform::CUDADeviceContext *>(this->dev_ctx_)
-        ->WaitStreamCallback();
-  }
+class DefaultStreamGarbageCollector : public GarbageCollector {
+ public:
+  DefaultStreamGarbageCollector(const platform::CUDAPlace &place,
+                                size_t max_memory_size);
+
+  void Wait() const override;
 
  protected:
-  void ClearCallback(const std::function<void()> &callback) override {
-    static_cast<platform::CUDADeviceContext *>(this->dev_ctx_)
-        ->AddStreamCallback(callback);
-  }
+  void ClearCallback(const std::function<void()> &callback) override;
 };
 
-template <typename T>
-class StreamGarbageCollector : public GarbageCollector<T> {
+class StreamGarbageCollector : public GarbageCollector {
  public:
   StreamGarbageCollector(const platform::CUDAPlace &place,
-                         size_t max_memory_size)
-      : GarbageCollector<T>(place, max_memory_size) {
-    PADDLE_ENFORCE(cudaSetDevice(place.device));
-    PADDLE_ENFORCE(cudaStreamCreate(&stream_));
-    callback_manager_.reset(new platform::StreamCallbackManager(stream_));
-  }
+                         size_t max_memory_size);
 
-  ~StreamGarbageCollector() {
-    auto place = boost::get<platform::CUDAPlace>(this->dev_ctx_->GetPlace());
-    PADDLE_ENFORCE(cudaSetDevice(place.device));
-    PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
-    PADDLE_ENFORCE(cudaStreamDestroy(stream_));
-  }
+  ~StreamGarbageCollector();
 
-  void Wait() const override {
-    PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
-    std::lock_guard<std::mutex> guard(this->mutex_);
-    callback_manager_->Wait();
-  }
+  void Wait() const override;
 
-  cudaStream_t stream() const { return stream_; }
+  cudaStream_t stream() const;
 
  protected:
-  void ClearCallback(const std::function<void()> &callback) override {
-    std::lock_guard<std::mutex> guard(this->mutex_);
-    callback_manager_->AddCallback(callback);
-  }
+  void ClearCallback(const std::function<void()> &callback) override;
 
  private:
   cudaStream_t stream_;
@@ -159,5 +98,33 @@ class StreamGarbageCollector : public GarbageCollector<T> {
 };
 #endif
 
+template <typename Container>
+void GarbageCollector::Add(Container &&objs) {
+  Add(std::forward<Container>(objs), []() {});
+}
+
+template <typename Container, typename Callback>
+void GarbageCollector::Add(Container &&objs, Callback &&callback) {
+  GarbageQueue *garbage_queue = nullptr;
+  {
+    std::lock_guard<std::mutex> guard(mutex_);
+    for (auto &obj : objs) {
+      if (!obj) continue;
+      cur_memory_size_ += obj->size();
+      garbages_->push_back(std::move(obj));
+    }
+    if (cur_memory_size_ >= max_memory_size_) {
+      cur_memory_size_ = 0;
+      garbage_queue = garbages_.release();
+      garbages_.reset(new GarbageQueue());
+    }
+  }
+
+  if (garbage_queue) {
+    callback();
+    ClearCallback([garbage_queue]() { delete garbage_queue; });
+  }
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc
index 449cc78be15bcd2575ce2e6846b41e475f8921f6..d4a701e0b173a96d8605dff308fee7007a0ecc0c 100644
--- a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc
@@ -46,14 +46,16 @@ std::unique_ptr<ir::Graph> ConvBiasFusePass::ApplyImpl(
   auto* scope = param_scope();
   PADDLE_ENFORCE(scope);
 
+  std::string type = is_conv3d() ? "conv3d" : "conv2d";
+
   GraphPatternDetector gpd;
   auto* conv_input =
       gpd.mutable_pattern()
           ->NewNode(patterns::PDNodeName(name_scope_, "conv_input"))
           ->AsInput()
-          ->assert_is_op_input("conv2d", "Input");
+          ->assert_is_op_input(type, "Input");
   patterns::ConvBias conv_bias_pattern(gpd.mutable_pattern(), name_scope_);
-  conv_bias_pattern(conv_input);
+  conv_bias_pattern(conv_input, is_conv3d());
   int found_conv_bias_count = 0;
   auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                      Graph* g) {
@@ -109,7 +111,7 @@ std::unique_ptr<ir::Graph> ConvBiasFusePass::ApplyImpl(
       desc.SetInput("Filter", std::vector<std::string>({conv_weight->Name()}));
       desc.SetInput("Bias", std::vector<std::string>({eltwise_bias->Name()}));
       desc.SetOutput("Output", std::vector<std::string>({eltwise_out->Name()}));
-      desc.SetType("conv2d");
+      desc.SetType(type);
 
       for (auto& attr : conv->Op()->GetAttrMap()) {
         desc.SetAttr(attr.first, attr.second);
@@ -135,3 +137,5 @@ std::unique_ptr<ir::Graph> ConvBiasFusePass::ApplyImpl(
 }  // namespace paddle
 REGISTER_PASS(conv_bias_mkldnn_fuse_pass,
               paddle::framework::ir::ConvBiasFusePass);
+REGISTER_PASS(conv3d_bias_mkldnn_fuse_pass,
+              paddle::framework::ir::Conv3DBiasFusePass);
diff --git a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h
index 5775b83b88730ec298c421a15f5c0b83c27b0750..f3ad9f1c2bf14db418629e0c607e2510f01908b8 100644
--- a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h
@@ -26,11 +26,19 @@ namespace ir {
 class ConvBiasFusePass : public FusePassBase {
  public:
   virtual ~ConvBiasFusePass() {}
+  virtual bool is_conv3d() const { return false; }
 
  protected:
   std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
   const std::string name_scope_{"conv_bias_mkldnn_fuse"};
 };
+/*
+* Fuse the Conv3D and Elementwise_add to a Conv3DBiasOp.
+*/
+class Conv3DBiasFusePass : public ConvBiasFusePass {
+ public:
+  bool is_conv3d() const override { return true; }
+};
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc
index fc91564bbaecf7b1725908fc1eb8b1e4d2e20d32..8679118fe28b1c68aea30caf711441823b5255c0 100644
--- a/paddle/fluid/framework/ir/graph.cc
+++ b/paddle/fluid/framework/ir/graph.cc
@@ -38,9 +38,8 @@ void CheckProgram(const ProgramDesc &program) {
     switch (role_id) {
       case _INT(OpRole::kForward):
         if (visit.find(_INT(OpRole::kBackward)) != visit.end()) {
-          LOG(ERROR)
-              << "Cannot add backward operator before forward operator %s."
-              << op->Type();
+          LOG(ERROR) << "Cannot add backward operator before forward operator "
+                     << op->Type();
         }
         break;
       case _INT(OpRole::kBackward):
diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h
index 947c934f0ff3e06e70f26cf9a9155e8d4b4a84ad..47fcf96a3f92b1f915e5254fff36feb8b2870730 100644
--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@@ -73,14 +73,21 @@ class Graph {
   }
 
   bool Has(const std::string &attr_name) const {
-    return attrs_.find(attr_name) != attrs_.end();
+    return attrs_.count(attr_name) > 0;
   }
 
   template <typename AttrType>
   AttrType &Get(const std::string &attr_name) const {
     PADDLE_ENFORCE(Has(attr_name), "%s attr not registered for graph.",
                    attr_name);
-    return *boost::any_cast<AttrType *>(attrs_.at(attr_name));
+    try {
+      return *boost::any_cast<AttrType *>(attrs_.at(attr_name));
+    } catch (boost::bad_any_cast &) {
+      PADDLE_THROW(
+          "Invalid attribute type of %s error, expected: %s, actual: %s",
+          attr_name, typeid(AttrType *).name(),
+          attrs_.at(attr_name).type().name());
+    }
   }
 
   template <typename AttrType>
@@ -177,14 +184,13 @@ class Graph {
     return nullptr;
   }
 
-  const ProgramDesc &program() const { return program_; }
-  std::map<std::string, std::vector<ir::Node *>> InitFromProgram(
-      const ProgramDesc &program);
-
   void ResolveHazard(
       const std::map<std::string, std::vector<ir::Node *>> &var_nodes);
 
  private:
+  std::map<std::string, std::vector<ir::Node *>> InitFromProgram(
+      const ProgramDesc &program);
+
   // This method takes ownership of `node`.
   ir::Node *AddNode(ir::Node *node) {
     PADDLE_ENFORCE(node_set_.find(node) == node_set_.end());
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index 258182b25a16d9135f55cfc300e2602d14f26d73..0118019df2f779a6409365555b530ae3b6d3971f 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -1030,10 +1030,11 @@ PDNode *patterns::ElewiseAddActInplaceGrad::operator()(
 }
 
 PDNode *patterns::ConvBias::operator()(
-    paddle::framework::ir::PDNode *conv_input) {
+    paddle::framework::ir::PDNode *conv_input, bool is_conv3d) {
+  std::string type = is_conv3d ? "conv3d" : "conv2d";
   // Create Operators
-  conv_input->assert_is_op_input("conv2d", "Input");
-  auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op("conv2d");
+  conv_input->assert_is_op_input(type, "Input");
+  auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op(type);
   auto *eltiwse_op =
       pattern->NewNode(eltwise_repr())->assert_is_op("elementwise_add");
   // Create variables
@@ -1041,11 +1042,11 @@ PDNode *patterns::ConvBias::operator()(
   auto *conv_weight_var = pattern->NewNode(conv_weight_repr())
                               ->AsInput()
                               ->assert_is_persistable_var()
-                              ->assert_is_op_input("conv2d", "Filter");
+                              ->assert_is_op_input(type, "Filter");
   // intermediate variable, will be removed in the IR after fuse.
   auto *conv_out_var = pattern->NewNode(conv_out_repr())
                            ->AsIntermediate()
-                           ->assert_is_only_output_of_op("conv2d")
+                           ->assert_is_only_output_of_op(type)
                            ->assert_is_op_input("elementwise_add");
   // Bias stored in elementwise_add
   auto *eltwise_bias_var = pattern->NewNode(eltwise_bias_repr())
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h
index c12b9503fd817757ec8d1e988be3e449fc63c6ff..d044802f22d02372e0ddb72c6fd702aebf2f82c3 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -623,7 +623,7 @@ struct ElewiseAddActInplaceGrad : public PatternBase {
 struct ConvBias : public PatternBase {
   ConvBias(PDPattern* pattern, const std::string& name_scope)
       : PatternBase(pattern, name_scope, "conv_bias") {}
-  PDNode* operator()(PDNode* conv_input);
+  PDNode* operator()(PDNode* conv_input, bool is_conv3d = false);
   // declare operator node's name
   PATTERN_DECL_NODE(conv);
   PATTERN_DECL_NODE(eltwise);
diff --git a/paddle/fluid/framework/ir/is_test_pass.cc b/paddle/fluid/framework/ir/is_test_pass.cc
index 292f232ffce48593e1827fe2dfe1b8472360054e..57cc98e2ca0175848aa62c62c8ad3b20594b3bde 100644
--- a/paddle/fluid/framework/ir/is_test_pass.cc
+++ b/paddle/fluid/framework/ir/is_test_pass.cc
@@ -38,7 +38,7 @@ std::unique_ptr<ir::Graph> IsTestPass::ApplyImpl(
   for (const Node* n : graph->Nodes()) {
     if (n->IsOp()) {
       auto* op = n->Op();
-      if (op->HasAttr("is_test")) {
+      if (op->HasAttr("is_test") || op->HasProtoAttr("is_test")) {
         op->SetAttr("is_test", true);
       } else if (std::find(begin(op_list), end(op_list), op->Type()) !=
                  end(op_list)) {
diff --git a/paddle/fluid/framework/ir/mkldnn_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn_placement_pass.cc
index 65be69b7f5b5e363d5d0753c45f9ff9e3f329fbe..951fcb066ce759ebfec0182e1e9dca887e343170 100644
--- a/paddle/fluid/framework/ir/mkldnn_placement_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn_placement_pass.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/ir/mkldnn_placement_pass.h"
+#include <string>
 
 namespace paddle {
 namespace framework {
@@ -21,9 +22,19 @@ namespace ir {
 std::unique_ptr<ir::Graph> MKLDNNPlacementPass::ApplyImpl(
     std::unique_ptr<ir::Graph> graph) const {
   VLOG(3) << "Aplies MKL-DNN placement strategy.";
+  const auto& op_types_list =
+      Get<std::unordered_set<std::string>>("mkldnn_enabled_op_types");
   for (const Node* n : graph->Nodes()) {
-    if (n->IsOp() && n->Op()->HasAttr("use_mkldnn")) {
-      n->Op()->SetAttr("use_mkldnn", true);
+    if (n->IsOp()) {
+      auto* op = n->Op();
+      if (op->HasAttr("use_mkldnn") || op->HasProtoAttr("use_mkldnn")) {
+        if (op_types_list.empty()) {
+          op->SetAttr("use_mkldnn", true);
+        } else if (std::find(op_types_list.begin(), op_types_list.end(),
+                             n->Name()) != op_types_list.end()) {
+          op->SetAttr("use_mkldnn", true);
+        }
+      }
     }
   }
   return graph;
@@ -33,5 +44,5 @@ std::unique_ptr<ir::Graph> MKLDNNPlacementPass::ApplyImpl(
 }  // namespace framework
 }  // namespace paddle
 
-REGISTER_PASS(mkldnn_placement_pass,
-              paddle::framework::ir::MKLDNNPlacementPass);
+REGISTER_PASS(mkldnn_placement_pass, paddle::framework::ir::MKLDNNPlacementPass)
+    .RequirePassAttr("mkldnn_enabled_op_types");
diff --git a/paddle/fluid/framework/ir/node.cc b/paddle/fluid/framework/ir/node.cc
index 50d9113088903aa7681d6c6af5cc65f846d32787..eac67108e2106e986cbe1255a64c956153bc5560 100644
--- a/paddle/fluid/framework/ir/node.cc
+++ b/paddle/fluid/framework/ir/node.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/ir/node.h"
+#include "paddle/fluid/framework/op_info.h"
 
 namespace paddle {
 namespace framework {
@@ -24,10 +25,11 @@ constexpr char Node::kControlDepVarName[];
 const char Node::kControlDepVarName[] = "__control_var";
 #endif
 
-std::unique_ptr<Node> CreateNodeForTest(const std::string& name,
+std::unique_ptr<Node> CreateNodeForTest(const std::string &name,
                                         Node::Type type) {
   return std::unique_ptr<Node>(new Node(name, type));
 }
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h
index a3559247db6703d486ed01ce9f2058e671443096..27746ff1453b1b336da8c31497c066c338843b68 100644
--- a/paddle/fluid/framework/ir/pass.h
+++ b/paddle/fluid/framework/ir/pass.h
@@ -51,11 +51,18 @@ class Pass {
   AttrType &Get(const std::string &attr_name) const {
     PADDLE_ENFORCE(attrs_.find(attr_name) != attrs_.end(),
                    "%s attr not registered for pass.", attr_name);
-    return *boost::any_cast<AttrType *>(attrs_.at(attr_name));
+    try {
+      return *boost::any_cast<AttrType *>(attrs_.at(attr_name));
+    } catch (boost::bad_any_cast &) {
+      PADDLE_THROW(
+          "Invalid attribute type of %s error, expected: %s, actual: %s",
+          attr_name, typeid(AttrType *).name(),
+          attrs_.at(attr_name).type().name());
+    }
   }
 
   bool Has(const std::string &attr_name) const {
-    return attrs_.find(attr_name) != attrs_.end();
+    return attrs_.count(attr_name) > 0;
   }
 
   void Erase(const std::string &attr_name) {
diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc
index e8295639520b5838dce3c9c9e443cc846bd9c1ec..f1642bc0d2b10f97295e80ee201db8f83bfd06ef 100644
--- a/paddle/fluid/framework/naive_executor.cc
+++ b/paddle/fluid/framework/naive_executor.cc
@@ -21,42 +21,11 @@
 #include "paddle/fluid/framework/naive_executor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/reader.h"
+#include "paddle/fluid/framework/variable_helper.h"
 #include "paddle/fluid/string/pretty_log.h"
 
 namespace paddle {
 namespace framework {
-
-// These code can be shared with Executor.
-static void InitializeVariable(Variable *var, proto::VarType::Type var_type) {
-  if (var_type == proto::VarType::LOD_TENSOR) {
-    var->GetMutable<LoDTensor>();
-  } else if (var_type == proto::VarType::SELECTED_ROWS) {
-    var->GetMutable<SelectedRows>();
-  } else if (var_type == proto::VarType::FEED_MINIBATCH) {
-    var->GetMutable<FeedFetchList>();
-  } else if (var_type == proto::VarType::FETCH_LIST) {
-    var->GetMutable<FeedFetchList>();
-  } else if (var_type == proto::VarType::STEP_SCOPES) {
-    var->GetMutable<std::vector<framework::Scope *>>();
-  } else if (var_type == proto::VarType::LOD_RANK_TABLE) {
-    var->GetMutable<LoDRankTable>();
-  } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) {
-    var->GetMutable<LoDTensorArray>();
-  } else if (var_type == proto::VarType::PLACE_LIST) {
-    var->GetMutable<platform::PlaceList>();
-  } else if (var_type == proto::VarType::READER) {
-    var->GetMutable<ReaderHolder>();
-  } else if (var_type == proto::VarType::RAW) {
-    // GetMutable will be called in operator
-  } else {
-    PADDLE_THROW(
-        "Variable type %d is not in "
-        "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, "
-        "LOD_RANK_TABLE, PLACE_LIST, READER, CHANNEL, RAW]",
-        var_type);
-  }
-}
-
 void NaiveExecutor::Prepare(Scope *scope, const ProgramDesc &program_desc,
                             int block_id, bool with_feed_fetch_ops) {
   if (!scope) {
diff --git a/paddle/fluid/framework/ngraph_bridge.cc b/paddle/fluid/framework/ngraph_bridge.cc
index 8177436d0bd90c3bcf8f91d5c55b66be188b19f9..a5acfd70449e92663cb66ef90a141c087ff6ec88 100644
--- a/paddle/fluid/framework/ngraph_bridge.cc
+++ b/paddle/fluid/framework/ngraph_bridge.cc
@@ -12,28 +12,109 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifdef PADDLE_WITH_NGRAPH
 #include <algorithm>
 #include <functional>
+#include <vector>
 
 #include "paddle/fluid/framework/ngraph_bridge.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/platform/enforce.h"
 
 #include "ngraph/ngraph.hpp"
 
 namespace paddle {
 namespace framework {
 
+static std::shared_ptr<ngraph::Node> GetNode(
+    const std::shared_ptr<OperatorBase>& op, const std::string name,
+    const VariableNameMap& var_map,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto& var_names = var_map.at(name);
+  PADDLE_ENFORCE_EQ(var_names.size(), 1,
+                    "op %s name %s expects one associated var", op->Type(),
+                    name);
+  if (ngb_node_map->find(var_names[0]) != ngb_node_map->end()) {
+    return (*ngb_node_map)[var_names[0]];
+  } else {
+    return nullptr;
+  }
+}
+
+static std::shared_ptr<ngraph::Node> GetInputNode(
+    const std::shared_ptr<OperatorBase>& op, const std::string name,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  return GetNode(op, name, op->Inputs(), ngb_node_map);
+}
+
+static std::shared_ptr<ngraph::Node> GetOutputNode(
+    const std::shared_ptr<OperatorBase>& op, const std::string name,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  return GetNode(op, name, op->Outputs(), ngb_node_map);
+}
+
+static void SetOutputNode(
+    const std::shared_ptr<OperatorBase>& op, const std::string name,
+    std::shared_ptr<ngraph::Node> node,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto& var_names = op->Outputs().at(name);
+  if (var_names.size() == 1) {
+    (*ngb_node_map)[var_names[0]] = node;
+  } else if (var_names.size() == 0) {
+    (*ngb_node_map)[""] = node;
+  } else {
+    PADDLE_THROW("name %s has more than 1 var_names.", name);
+  }
+}
+
+static bool HasOutput(const std::shared_ptr<OperatorBase>& op,
+                      const std::string name) {
+  auto& outputs = op->Outputs();
+  if (outputs.find(name) == outputs.end()) return false;
+  return outputs.at(name).size() > 0;
+}
+
+template <typename T>
+static void BuildBinaryNode(
+    const std::shared_ptr<OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto x = GetInputNode(op, "X", ngb_node_map);
+  auto y = GetInputNode(op, "Y", ngb_node_map);
+  auto out = std::make_shared<T>(x, y);
+  SetOutputNode(op, "Out", out, ngb_node_map);
+}
+
+template <typename T>
+static void BuildUnaryNode(
+    const std::shared_ptr<OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto input = GetInputNode(op, "X", ngb_node_map);
+  auto out = std::make_shared<T>(input);
+  SetOutputNode(op, "Out", out, ngb_node_map);
+}
+
 std::map<std::string,
          std::function<void(const std::shared_ptr<OperatorBase>&,
                             std::shared_ptr<std::unordered_map<
                                 std::string, std::shared_ptr<ngraph::Node>>>)>>
-    NgraphBridge::NG_NODE_MAP = {};
+    NgraphBridge::NG_NODE_MAP = {{"relu", BuildUnaryNode<ngraph::op::Relu>},
+                                 {"tanh", BuildUnaryNode<ngraph::op::Tanh>}};
 
-void NgraphBridge::build_graph(const std::shared_ptr<OperatorBase>& op) {
+void NgraphBridge::BuildNgNode(const std::shared_ptr<OperatorBase>& op) {
   auto& op_type = op->Type();
-  NG_NODE_MAP[op_type](op, ngb_node_map);
+  NG_NODE_MAP[op_type](op, ngb_node_map_);
 }
 
 }  // namespace framework
 }  // namespace paddle
-#endif
diff --git a/paddle/fluid/framework/ngraph_bridge.h b/paddle/fluid/framework/ngraph_bridge.h
index 55bf0d21f3471013b1fb780e852d813313345f03..5ad7b8daeb6a782515e50fc87ca7188b46308390 100644
--- a/paddle/fluid/framework/ngraph_bridge.h
+++ b/paddle/fluid/framework/ngraph_bridge.h
@@ -14,22 +14,18 @@ limitations under the License. */
 
 #pragma once
 
-#ifdef PADDLE_WITH_NGRAPH
-
 #include <algorithm>
 #include <map>
 #include <string>
 #include <unordered_map>
-#include <vector>
-
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/platform/enforce.h"
 
-#include "ngraph/ngraph.hpp"
+#include "ngraph/node.hpp"
 
 namespace paddle {
 namespace framework {
 
+class OperatorBase;
+
 class NgraphBridge {
  public:
   static std::map<
@@ -43,16 +39,15 @@ class NgraphBridge {
       std::shared_ptr<
           std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
           var_node_map)
-      : ngb_node_map(var_node_map) {}
+      : ngb_node_map_(var_node_map) {}
 
-  void build_graph(const std::shared_ptr<OperatorBase>& op);
+  void BuildNgNode(const std::shared_ptr<OperatorBase>& op);
 
  private:
   std::shared_ptr<
       std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-      ngb_node_map;
+      ngb_node_map_;
 };
 
 }  // namespace framework
 }  // namespace paddle
-#endif
diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc
index d967b2780c21713a2f9a73a3402964103f44269e..253de4c61160e52202a0192215a93284f27e5896 100644
--- a/paddle/fluid/framework/ngraph_operator.cc
+++ b/paddle/fluid/framework/ngraph_operator.cc
@@ -12,21 +12,35 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifdef PADDLE_WITH_NGRAPH
 #include <glog/logging.h>
 
 #include <algorithm>
 #include <map>
 
 #include "paddle/fluid/framework/feed_fetch_type.h"
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/ngraph_bridge.h"
 #include "paddle/fluid/framework/ngraph_operator.h"
-#include "paddle/fluid/framework/shape_inference.h"
+#include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/var_desc.h"
 #include "paddle/fluid/framework/var_type.h"
 
+#include "ngraph/ngraph.hpp"
+
 namespace paddle {
 namespace framework {
 
+static ngraph::Shape Ddim2Shape(const DDim& dims) {
+  ngraph::Shape sp;
+  for (int i = 0; i < dims.size(); ++i) {
+    int k = dims[i];
+    k = k == 0 ? 1 : k;
+    sp.push_back(k);
+  }
+  return sp;
+}
+
 static std::map<proto::VarType::Type, ngraph::element::Type> pd2ng_type_map = {
     {proto::VarType::FP32, ngraph::element::f32},
     {proto::VarType::FP64, ngraph::element::f64},
@@ -42,16 +56,17 @@ typedef enum {                /* nGraph support state on ops          */
                PARTIAL_TEST   /* Support partial list of ops for test */
 } op_state;
 
-class NgraphOperator {
+// perform graph build through bridge and execute computation
+class NgraphEngine {
  public:
-  explicit NgraphOperator(const Scope& scope, const platform::Place& place,
-                          const std::vector<std::shared_ptr<OperatorBase>>& ops,
-                          const std::unordered_map<
-                              std::string, ngraph::element::Type>& var_type_map,
-                          const std::unordered_set<std::string>& persist,
-                          const std::unordered_set<std::string>& fetches,
-                          const std::unordered_set<std::string>& post_op_inputs,
-                          op_state ng_op_state)
+  explicit NgraphEngine(const Scope& scope, const platform::Place& place,
+                        const std::vector<std::shared_ptr<OperatorBase>>& ops,
+                        const std::unordered_map<
+                            std::string, ngraph::element::Type>& var_type_map,
+                        const std::unordered_set<std::string>& persist,
+                        const std::unordered_set<std::string>& fetches,
+                        const std::unordered_set<std::string>& post_op_inputs,
+                        op_state ng_op_state)
       : scope_(scope),
         place_(place),
         fused_ops_(ops),
@@ -59,13 +74,23 @@ class NgraphOperator {
         persistables_(persist),
         fetches_(fetches),
         post_op_inputs_(post_op_inputs),
-        ng_op_state_(ng_op_state) {}
+        ng_op_state_(ng_op_state) {
+    var_in_node_map_ = std::make_shared<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>();
+
+    var_node_map_ = std::make_shared<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>();
+
+    BuildNgIO();
+
+    GetNgFunction();
+  }
 
   void Run(const Scope& scope, const platform::Place& place) const;
 
  private:
   static std::unordered_map<std::string, std::shared_ptr<ngraph::Function>>
-      func_cache;
+      func_cache_;
   const Scope& scope_;
   const platform::Place& place_;
   std::vector<std::shared_ptr<OperatorBase>> fused_ops_;
@@ -74,10 +99,39 @@ class NgraphOperator {
   std::unordered_set<std::string> fetches_;
   std::unordered_set<std::string> post_op_inputs_;
   op_state ng_op_state_;
+
+  // ngraph backend eg. CPU
+  static std::shared_ptr<ngraph::runtime::Backend> backend_;
+  // ngraph function to call and execute
+  std::shared_ptr<ngraph::Function> ngraph_function_;
+  // var_name of inputs
+  std::vector<std::string> var_in_;
+  // var_name of outputs from  fetch in order
+  std::vector<std::string> var_out_;
+  // map input vars to nodes
+  std::shared_ptr<
+      std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+      var_in_node_map_;
+  // map each var name with a ngraph node
+  std::shared_ptr<
+      std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+      var_node_map_;
+  // cache key to check if function is cached
+  std::shared_ptr<std::string> GetCacheKey();
+  // get ngraph input and define ngraph input parameters
+  void GetNgInputShape(std::shared_ptr<OperatorBase> op);
+  // Call ngraph bridge to map ops
+  void BuildNgNodes();
+  // get the ngraph input and output var list
+  void BuildNgIO();
+  // build ngraph function call
+  void BuildNgFunction();
+  // Check cache for ngraph function or otherwise build the function
+  void GetNgFunction();
 };
 
 std::vector<std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>>
-FusedOperator::FusedOpIntervals(
+NgraphOperator::NgraphOpIntervals(
     std::vector<std::unique_ptr<paddle::framework::OperatorBase>>* ops) {
   std::vector<std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>>
       intervals;
@@ -86,7 +140,7 @@ FusedOperator::FusedOpIntervals(
   }
   size_t size = ops->size();
   size_t left = 0;
-  while (left < size && ops.at(left)->Type() != kFeedOpType) {
+  while (left < size && ops->at(left)->Type() != kFeedOpType) {
     ++left;
   }
   if (left == size) {
@@ -116,7 +170,7 @@ FusedOperator::FusedOpIntervals(
       size_t start = pivot, end = start;
       while (pivot < right &&
              (paddle::framework::NgraphBridge::NG_NODE_MAP.find(
-                  ops.at(pivot)->Type()) !=
+                  ops->at(pivot)->Type()) !=
               paddle::framework::NgraphBridge::NG_NODE_MAP.end())) {
         ++pivot;
         ++end;
@@ -130,13 +184,15 @@ FusedOperator::FusedOpIntervals(
   return intervals;
 }
 
-FusedOperator::FusedOperator(
+NgraphOperator::NgraphOperator(
     const ProgramDesc& prog, size_t block_id,
     std::vector<std::unique_ptr<OperatorBase>>::iterator start,
     std::vector<std::unique_ptr<OperatorBase>>::iterator end,
     const std::string& type, const VariableNameMap& inputs,
     const VariableNameMap& outputs, const AttributeMap& attrs)
-    : OperatorBase(type, inputs, outputs, attrs), pdesc(prog), block(block_id) {
+    : OperatorBase(type, inputs, outputs, attrs),
+      pdesc_(prog),
+      block_(block_id) {
   for (std::vector<std::unique_ptr<OperatorBase>>::iterator it = start;
        it != end; ++it) {
     fused_ops_.push_back(std::move(*it));
@@ -152,13 +208,13 @@ FusedOperator::FusedOperator(
   }
 
   if ((*(start - 1))->Type() == kFeedOpType && (*end)->Type() == kFetchOpType) {
-    is_complete = true;
+    is_full_ = true;
   }
 
   Process();
 }
 
-void FusedOperator::Process() {
+void NgraphOperator::Process() {
   auto& bdesc = pdesc_.Block(block_);
   for (auto& var : bdesc.AllVars()) {
     if (!(var->GetType() == proto::VarType::SELECTED_ROWS ||
@@ -194,8 +250,8 @@ void FusedOperator::Process() {
   }
 }
 
-void FusedOperator::RunImpl(const Scope& scope,
-                            const platform::Place& place) const {
+void NgraphOperator::RunImpl(const Scope& scope,
+                             const platform::Place& place) const {
   op_state ng_op_state = PARTIAL_TEST;
   auto& bdesc = pdesc_.Block(block_);
   for (auto* op : bdesc.AllOps()) {
@@ -205,16 +261,288 @@ void FusedOperator::RunImpl(const Scope& scope,
     }
   }
 
-  if (is_full) {
+  if (is_full_) {
     ng_op_state = ng_op_state == PARTIAL_TEST ? FULL_TEST : FULL_TRAIN;
   }
 
-  NgraphOperator ngraph_op(scope, place, fused_ops_, var_type_map_,
-                           persistables_, fetches_, post_op_inputs_,
-                           ng_op_state);
-  ngraph_op.Run(scope, place);
+  NgraphEngine ngraph_engine(scope, place, fused_ops_, var_type_map_,
+                             persistables_, fetches_, post_op_inputs_,
+                             ng_op_state);
+  ngraph_engine.Run(scope, place);
+}
+
+std::unordered_map<std::string, std::shared_ptr<ngraph::Function>>
+    NgraphEngine::func_cache_ = {};
+
+std::shared_ptr<ngraph::runtime::Backend> NgraphEngine::backend_ =
+    ngraph::runtime::Backend::create("CPU");
+
+void NgraphEngine::GetNgInputShape(std::shared_ptr<OperatorBase> op) {
+  op->RuntimeInferShape(scope_, place_);
+  for (auto& var_name_item : op->Inputs()) {
+    for (auto& var_name : var_name_item.second) {
+      auto* var = scope_.FindVar(var_name);
+      if (var && var->IsType<LoDTensor>()) {
+        auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
+        auto sp = Ddim2Shape(tensor_pd->dims());
+        if (std::find(var_in_.begin(), var_in_.end(), var_name) !=
+            var_in_.end()) {
+          if (var_node_map_->find(var_name) == var_node_map_->end()) {
+            auto ng_type = var_type_map_.at(var_name);
+            auto prm =
+                std::make_shared<ngraph::op::Parameter>(ng_type, sp, true);
+            (*var_node_map_)[var_name] = prm;
+            (*var_in_node_map_)[var_name] = prm;
+          }
+        }
+      }
+    }
+  }
+}
+
+void NgraphEngine::BuildNgNodes() {
+  for (auto& var_name : var_out_) {
+    if (var_node_map_->find(var_name) == var_node_map_->end()) {
+      auto* var = scope_.FindVar(var_name);
+      if (var && var->IsType<LoDTensor>()) {
+        auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
+        auto& ddim = tensor_pd->dims();
+        auto ng_shape = Ddim2Shape(ddim);
+        auto ng_type = var_type_map_.at(var_name);
+        auto prm =
+            std::make_shared<ngraph::op::Parameter>(ng_type, ng_shape, true);
+        (*var_node_map_)[var_name] = prm;
+      }
+    }
+  }
+
+  paddle::framework::NgraphBridge ngb(var_node_map_);
+  for (auto& op : fused_ops_) {
+    ngb.BuildNgNode(op);
+  }
+}
+
+void NgraphEngine::BuildNgIO() {
+  std::unordered_set<std::string> inputs;
+  std::unordered_set<std::string> outputs;
+
+  for (auto& op : fused_ops_) {
+    for (auto& var_name_item : op->Inputs()) {
+      for (auto& var_name : var_name_item.second) {
+        inputs.insert(var_name);
+        const bool is_output = outputs.find(var_name) != outputs.end();
+        if (!is_output &&
+            std::find(var_in_.begin(), var_in_.end(), var_name) ==
+                var_in_.end()) {
+          // fill var_in here to keep lhs and rhs order
+          var_in_.push_back(var_name);
+        }
+      }
+    }
+
+    if (op->Type() != "fill_constant") {
+      GetNgInputShape(op);
+    }
+
+    for (auto& var_name_item : op->Outputs()) {
+      PADDLE_ENFORCE_LE(var_name_item.second.size(), 1,
+                        "op %s has more than 1 output - Not handling yet",
+                        op->Type());
+      for (auto& var_name : var_name_item.second) {
+        outputs.insert(var_name);
+      }
+    }
+  }
+
+  // var_out.clear();
+  for (auto& op : fused_ops_) {
+    for (auto& var_name_item : op->Outputs()) {
+      PADDLE_ENFORCE_LE(var_name_item.second.size(), 1,
+                        "op %s has more than 1 output - Not handling yet",
+                        op->Type());
+      for (auto& var_name : var_name_item.second) {
+        switch (ng_op_state_) {
+          case PARTIAL_TEST:
+            if (post_op_inputs_.find(var_name) != post_op_inputs_.end() ||
+                fetches_.find(var_name) != fetches_.end()) {
+              var_out_.push_back(var_name);
+            }
+            break;
+          case FULL_TEST:
+            if (fetches_.find(var_name) != fetches_.end()) {
+              var_out_.push_back(var_name);
+            }
+            break;
+          case PARTIAL_TRAIN:
+            if (fetches_.find(var_name) != fetches_.end() ||
+                post_op_inputs_.find(var_name) != post_op_inputs_.end() ||
+                persistables_.find(var_name) != persistables_.end()) {
+              var_out_.push_back(var_name);
+            }
+            break;
+          case FULL_TRAIN:
+            if (fetches_.find(var_name) != fetches_.end() ||
+                persistables_.find(var_name) != persistables_.end()) {
+              var_out_.push_back(var_name);
+            }
+            break;
+          default:
+            var_out_.push_back(var_name);
+        }
+      }
+    }
+  }
 }
 
+void NgraphEngine::BuildNgFunction() {
+  BuildNgNodes();
+  ngraph_function_ = nullptr;
+  ngraph::NodeVector func_outputs;
+  ngraph::op::ParameterVector func_inputs;
+
+  for (auto& vo : var_out_) {
+    func_outputs.push_back(var_node_map_->at(vo));
+  }
+
+  for (auto& vi : var_in_) {
+    std::shared_ptr<ngraph::op::Parameter> prm =
+        std::dynamic_pointer_cast<ngraph::op::Parameter>(
+            var_in_node_map_->at(vi));
+    func_inputs.push_back(prm);
+  }
+
+  ngraph_function_ =
+      std::make_shared<ngraph::Function>(func_outputs, func_inputs);
+}
+
+std::shared_ptr<std::string> NgraphEngine::GetCacheKey() {
+  auto cache_key = std::make_shared<std::string>("");
+  *cache_key += std::to_string(fused_ops_.size());
+  for (auto& op : fused_ops_) {
+    *cache_key += op->Type();
+  }
+  for (auto& var_name : var_in_) {
+    auto shape = var_node_map_->at(var_name)->get_shape();
+    *cache_key += var_name;
+    *cache_key += var_type_map_.at(var_name).c_type_string();
+    for (size_t i = 0; i < shape.size(); ++i) {
+      *cache_key += std::to_string(shape.at(i));
+    }
+  }
+
+  for (auto& var_name : var_out_) {
+    auto* var = scope_.FindVar(var_name);
+    if (var && var->IsType<LoDTensor>()) {
+      auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
+      auto& ddim = tensor_pd->dims();
+      for (int i = 0; i < ddim.size(); ++i) {
+        *cache_key += std::to_string(ddim[i]);
+      }
+    }
+  }
+  return cache_key;
+}
+
+void NgraphEngine::GetNgFunction() {
+  bool cache_on = true;
+  if (cache_on) {
+    std::string cache_key_val = *GetCacheKey();
+    if (func_cache_.find(cache_key_val) != func_cache_.end()) {
+      ngraph_function_ = func_cache_.at(cache_key_val);
+    } else {
+      BuildNgFunction();
+      func_cache_[cache_key_val] = ngraph_function_;
+    }
+  } else {
+    BuildNgFunction();
+  }
+}
+
+void NgraphEngine::Run(const Scope& scope, const platform::Place& place) const {
+  std::vector<std::shared_ptr<ngraph::runtime::Tensor>> t_in;
+  std::vector<std::shared_ptr<ngraph::runtime::Tensor>> t_out;
+
+  for (size_t i = 0; i < var_in_.size(); ++i) {
+    auto vi = var_in_.at(i);
+    auto sp = var_node_map_->at(vi)->get_shape();
+    std::shared_ptr<ngraph::runtime::Tensor> ti;
+    auto* var = scope.FindVar(vi);
+    if (var && var->IsType<LoDTensor>()) {
+      auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
+      PADDLE_ENFORCE(sp == Ddim2Shape(tensor_pd->dims()),
+                     "Ensure ngraph tensor layout align with paddle tensor");
+      if (tensor_pd->type().hash_code() ==
+          typeid(float).hash_code()) {  // NOLINT
+        const float* arr = tensor_pd->data<float>();
+        ti = backend_->create_tensor(ngraph::element::f32, sp,
+                                     const_cast<float*>(arr));
+      } else if (tensor_pd->type().hash_code() ==
+                 typeid(int).hash_code()) {  // NOLINT
+        const int* arr = tensor_pd->data<int>();
+        ti = backend_->create_tensor(ngraph::element::i32, sp,
+                                     const_cast<int*>(arr));
+      } else if (tensor_pd->type().hash_code() == typeid(int64_t).hash_code()) {
+        const int64_t* arr = tensor_pd->data<int64_t>();
+        ti = backend_->create_tensor(ngraph::element::i64, sp,
+                                     const_cast<int64_t*>(arr));
+      } else if (tensor_pd->type().hash_code() ==
+                 typeid(double).hash_code()) {  // NOLINT
+        const double* arr = tensor_pd->data<double>();
+        ti = backend_->create_tensor(ngraph::element::f64, sp,
+                                     const_cast<double*>(arr));
+      } else if (tensor_pd->type().hash_code() ==
+                 typeid(bool).hash_code()) {  // NOLINT
+        const bool* arr = tensor_pd->data<bool>();
+        ti = backend_->create_tensor(ngraph::element::boolean, sp,
+                                     const_cast<bool*>(arr));
+      } else {
+        PADDLE_THROW("Data type not handling for var %s", vi);
+      }
+    } else {
+      PADDLE_THROW("Cannot find var or tensor with var name %s", vi);
+    }
+    bool is_test = (ng_op_state_ == PARTIAL_TEST || ng_op_state_ == FULL_TEST)
+                       ? true
+                       : false;
+    bool is_persistable =
+        (persistables_.find(vi) != persistables_.end()) ? true : false;
+    if (is_test && is_persistable) {
+      ti->set_stale(false);
+    }
+    t_in.push_back(ti);
+  }
+
+  for (size_t i = 0; i < var_out_.size(); ++i) {
+    auto var_name = var_out_[i];
+    auto* var = scope.FindVar(var_name);
+    std::shared_ptr<ngraph::runtime::Tensor> to;
+    if (var && var->IsType<LoDTensor>()) {
+      auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var);
+      auto dd = tensor_pd->dims();
+      ngraph::Shape sp = Ddim2Shape(dd);
+      auto ng_type = var_type_map_.at(var_name);
+      if (ng_type == ngraph::element::f32) {
+        auto pd_arr = tensor_pd->mutable_data<float>(place);
+        to = backend_->create_tensor(ngraph::element::f32, sp, pd_arr);
+      } else if (ng_type == ngraph::element::i64) {
+        auto pd_arr = tensor_pd->mutable_data<int64_t>(place);
+        to = backend_->create_tensor(ngraph::element::i64, sp, pd_arr);
+      } else if (ng_type == ngraph::element::f64) {
+        auto pd_arr = tensor_pd->mutable_data<double>(place);
+        to = backend_->create_tensor(ngraph::element::f64, sp, pd_arr);
+      } else if (ng_type == ngraph::element::boolean) {
+        auto pd_arr = tensor_pd->mutable_data<bool>(place);
+        to = backend_->create_tensor(ngraph::element::boolean, sp, pd_arr);
+      } else {
+        PADDLE_THROW("Data type not handled in for var %s", var_name);
+      }
+      t_out.push_back(to);
+    } else {
+      PADDLE_THROW("Cannot find var or tensor with var name %s", var_name);
+    }
+  }
+
+  backend_->call(ngraph_function_, t_out, t_in);
+}  // NgraphEngine::RunImpl
 }  // namespace framework
 }  // namespace paddle
-#endif
diff --git a/paddle/fluid/framework/ngraph_operator.h b/paddle/fluid/framework/ngraph_operator.h
index 0f655cef1dde624bcf4944b5c096279097e1c8ae..ede80f44bea208b66acc3b3f4bc0f4adee4fb860 100644
--- a/paddle/fluid/framework/ngraph_operator.h
+++ b/paddle/fluid/framework/ngraph_operator.h
@@ -14,39 +14,32 @@ limitations under the License. */
 
 #pragma once
 
-#ifdef PADDLE_WITH_NGRAPH
-
 #include <algorithm>
-#include <atomic>
 #include <string>
 #include <unordered_map>
 #include <vector>
 
 #include "paddle/fluid/framework/attribute.h"
-#include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/ngraph_bridge.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/op_kernel_type.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/variant.h"
 
-#include "ngraph/ngraph.hpp"
+#include "ngraph/type/element_type.hpp"
 
 namespace paddle {
 namespace framework {
 
-class FusedOperator : public OperatorBase {
+class NgraphOperator : public OperatorBase {
  public:
   static std::vector<
       std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>>
-  FusedOpIntervals(
+  NgraphOpIntervals(
       std::vector<std::unique_ptr<paddle::framework::OperatorBase>>* ops);
 
-  explicit FusedOperator(
+  explicit NgraphOperator(
       const ProgramDesc& prog, size_t block_id,
       std::vector<std::unique_ptr<OperatorBase>>::iterator start,
       std::vector<std::unique_ptr<OperatorBase>>::iterator end,
@@ -69,4 +62,3 @@ class FusedOperator : public OperatorBase {
 };
 }  // namespace framework
 }  // namespace paddle
-#endif
diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc
index e8ecd90502933a049cc8f886212579fc061d44ff..dde642764fa5dfce11edcef51ad1be11be331fbc 100644
--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@@ -237,6 +237,23 @@ void OpDesc::SetOutput(const std::string &param_name,
   this->outputs_[param_name] = args;
 }
 
+bool OpDesc::HasProtoAttr(const std::string &name) const {
+  auto &op_info = OpInfoMap::Instance();
+  if (op_info.Has(desc_.type())) {
+    auto op_info_ptr = op_info.Get(desc_.type());
+    if (op_info_ptr.HasOpProtoAndChecker()) {
+      const proto::OpProto &proto = op_info_ptr.Proto();
+      for (int i = 0; i != proto.attrs_size(); ++i) {
+        const proto::OpProto::Attr &attr = proto.attrs(i);
+        if (attr.name() == name) {
+          return true;
+        }
+      }
+    }
+  }
+  return false;
+}
+
 proto::AttrType OpDesc::GetAttrType(const std::string &name) const {
   auto it = attrs_.find(name);
   PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h
index 30c8a26c3d2f0068674aa70b4ff875a2f73c1dca..e8debec7f13706b7fc5a4882d237ee2257e53b7e 100644
--- a/paddle/fluid/framework/op_desc.h
+++ b/paddle/fluid/framework/op_desc.h
@@ -65,6 +65,8 @@ class OpDesc {
     return attrs_.find(name) != attrs_.end();
   }
 
+  bool HasProtoAttr(const std::string &name) const;
+
   proto::AttrType GetAttrType(const std::string &name) const;
 
   std::vector<std::string> AttrNames() const;
diff --git a/paddle/fluid/framework/op_kernel_type.cc b/paddle/fluid/framework/op_kernel_type.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6d4801e4a0eed7083e671e1d49b8628dfb280cf9
--- /dev/null
+++ b/paddle/fluid/framework/op_kernel_type.cc
@@ -0,0 +1,54 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/op_kernel_type.h"
+
+namespace paddle {
+namespace framework {
+
+size_t OpKernelType::Hash::operator()(const OpKernelType& key) const {
+  int cur_loc = 0;
+
+  int place = key.place_.which();
+  cur_loc += OpKernelType::kPlaceBits;
+
+  int data_type = static_cast<int>(key.data_type_) << cur_loc;
+  cur_loc += OpKernelType::kPrimaryDTypeBits;
+
+  int data_layout = static_cast<int>(key.data_layout_) << cur_loc;
+  cur_loc += OpKernelType::kLayoutBits;
+
+  int library_type = static_cast<int>(key.library_type_) << cur_loc;
+  cur_loc += OpKernelType::kLibBits;
+
+  int customized_value = key.customized_type_value_;
+  PADDLE_ENFORCE(customized_value < (1 << OpKernelType::kCustomizeBits));
+  customized_value = customized_value << cur_loc;
+  cur_loc += OpKernelType::kCustomizeBits;
+  PADDLE_ENFORCE(cur_loc < 64);
+
+  std::hash<int> hasher;
+  return hasher(place + data_type + data_layout + library_type +
+                customized_value);
+}
+
+bool OpKernelType::operator==(const OpKernelType& o) const {
+  return platform::places_are_same_class(place_, o.place_) &&
+         data_type_ == o.data_type_ && data_layout_ == o.data_layout_ &&
+         library_type_ == o.library_type_ &&
+         customized_type_value_ == o.customized_type_value_;
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/op_kernel_type.h b/paddle/fluid/framework/op_kernel_type.h
index ac0330218973123771367ed5ba9477c90143a043..9edc1a3e150027b5a3dbd8483dc8b58d1d4ab918 100644
--- a/paddle/fluid/framework/op_kernel_type.h
+++ b/paddle/fluid/framework/op_kernel_type.h
@@ -24,54 +24,55 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-struct OpKernelType {
-  struct Hash {
-    size_t operator()(const OpKernelType& key) const {
-      int place = key.place_.which();
-      int data_type = static_cast<int>(key.data_type_) << LEFT_SHIFT;
-      int data_layout = static_cast<int>(key.data_layout_) << (LEFT_SHIFT * 2);
-      int library_type = static_cast<int>(key.library_type_)
-                         << (LEFT_SHIFT * 3);
-
-      std::hash<int> hasher;
-      return hasher(place + data_type + data_layout + library_type);
-    }
-  };
+class OpKernelType {
+ public:
+  constexpr static int kDefaultCustomizedTypeValue = 0;
 
-  // place, data_type, library_type kinds less than 2^8
-  constexpr static int LEFT_SHIFT = 8;
-
-  proto::VarType::Type data_type_;
-  DataLayout data_layout_;
-  platform::Place place_;
-  LibraryType library_type_;
+  // In total should be smaller than 64.
+  constexpr static int kPlaceBits = 4;
+  constexpr static int kPrimaryDTypeBits = 8;
+  constexpr static int kLayoutBits = 4;
+  constexpr static int kLibBits = 4;
+  constexpr static int kCustomizeBits = 4;
 
   OpKernelType(proto::VarType::Type data_type, platform::Place place,
                DataLayout data_layout = DataLayout::kAnyLayout,
-               LibraryType library_type = LibraryType::kPlain)
+               LibraryType library_type = LibraryType::kPlain,
+               int customized_type_value = kDefaultCustomizedTypeValue)
       : data_type_(data_type),
         data_layout_(data_layout),
         place_(place),
-        library_type_(library_type) {}
+        library_type_(library_type),
+        customized_type_value_(customized_type_value) {}
 
   OpKernelType(proto::VarType::Type data_type,
                const platform::DeviceContext& dev_ctx,
                DataLayout data_layout = DataLayout::kAnyLayout,
-               LibraryType library_type = LibraryType::kPlain)
+               LibraryType library_type = LibraryType::kPlain,
+               int customized_type_value = kDefaultCustomizedTypeValue)
       : data_type_(data_type),
         data_layout_(data_layout),
         place_(dev_ctx.GetPlace()),
-        library_type_(library_type) {}
+        library_type_(library_type),
+        customized_type_value_(customized_type_value) {}
+
+  virtual ~OpKernelType() {}
+
+  struct Hash {
+    size_t operator()(const OpKernelType& key) const;
+  };
 
   size_t hash_key() const { return Hash()(*this); }
 
-  bool operator==(const OpKernelType& o) const {
-    return platform::places_are_same_class(place_, o.place_) &&
-           data_type_ == o.data_type_ && data_layout_ == o.data_layout_ &&
-           library_type_ == o.library_type_;
-  }
+  bool operator==(const OpKernelType& o) const;
 
   bool operator!=(const OpKernelType& o) const { return !(*this == o); }
+
+  proto::VarType::Type data_type_;
+  DataLayout data_layout_;
+  platform::Place place_;
+  LibraryType library_type_;
+  int customized_type_value_;
 };
 
 inline std::ostream& operator<<(std::ostream& os,
diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h
index 0e6e74293c30d5f8caa58fe6bfa63657d2669b46..6d39bb3c524b4725dfebd6ef07594b0b45c65463 100644
--- a/paddle/fluid/framework/op_registry.h
+++ b/paddle/fluid/framework/op_registry.h
@@ -35,6 +35,7 @@ limitations under the License. */
 
 namespace paddle {
 namespace framework {
+
 class Registrar {
  public:
   // In our design, various kinds of classes, e.g., operators and kernels,
@@ -78,7 +79,7 @@ struct OpKernelRegistrarFunctor;
 
 template <typename PlaceType, typename T, typename Func>
 inline void RegisterKernelClass(const char* op_type, const char* library_type,
-                                Func func) {
+                                int customized_type_value, Func func) {
   std::string library(library_type);
   std::string data_layout = "ANYLAYOUT";
   if (library == "MKLDNN") {
@@ -86,7 +87,7 @@ inline void RegisterKernelClass(const char* op_type, const char* library_type,
   }
   OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType(),
                    StringToDataLayout(data_layout),
-                   StringToLibraryType(library_type));
+                   StringToLibraryType(library_type), customized_type_value);
   OperatorWithKernel::AllOpKernels()[op_type][key] = func;
 }
 
@@ -95,22 +96,26 @@ struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> {
   using KERNEL_TYPE =
       typename std::tuple_element<I, std::tuple<KernelTypes...>>::type;
 
-  void operator()(const char* op_type, const char* library_type) const {
+  void operator()(const char* op_type, const char* library_type,
+                  int customized_type_value) const {
     using T = typename KERNEL_TYPE::ELEMENT_TYPE;
     RegisterKernelClass<PlaceType, T>(
-        op_type, library_type, [](const framework::ExecutionContext& ctx) {
+        op_type, library_type, customized_type_value,
+
+        [](const framework::ExecutionContext& ctx) {
           KERNEL_TYPE().Compute(ctx);
         });
     constexpr auto size = std::tuple_size<std::tuple<KernelTypes...>>::value;
     OpKernelRegistrarFunctor<PlaceType, I + 1 == size, I + 1, KernelTypes...>
         func;
-    func(op_type, library_type);
+    func(op_type, library_type, customized_type_value);
   }
 };
 
 template <typename PlaceType, size_t I, typename... KernelType>
 struct OpKernelRegistrarFunctor<PlaceType, true, I, KernelType...> {
-  void operator()(const char* op_type, const char* library_type) const {}
+  void operator()(const char* op_type, const char* library_type,
+                  int customized_type_value) const {}
 };
 
 // User can register many kernel in one place. The data type could be
@@ -118,9 +123,10 @@ struct OpKernelRegistrarFunctor<PlaceType, true, I, KernelType...> {
 template <typename PlaceType, typename... KernelType>
 class OpKernelRegistrar : public Registrar {
  public:
-  explicit OpKernelRegistrar(const char* op_type, const char* library_type) {
+  explicit OpKernelRegistrar(const char* op_type, const char* library_type,
+                             int customized_type_value) {
     OpKernelRegistrarFunctor<PlaceType, false, 0, KernelType...> func;
-    func(op_type, library_type);
+    func(op_type, library_type, customized_type_value);
   }
 };
 
@@ -130,17 +136,19 @@ struct OpKernelRegistrarFunctorEx;
 template <typename PlaceType, typename... DataTypeAndKernelType>
 class OpKernelRegistrarEx : public Registrar {
  public:
-  explicit OpKernelRegistrarEx(const char* op_type, const char* library_type) {
+  explicit OpKernelRegistrarEx(const char* op_type, const char* library_type,
+                               int customized_type_value) {
     OpKernelRegistrarFunctorEx<PlaceType, false, 0, DataTypeAndKernelType...>
         func;
-    func(op_type, library_type);
+    func(op_type, library_type, customized_type_value);
   }
 };
 
 template <typename PlaceType, size_t I, typename... DataTypeAndKernelType>
 struct OpKernelRegistrarFunctorEx<PlaceType, true, I,
                                   DataTypeAndKernelType...> {
-  void operator()(const char* op_type, const char* library_type) const {}
+  void operator()(const char* op_type, const char* library_type,
+                  int customized_type_value) const {}
 };
 
 template <typename PlaceType, size_t I, typename... DataTypeAndKernelType>
@@ -153,18 +161,21 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
       typename std::tuple_element<I,
                                   std::tuple<DataTypeAndKernelType...>>::type;
 
-  void operator()(const char* op_type, const char* library_type) const {
-    RegisterKernelClass<PlaceType, T>(op_type, library_type, Functor());
+  void operator()(const char* op_type, const char* library_type,
+                  int customized_type_value) const {
+    RegisterKernelClass<PlaceType, T>(op_type, library_type,
+                                      customized_type_value, Functor());
 
     constexpr auto size =
         std::tuple_size<std::tuple<DataTypeAndKernelType...>>::value;
     OpKernelRegistrarFunctorEx<PlaceType, I + 2 >= size, I + 2,
                                DataTypeAndKernelType...>
         func;
-    func(op_type, library_type);
+    func(op_type, library_type, customized_type_value);
   }
 };
 
+// clang-format off
 /**
  * check if MACRO is used in GLOBAL NAMESPACE.
  */
@@ -199,42 +210,64 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
 /**
  * Macro to register OperatorKernel.
  */
-#define REGISTER_OP_KERNEL(op_type, library_type, place_class, ...)        \
-  STATIC_ASSERT_GLOBAL_NAMESPACE(                                          \
-      __reg_op_kernel_##op_type##_##library_type##__,                      \
-      "REGISTER_OP_KERNEL must be called in global namespace");            \
-  static ::paddle::framework::OpKernelRegistrar<place_class, __VA_ARGS__>  \
-      __op_kernel_registrar_##op_type##_##library_type##__(#op_type,       \
-                                                           #library_type); \
-  int TouchOpKernelRegistrar_##op_type##_##library_type() {                \
-    __op_kernel_registrar_##op_type##_##library_type##__.Touch();          \
-    return 0;                                                              \
+#define REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(op_type, library_type,             \
+                                            place_class, customized_name,      \
+                                            customized_type_value, ...)        \
+  STATIC_ASSERT_GLOBAL_NAMESPACE(                                              \
+      __reg_op_kernel_##op_type##_##library_type##_##customized_name##__,      \
+                                 "REGISTER_OP_KERNEL must be called in "       \
+                                 "global namespace");                          \
+  static ::paddle::framework::OpKernelRegistrar<place_class,                   \
+                                                __VA_ARGS__>                   \
+      __op_kernel_registrar_##op_type##_##library_type##_##customized_name##__(\
+          #op_type, #library_type, customized_type_value);                     \
+  int TouchOpKernelRegistrar_##op_type##_##library_type##_##customized_name() {\
+    __op_kernel_registrar_##op_type##_##library_type##_##customized_name##__   \
+        .Touch();                                                              \
+    return 0;                                                                  \
   }
 
+#define REGISTER_OP_KERNEL(op_type, library_type, place_class, ...)   \
+  REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(                                \
+      op_type, library_type, place_class, DEFAULT_TYPE,               \
+      ::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
+      __VA_ARGS__)
+
 #define REGISTER_OP_CUDA_KERNEL(op_type, ...) \
   REGISTER_OP_KERNEL(op_type, CUDA, ::paddle::platform::CUDAPlace, __VA_ARGS__)
 
 #define REGISTER_OP_CPU_KERNEL(op_type, ...) \
   REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
 
-#define REGISTER_OP_KERNEL_EX(op_type, library_type, place_class, ...)      \
-  STATIC_ASSERT_GLOBAL_NAMESPACE(                                           \
-      __reg_op_kernel_##op_type##_##library_type##__,                       \
-      "REGISTER_OP_KERNEL_EX must be called in global namespace");          \
-  static ::paddle::framework::OpKernelRegistrarEx<place_class, __VA_ARGS__> \
-      __op_kernel_registrar_##op_type##_##library_type##__(#op_type,        \
-                                                           #library_type);  \
-  int TouchOpKernelRegistrar_##op_type##_##library_type() {                 \
-    __op_kernel_registrar_##op_type##_##library_type##__.Touch();           \
-    return 0;                                                               \
+#define REGISTER_OP_KERNEL_EX(op_type, library_type, place_class,  \
+                              customized_name,                     \
+                              customized_type_value,               \
+                              ...)                                 \
+  STATIC_ASSERT_GLOBAL_NAMESPACE(                                  \
+      __reg_op_kernel_##op_type##_##library_type##_##customized_name##__, \
+                                 "REGISTER_OP_KERNEL_EX must be called in "  \
+                                 "global namespace");  \
+  static ::paddle::framework::OpKernelRegistrarEx<place_class,  \
+                                                  __VA_ARGS__>  \
+      __op_kernel_registrar_##op_type##_##library_type##_##customized_name##__(\
+          #op_type, #library_type, customized_type_value);  \
+  int TouchOpKernelRegistrar_##op_type##_##library_type##_##customized_name() {\
+    __op_kernel_registrar_##op_type##_##library_type##_##customized_name##__   \
+        .Touch();                                                              \
+    return 0;                                                                  \
   }
 
 #define REGISTER_OP_CUDA_KERNEL_FUNCTOR(op_type, ...)                 \
-  REGISTER_OP_KERNEL_EX(op_type, CUDA, ::paddle::platform::CUDAPlace, \
-                        __VA_ARGS__)
+  REGISTER_OP_KERNEL_EX(                                              \
+      op_type, CUDA, ::paddle::platform::CUDAPlace, DEFAULT_TYPE,     \
+      ::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
+      __VA_ARGS__)
 
-#define REGISTER_OP_CPU_KERNEL_FUNCTOR(op_type, ...) \
-  REGISTER_OP_KERNEL_EX(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
+#define REGISTER_OP_CPU_KERNEL_FUNCTOR(op_type, ...)                  \
+  REGISTER_OP_KERNEL_EX(                                              \
+      op_type, CPU, ::paddle::platform::CPUPlace, DEFAULT_TYPE,       \
+      ::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
+      __VA_ARGS__)
 
 /**
  * Macro to mark what Operator and Kernel
@@ -248,13 +281,19 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
   extern int TouchOpRegistrar_##op_type();                 \
   UNUSED static int use_op_itself_##op_type##_ = TouchOpRegistrar_##op_type()
 
-#define USE_OP_DEVICE_KERNEL(op_type, LIBRARY_TYPE)               \
-  STATIC_ASSERT_GLOBAL_NAMESPACE(                                 \
-      __use_op_kernel_##op_type##_##LIBRARY_TYPE##__,             \
-      "USE_OP_DEVICE_KERNEL must be in global namespace");        \
-  extern int TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE(); \
-  UNUSED static int use_op_kernel_##op_type##_##LIBRARY_TYPE##_ = \
-      TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE()
+#define USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(op_type,                     \
+                                              LIBRARY_TYPE,                \
+                                              customized_name)             \
+  STATIC_ASSERT_GLOBAL_NAMESPACE(                                          \
+      __use_op_kernel_##op_type##_##LIBRARY_TYPE##_##customized_name##__,  \
+      "USE_OP_DEVICE_KERNEL must be in global namespace");                 \
+  extern int                                                               \
+      TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE##_##customized_name(); \
+  UNUSED static int use_op_kernel_##op_type##_##LIBRARY_TYPE##_##DEFAULT_TYPE##_ = /* NOLINT */ \
+      TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE##_##customized_name()
+
+#define USE_OP_DEVICE_KERNEL(op_type, LIBRARY_TYPE) \
+  USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(op_type, LIBRARY_TYPE, DEFAULT_TYPE)
 
 // TODO(fengjiayi): The following macros
 // seems ugly, do we have better method?
@@ -280,6 +319,7 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
 #define USE_OP(op_type)   \
   USE_OP_ITSELF(op_type); \
   USE_OP_KERNEL(op_type)
+// clang-format on
 
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 8bfdf3891203823826fd5bf919c176011f22213c..66055e6f1d8c8685c27e7d63c3d79c607fb30104 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -695,6 +695,12 @@ static void CheckTensorNANOrInf(const std::string& name,
                  "Tensor %s contains NAN", name);
 }
 
+void OperatorWithKernel::RuntimeInferShape(const Scope& scope,
+                                           const platform::Place& place) const {
+  RuntimeInferShapeContext infer_shape_ctx(*this, scope);
+  this->InferShape(&infer_shape_ctx);
+}
+
 void OperatorWithKernel::RunImpl(const Scope& scope,
                                  const platform::Place& place) const {
   RuntimeInferShapeContext infer_shape_ctx(*this, scope);
@@ -873,6 +879,8 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
           t = &(var->Get<SelectedRows>().value());
         }
         if (t != nullptr) {
+          PADDLE_ENFORCE(t->IsInitialized(), "Input %s is not initialized: %s",
+                         ipt_name, DebugString());
           int tmp = static_cast<int>(ToDataType(t->type()));
           PADDLE_ENFORCE(
               tmp == data_type || data_type == -1,
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 5bd68f9ac2e1b30bc6ce3094960bb89842b99e01..0a6a28a5bce01d71cf56f25f5556033db94452c2 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -128,6 +128,8 @@ class OperatorBase {
   virtual std::vector<std::string> OutputVars(bool has_intermediate) const;
 
   void SetIsCalledByExecutor(bool x) { run_by_executor_ = x; }
+  virtual void RuntimeInferShape(const Scope& scope,
+                                 const platform::Place& place) const {}
 
  protected:
   std::string type_;
@@ -348,6 +350,9 @@ class OperatorWithKernel : public OperatorBase {
     OpInfoMap::Instance().Get(Type()).infer_shape_(ctx);
   }
 
+  void RuntimeInferShape(const Scope& scope,
+                         const platform::Place& place) const override;
+
  protected:
   virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const;
   virtual OpKernelType GetKernelTypeForVar(
diff --git a/paddle/fluid/framework/operator_test.cc b/paddle/fluid/framework/operator_test.cc
index ac9dd8245ad4e0e8842f219b23d3866b03fdaedb..ab14732e4d6eab9dd15364da02b436c10ed68a19 100644
--- a/paddle/fluid/framework/operator_test.cc
+++ b/paddle/fluid/framework/operator_test.cc
@@ -50,6 +50,8 @@ class OpWithoutKernelCheckerMaker : public OpProtoAndCheckerMaker {
     AddInput("input", "input of test op");
     AddOutput("output", "output of test op");
     AddAttr<float>("scale", "scale of cosine op");
+    AddAttr<int>("kernel_sub_type", "kernels with different implementations.")
+        .SetDefault(0);
     AddComment("This is test op");
   }
 };
@@ -95,6 +97,8 @@ TEST(OperatorBase, all) {
 namespace paddle {
 namespace framework {
 
+static int special_type_value = 1;
+
 class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
  public:
   void Make() {
@@ -103,11 +107,14 @@ class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
     AddAttr<float>("scale", "scale of cosine op")
         .SetDefault(1.0)
         .GreaterThan(0.0);
+    AddAttr<int>("kernel_sub_type", "kernels with different implementations.")
+        .SetDefault(0);
     AddComment("This is test op");
   }
 };
 
 static int cpu_kernel_run_num = 0;
+static int cpu_kernel2_run_num = 0;
 
 class OpWithKernelTest : public OperatorWithKernel {
  public:
@@ -117,7 +124,10 @@ class OpWithKernelTest : public OperatorWithKernel {
   void InferShape(framework::InferShapeContext* ctx) const override {}
   OpKernelType GetExpectedKernelType(
       const ExecutionContext& ctx) const override {
-    return OpKernelType(proto::VarType::FP32, ctx.GetPlace());
+    int sub_type = ctx.Attr<int>("kernel_sub_type");
+    return OpKernelType(proto::VarType::FP32, ctx.GetPlace(),
+                        framework::DataLayout::kAnyLayout,
+                        framework::LibraryType::kPlain, sub_type);
   }
 };
 
@@ -132,6 +142,17 @@ class CPUKernelTest : public OpKernel<float> {
   }
 };
 
+template <typename T1, typename T2>
+class CPUKernel2Test : public OpKernel<float> {
+ public:
+  void Compute(const ExecutionContext& ctx) const {
+    std::cout << ctx.op().DebugString() << std::endl;
+    cpu_kernel2_run_num++;
+    ASSERT_EQ(ctx.op().Input("x"), "IN1");
+    ASSERT_EQ(ctx.op().Output("y"), "OUT1");
+  }
+};
+
 class OpKernelTestMultiInputsProtoAndCheckerMaker
     : public OpProtoAndCheckerMaker {
  public:
@@ -142,6 +163,8 @@ class OpKernelTestMultiInputsProtoAndCheckerMaker
     AddAttr<float>("scale", "scale of cosine op")
         .SetDefault(1.0)
         .GreaterThan(0.0);
+    AddAttr<int>("kernel_sub_type", "kernels with different implementations.")
+        .SetDefault(0);
     AddComment("This is test op");
   }
 };
@@ -189,9 +212,15 @@ class CPUKernalMultiInputsTest : public OpKernel<float> {
 REGISTER_OP_WITHOUT_GRADIENT(
     op_with_kernel, paddle::framework::OpWithKernelTest,
     paddle::framework::OpKernelTestProtoAndCheckerMaker);
+
 REGISTER_OP_CPU_KERNEL(op_with_kernel,
                        paddle::framework::CPUKernelTest<float, float>);
 
+REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(
+    op_with_kernel, CPU, paddle::platform::CPUPlace, MY_SPECIAL_NAME,
+    paddle::framework::special_type_value,
+    paddle::framework::CPUKernel2Test<float, float>);
+
 // test with single input
 TEST(OpKernel, all) {
   paddle::framework::InitDevices(true);
@@ -211,7 +240,19 @@ TEST(OpKernel, all) {
   auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
   ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 0);
   op->Run(scope, cpu_place);
+  // kerne_sub_type = 0, hence cpu_kernel is called, cpu_kernel2 is not called.
+  ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1);
+  ASSERT_EQ(paddle::framework::cpu_kernel2_run_num, 0);
+
+  attr = op_desc.mutable_attrs()->Add();
+  attr->set_name("kernel_sub_type");
+  attr->set_type(paddle::framework::proto::AttrType::INT);
+  attr->set_i(1);
+  auto op2 = paddle::framework::OpRegistry::CreateOp(op_desc);
+  op2->Run(scope, cpu_place);
+  // kerne_sub_type = 1, hence cpu_kernel2 is called, cpu_kernel is not called.
   ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1);
+  ASSERT_EQ(paddle::framework::cpu_kernel2_run_num, 1);
 }
 
 REGISTER_OP_WITHOUT_GRADIENT(
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index b98408ee7726768a108772329b8dc95c2df3c891..eb4baa06b5284512eab128e57f893bad43afda97 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -26,17 +26,41 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
+#include "paddle/fluid/framework/details/reference_count_pass_helper.h"
 #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
 #include "paddle/fluid/platform/profiler.h"
 
+#ifdef WITH_GPERFTOOLS
+#include "gperftools/profiler.h"
+#endif
+DEFINE_string(pe_profile_fname, "",
+              "Profiler filename for PE, which generated by gperftools."
+              "Only valid when compiled `WITH_PRIFILER=ON`. Empty if disable.");
+
 namespace paddle {
 namespace framework {
 
+static std::once_flag gProfileOnce;
+#ifdef WITH_GPERFTOOLS
+static bool gProfileStarted = false;
+#endif
 class ParallelExecutorPrivate {
  public:
   explicit ParallelExecutorPrivate(const std::vector<platform::Place> &places)
-      : places_(places) {}
+      : places_(places) {
+    if (!FLAGS_pe_profile_fname.empty()) {
+      std::call_once(gProfileOnce, [] {
+#ifdef WITH_GPERFTOOLS
+        ProfilerStart(FLAGS_pe_profile_fname.c_str());
+        gProfileStarted = true;
+#else
+        LOG(WARNING) << "Paddle is not compiled with gperftools. "
+                        "FLAGS_pe_profile_fname will be ignored";
+#endif
+      });
+    }
+  }
 
   ~ParallelExecutorPrivate() {
     if (own_local_scope_) {
@@ -49,6 +73,26 @@ class ParallelExecutorPrivate {
       }
     }
   }
+
+  std::unique_ptr<ir::Graph> PrepareGCAndRefCnts(
+      std::unique_ptr<ir::Graph> graph, size_t max_memory_size);
+
+  inline bool HasGarbageCollectors() const { return !gcs_.empty(); }
+
+  void ResetRuntimeReferenceCount(const std::vector<std::string> &fetch_tensors,
+                                  const std::string &fetched_var_name) {
+    for (size_t i = 0; i < runtime_ref_cnts_.size(); ++i) {
+      for (auto &pair : global_ref_cnts_[i]) {
+        runtime_ref_cnts_[i][pair.first] = pair.second;
+      }
+
+      for (auto &fetch_name : fetch_tensors) {
+        runtime_ref_cnts_[i].erase(fetch_name);
+      }
+      runtime_ref_cnts_[i].erase(fetched_var_name);
+    }
+  }
+
   std::vector<platform::Place> places_;
   std::vector<Scope *> local_scopes_;
   Scope *global_scope_;  // not owned
@@ -60,8 +104,76 @@ class ParallelExecutorPrivate {
   bool own_local_scope_;
   bool use_cuda_;
   bool use_all_reduce_;
+
+  // global_ref_cnts_ is only initialized when ParallelExecutor constructs, and
+  // then keeps unchanged
+  // Before each iteration, runtime_ref_cnts_ is reset to global_ref_cnts_
+  std::vector<details::ReferenceCountMap> global_ref_cnts_;
+  std::vector<details::AtomicReferenceCountMap> runtime_ref_cnts_;
+  details::GarbageCollectorMap gcs_;
 };
 
+std::unique_ptr<ir::Graph> ParallelExecutorPrivate::PrepareGCAndRefCnts(
+    std::unique_ptr<ir::Graph> graph, size_t max_memory_size) {
+  for (size_t i = 0; i < places_.size(); ++i) {
+    auto &place = places_[i];
+    if (gcs_.count(place) > 0) {
+      continue;
+    }
+    std::unique_ptr<GarbageCollector> gc;
+#ifdef PADDLE_WITH_CUDA
+    if (platform::is_gpu_place(place)) {
+      if (IsFastEagerDeletionModeEnabled()) {
+        gc.reset(new UnsafeFastGPUGarbageCollector(
+            boost::get<platform::CUDAPlace>(place), max_memory_size));
+      } else {
+        gc.reset(new StreamGarbageCollector(
+            boost::get<platform::CUDAPlace>(place), max_memory_size));
+      }
+      VLOG(10) << "Created " << i << "-th GarbageCollector at " << place;
+    } else {
+#endif
+      if (platform::is_cpu_place(place)) {
+        gc.reset(new CPUGarbageCollector(boost::get<platform::CPUPlace>(place),
+                                         max_memory_size));
+        VLOG(10) << "Created GarbageCollector at " << place;
+      } else {
+        PADDLE_THROW("Unsupported place for garbage collection");
+      }
+#ifdef PADDLE_WITH_CUDA
+    }
+#endif
+
+    gcs_.emplace(place, std::move(gc));
+  }
+
+  if (!gcs_.empty()) {
+    std::vector<details::LastLiveOpsOfVars> last_live_ops_of_vars;
+
+    auto ref_cnt_pass =
+        ir::PassRegistry::Instance().Get("reference_count_pass");
+    ref_cnt_pass->SetNotOwned(details::kGlobalReferenceCount,
+                              &global_ref_cnts_);
+    ref_cnt_pass->SetNotOwned(details::kLastLiveOpsOfVars,
+                              &last_live_ops_of_vars);
+    graph = ref_cnt_pass->Apply(std::move(graph));
+    VLOG(10) << "ReferenceCountPass Applied";
+
+    auto eager_deletion_pass =
+        ir::PassRegistry::Instance().Get("eager_deletion_pass");
+    eager_deletion_pass->SetNotOwned(details::kRuntimeReferenceCount,
+                                     &runtime_ref_cnts_);
+    eager_deletion_pass->SetNotOwned(details::kGarbageCollector, &gcs_);
+    eager_deletion_pass->SetNotOwned(details::kLastLiveOpsOfVars,
+                                     &last_live_ops_of_vars);
+    eager_deletion_pass->SetNotOwned(details::kAllPlaces, &places_);
+    graph = eager_deletion_pass->Apply(std::move(graph));
+    VLOG(10) << "EagerDeletionPass Applied";
+  }
+
+  return graph;
+}
+
 std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
   return member_->local_scopes_;
 }
@@ -128,36 +240,18 @@ ParallelExecutor::ParallelExecutor(
   std::unique_ptr<ir::Graph> graph = build_strategy.Apply(
       main_program, member_->places_, loss_var_name, params,
       member_->local_scopes_, member_->use_cuda_, member_->nccl_ctxs_.get());
-
-  auto max_memory_size = GetEagerDeletionThreshold();
-  if (max_memory_size >= 0) {
-    for (auto &place : member_->places_) {
-      if (!platform::is_gpu_place(place)) continue;
-      auto gpu_place = boost::get<platform::CUDAPlace>(place);
-      if (gcs_[gpu_place.device] == nullptr) {
-        ref_cnts_[gpu_place.device].reset(new details::ReferenceCountMap());
-        cur_ref_cnts_[gpu_place.device].reset(
-            new details::AtomicReferenceCountMap());
-        gcs_[gpu_place.device].reset(
-            new StreamGarbageCollector<Tensor>(gpu_place, max_memory_size));
-      }
-    }
-    if (!gcs_.empty()) {
-      auto ref_cnt_pass =
-          ir::PassRegistry::Instance().Get("reference_count_pass");
-      ref_cnt_pass->SetNotOwned(details::kGlobalReferenceCount, &ref_cnts_);
-      ref_cnt_pass->SetNotOwned(details::kCurReferenceCount, &cur_ref_cnts_);
-      ref_cnt_pass->SetNotOwned(details::kGarbageCollector, &gcs_);
-      graph = ref_cnt_pass->Apply(std::move(graph));
-      graph->SetNotOwned("garbage_collector", &gcs_);
-    }
-  }
 #else
   std::unique_ptr<ir::Graph> graph =
       build_strategy.Apply(main_program, member_->places_, loss_var_name,
                            params, member_->local_scopes_, member_->use_cuda_);
 #endif
 
+  auto max_memory_size = GetEagerDeletionThreshold();
+  if (max_memory_size >= 0) {
+    graph = member_->PrepareGCAndRefCnts(std::move(graph),
+                                         static_cast<size_t>(max_memory_size));
+  }
+
   // Step 3. Create vars in each scope. Passes may also create new vars.
   //         skip control vars and empty vars
   std::vector<details::VariableInfo> var_infos;
@@ -270,19 +364,16 @@ void ParallelExecutor::BCastParamsToDevices(
 
 void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
                            const std::string &fetched_var_name) {
-  platform::RecordBlock b(0);
-#ifdef PADDLE_WITH_CUDA
-  if (!gcs_.empty()) {
-    ResetReferenceCount();
-    for (auto &pair : cur_ref_cnts_) {
-      auto &name_map = *(pair.second);
-      for (auto &fetch_name : fetch_tensors) {
-        name_map.erase(fetch_name);
-      }
-      name_map.erase(fetched_var_name);
-    }
+#ifdef WITH_GPERFTOOLS
+  if (gProfileStarted) {
+    ProfilerFlush();
   }
 #endif
+
+  platform::RecordBlock b(0);
+  if (member_->HasGarbageCollectors()) {
+    member_->ResetRuntimeReferenceCount(fetch_tensors, fetched_var_name);
+  }
   auto fetch_data = member_->executor_->Run(fetch_tensors);
   *member_->global_scope_->Var(fetched_var_name)->GetMutable<FeedFetchList>() =
       fetch_data;
@@ -326,13 +417,11 @@ ParallelExecutor::~ParallelExecutor() {
   for (auto &p : member_->places_) {
     platform::DeviceContextPool::Instance().Get(p)->Wait();
   }
-  // member_ must be destructed before gcs_ since the destructor of
-  // ReferenceCountOpHandle use raw pointers of gcs_ inside.
-  member_.reset();
+  delete member_;
 }
 
 }  // namespace framework
 }  // namespace paddle
-#ifdef PADDLE_WITH_CUDA
+
 USE_PASS(reference_count_pass);
-#endif
+USE_PASS(eager_deletion_pass);
diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h
index ef09b98b2aa91a9d729b94d15dbb676dde4092b6..1fc17a0d64d50eb70ce66cacd4752a5b96d5e894 100644
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -14,7 +14,6 @@ limitations under the License. */
 
 #pragma once
 
-#include <atomic>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
@@ -29,10 +28,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device_context.h"
 
-#ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/framework/details/reference_count_pass.h"
-#endif
-
 namespace paddle {
 namespace framework {
 
@@ -75,24 +70,7 @@ class ParallelExecutor {
  private:
   void BCastParamsToDevices(const std::unordered_set<std::string> &vars) const;
 
-  std::unique_ptr<ParallelExecutorPrivate> member_;
-
-#ifdef PADDLE_WITH_CUDA
-  // ref_cnts_ is only initialized when ParallelExecutor constructs, and then
-  // keeps unchanged
-  // Before each iteration, cur_ref_cnts_ is reset to ref_cnts_
-  details::DeviceReferenceCountMap ref_cnts_;
-  details::AtomicDeviceReferenceCountMap cur_ref_cnts_;
-  details::DeviceGarbageCollectorMap gcs_;
-
-  void ResetReferenceCount() {
-    for (auto &pair1 : ref_cnts_) {
-      for (auto &pair2 : *(pair1.second)) {
-        (*(cur_ref_cnts_[pair1.first]))[pair2.first] = pair2.second;
-      }
-    }
-  }
-#endif
+  ParallelExecutorPrivate *member_;
 };
 
 }  // namespace framework
diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc
index 0d261dd7ccc323abddd2c3ef13f1874661a8ca75..6fa5e99f9f3a7e871f1a742a30803853988ea6eb 100644
--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -38,6 +38,10 @@ DEFINE_double(
     "Memory size threshold (GB) when the garbage collector clear tensors."
     "Disabled when this value is less than 0");
 
+DEFINE_bool(fast_eager_deletion_mode, false,
+            "Fast eager deletion mode. If enabled, memory would release "
+            "immediately without waiting GPU kernel ends.");
+
 // When in inference scenario, the scopes will not be written by two threads in
 // a mean time, but a scope may be read by multiple threads concurrently, and
 // the mutex will cause serious performance issue.
@@ -58,6 +62,8 @@ int64_t GetEagerDeletionThreshold() {
                                     (static_cast<int64_t>(1) << 30));
 }
 
+bool IsFastEagerDeletionModeEnabled() { return FLAGS_fast_eager_deletion_mode; }
+
 Scope::~Scope() { DropKids(); }
 
 Scope& Scope::NewScope() const {
diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h
index 1901ffbe57e0d85193c3a218f06eba06a0f287a5..aded1f771cedbf2442ad36d7fab3e6e6caffdc24 100644
--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@@ -27,6 +27,7 @@ namespace paddle {
 namespace framework {
 
 int64_t GetEagerDeletionThreshold();
+bool IsFastEagerDeletionModeEnabled();
 
 class Scope;
 
diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h
index 44384082dbaf7a8d654e8461da87009bde33a3d5..e1bdba9b46a4cbdb664b70c7419f567ef95bdf31 100644
--- a/paddle/fluid/framework/selected_rows.h
+++ b/paddle/fluid/framework/selected_rows.h
@@ -32,8 +32,7 @@ namespace framework {
 class SelectedRows {
   /*
    * @brief We can use the SelectedRows structure to reproduce a sparse table.
-   *  A sparse table is a key-value structure that the key is an `int64_t`
-   * number,
+   *  A sparse table is a key-value structure that the key is an `int64_t`,
    *  and the value is a Tensor which the first dimension is 0.
    *  You can use the following interface to operate the sparse table, and you
    * can find
diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h
index 71e8badd4b6b08e7d380fd45d93a33176172081d..153222506af02911b792907d392a42e5499c5e7a 100644
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -158,6 +158,10 @@ class Tensor {
   const std::shared_ptr<memory::Allocation>& Holder() const { return holder_; }
   size_t offset() const { return offset_; }
 
+  std::shared_ptr<memory::Allocation> MoveMemoryHolder() {
+    return std::move(holder_);
+  }
+
  private:
   /*! holds the memory block if allocated. */
   std::shared_ptr<memory::Allocation> holder_;
diff --git a/paddle/fluid/framework/variable_helper.cc b/paddle/fluid/framework/variable_helper.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fc4525549caeebb06dea766ccb123b5ebc6d5b13
--- /dev/null
+++ b/paddle/fluid/framework/variable_helper.cc
@@ -0,0 +1,60 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/variable_helper.h"
+
+#include <vector>
+
+#include "paddle/fluid/framework/feed_fetch_type.h"
+#include "paddle/fluid/framework/lod_rank_table.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/reader.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/platform/place.h"
+
+namespace paddle {
+namespace framework {
+void InitializeVariable(Variable* var, proto::VarType::Type var_type) {
+  if (var_type == proto::VarType::LOD_TENSOR) {
+    var->GetMutable<LoDTensor>();
+  } else if (var_type == proto::VarType::SELECTED_ROWS) {
+    var->GetMutable<SelectedRows>();
+  } else if (var_type == proto::VarType::FEED_MINIBATCH) {
+    var->GetMutable<FeedFetchList>();
+  } else if (var_type == proto::VarType::FETCH_LIST) {
+    var->GetMutable<FeedFetchList>();
+  } else if (var_type == proto::VarType::STEP_SCOPES) {
+    var->GetMutable<std::vector<framework::Scope*>>();
+  } else if (var_type == proto::VarType::LOD_RANK_TABLE) {
+    var->GetMutable<LoDRankTable>();
+  } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) {
+    var->GetMutable<LoDTensorArray>();
+  } else if (var_type == proto::VarType::PLACE_LIST) {
+    var->GetMutable<platform::PlaceList>();
+  } else if (var_type == proto::VarType::READER) {
+    var->GetMutable<ReaderHolder>();
+  } else if (var_type == proto::VarType::RAW) {
+    // GetMutable will be called in operator
+  } else {
+    PADDLE_THROW(
+        "Variable type %d is not in "
+        "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, "
+        "LOD_RANK_TABLE, PLACE_LIST, READER, RAW]",
+        var_type);
+  }
+}
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/variable_helper.h b/paddle/fluid/framework/variable_helper.h
new file mode 100644
index 0000000000000000000000000000000000000000..0e0c72c3621dce0a6b372f9a9110a63fbc0a1d71
--- /dev/null
+++ b/paddle/fluid/framework/variable_helper.h
@@ -0,0 +1,22 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/variable.h"
+namespace paddle {
+namespace framework {
+void InitializeVariable(Variable *var, proto::VarType::Type var_type);
+}
+}
diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..373d292b443b7651b785a52a6986b0a0be58ad61
--- /dev/null
+++ b/paddle/fluid/imperative/CMakeLists.txt
@@ -0,0 +1,3 @@
+cc_library(layer SRCS layer.cc DEPS proto_desc operator)
+cc_library(tracer SRCS tracer.cc DEPS proto_desc)
+cc_library(engine SRCS engine.cc)
diff --git a/paddle/fluid/imperative/engine.cc b/paddle/fluid/imperative/engine.cc
new file mode 100644
index 0000000000000000000000000000000000000000..de7ab0e5918281579728ef48d1517be2cd530af7
--- /dev/null
+++ b/paddle/fluid/imperative/engine.cc
@@ -0,0 +1,53 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/imperative/engine.h"
+
+#include <mutex>  // NOLINT
+#include <vector>
+
+#include "glog/logging.h"
+
+namespace paddle {
+namespace imperative {
+
+static std::once_flag init_engine;
+static Engine* engine;
+
+class DummyEngine : public Engine {
+ public:
+  void Enqueue(Runnable* runnable) override {
+    queued_runnables_.push_back(runnable);
+  }
+
+  size_t Size() const override { return queued_runnables_.size(); }
+
+  void Sync() override {
+    for (Runnable* l : queued_runnables_) {
+      LOG(INFO) << "running " << reinterpret_cast<void*>(l);
+    }
+    queued_runnables_.clear();
+  }
+
+ private:
+  std::vector<Runnable*> queued_runnables_;
+};
+
+Engine* GetEngine() {
+  std::call_once(init_engine, []() { engine = new DummyEngine(); });
+  return engine;
+}
+
+}  // namespace imperative
+}  // namespace paddle
diff --git a/paddle/fluid/imperative/engine.h b/paddle/fluid/imperative/engine.h
new file mode 100644
index 0000000000000000000000000000000000000000..a1dfa5bda38d0c419aa4ccbea77b32eb7e0d5b23
--- /dev/null
+++ b/paddle/fluid/imperative/engine.h
@@ -0,0 +1,39 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+namespace paddle {
+namespace imperative {
+
+struct Runnable {};
+
+class Engine {
+ public:
+  virtual ~Engine() {}
+
+  virtual void Enqueue(Runnable* runnable) = 0;
+
+  virtual size_t Size() const = 0;
+
+  virtual void Sync() = 0;
+};
+
+Engine* GetEngine();
+
+}  // namespace imperative
+}  // namespace paddle
diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..612503768079472ba233ee3fcd43a47fdba9a0cc
--- /dev/null
+++ b/paddle/fluid/imperative/layer.cc
@@ -0,0 +1,221 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/imperative/layer.h"
+#include <deque>
+#include <limits>
+#include <map>
+#include <random>
+#include <utility>
+
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/string/printf.h"
+
+namespace paddle {
+namespace imperative {
+
+using framework::Variable;
+
+void AddTo(Variable* src, Variable* dst) {
+  framework::LoDTensor* dst_tensor = dst->GetMutable<framework::LoDTensor>();
+  framework::LoDTensor* src_tensor = src->GetMutable<framework::LoDTensor>();
+  PADDLE_ENFORCE(dst_tensor->numel() == src_tensor->numel(), "%lld vs %lld",
+                 dst_tensor->numel(), src_tensor->numel());
+  float* dst_data = dst_tensor->mutable_data<float>(platform::CPUPlace());
+  const float* src_data = src_tensor->data<float>();
+  for (size_t i = 0; i < src_tensor->numel(); ++i) {
+    dst_data[i] += src_data[i];
+  }
+}
+
+class Autograd {
+ public:
+  explicit Autograd(framework::Scope* scope) : scope_(scope) {}
+
+  void RunBackward(VarBase* var) {
+    PADDLE_ENFORCE(var->pre_op_->op_desc_);
+    // TODO(panyx0718): Only create for vars that "require_grad"
+    (*var->pre_op_->output_vars_)[var->pre_op_out_idx_]->grads_ = var->grads_;
+
+    std::deque<OpBase*> ready;
+    ready.push_back(var->pre_op_);
+
+    std::map<OpBase*, int> dep_counts = ComputeDepCounts(var->pre_op_);
+
+    while (!ready.empty()) {
+      OpBase* ready_op = ready.front();
+      ready.pop_front();
+      std::vector<Variable*> input_grads = ready_op->ApplyGrad(scope_);
+
+      for (size_t i = 0; i < input_grads.size(); ++i) {
+        if (!input_grads[i]) continue;
+        OpBase* pre_op = ready_op->pre_ops_->at(i);
+        if (!pre_op) continue;
+
+        dep_counts[pre_op] -= 1;
+        PADDLE_ENFORCE(dep_counts[pre_op] >= 0);
+        bool pre_op_ready = dep_counts[pre_op] == 0;
+        if (pre_op_ready) {
+          ready.push_back(pre_op);
+        }
+      }
+    }
+  }
+
+ private:
+  std::map<OpBase*, int> ComputeDepCounts(OpBase* op) {
+    std::map<OpBase*, int> ret;
+
+    std::deque<OpBase*> queue;
+    queue.push_back(op);
+    std::unordered_set<OpBase*> visited;
+    visited.insert(op);
+    while (!queue.empty()) {
+      OpBase* candidate = queue.front();
+      queue.pop_front();
+      for (OpBase* pre_op : *(candidate->pre_ops_)) {
+        if (!pre_op) continue;
+        if (visited.find(pre_op) == visited.end()) {
+          visited.insert(pre_op);
+          queue.push_back(pre_op);
+        }
+        ret[pre_op] += 1;
+      }
+    }
+
+    return ret;
+  }
+
+  framework::Scope* scope_;
+};
+
+framework::Variable* CreateVariable(const std::string& name,
+                                    const framework::DDim& dim, float val,
+                                    framework::Scope* scope,
+                                    bool random_name = true) {
+  std::string varname = name;
+  if (random_name) {
+    std::mt19937 rng;
+    rng.seed(std::random_device()());
+    std::uniform_int_distribution<std::mt19937::result_type> dist6(
+        1, std::numeric_limits<int>::max());
+    int id = dist6(rng);
+    varname = string::Sprintf("%s@%d", varname, id);
+  }
+
+  VLOG(3) << "creating var " << varname;
+  framework::Variable* var = scope->Var(varname);
+  framework::LoDTensor* tensor = var->GetMutable<framework::LoDTensor>();
+
+  float* data = tensor->mutable_data<float>(dim, platform::CPUPlace());
+  std::fill(data, data + tensor->numel(), val);
+  return var;
+}
+
+framework::LoDTensor& VarBase::Grad() {
+  VLOG(3) << "get var grad " << var_desc_->Name();
+  return *grads_->GetMutable<framework::LoDTensor>();
+}
+
+void VarBase::ApplyGrad(framework::Scope* scope, Variable* grad) {
+  VLOG(3) << "apply var grad " << var_desc_->Name() << " "
+          << grad->Get<framework::LoDTensor>().data<float>()[0];
+  if (!grads_) {
+    grads_ =
+        CreateVariable(string::Sprintf("%s@IGrad", var_desc_->Name()),
+                       var_->Get<framework::LoDTensor>().dims(), 0.0, scope);
+  }
+  AddTo(grad, grads_);
+  VLOG(3) << "grad_ after apply var grad " << var_desc_->Name() << " "
+          << grads_->Get<framework::LoDTensor>().data<float>()[0];
+}
+
+std::vector<Variable*> OpBase::ApplyGrad(framework::Scope* scope) {
+  VLOG(3) << "op grad " << grad_op_desc_->Type();
+
+  for (const std::string& grad_invar : grad_op_desc_->InputArgumentNames()) {
+    if (grad_to_var_->find(grad_invar) == grad_to_var_->end()) {
+      // grad op inputs can be forward inputs, so not in grad_to_var.
+      continue;
+    }
+    VLOG(3) << "op grad in var " << grad_invar;
+    block_->FindRecursiveOrCreateVar(grad_invar);
+    framework::Variable* var = scope->Var(grad_invar);
+    const std::string& invar = grad_to_var_->at(grad_invar);
+    for (VarBase* varbase : *output_vars_) {
+      // Use the accumulated grads_ by sharing the input with grads_.
+      if (varbase->var_desc_->Name() == invar) {
+        var->GetMutable<framework::LoDTensor>()->ShareDataWith(
+            varbase->grads_->Get<framework::LoDTensor>());
+        break;
+      }
+    }
+  }
+
+  for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) {
+    VLOG(3) << "grad outvar " << outvar;
+    block_->FindRecursiveOrCreateVar(outvar);
+    framework::Variable* var = scope->Var(outvar);
+    if (!var->IsInitialized()) {
+      framework::VarDesc* var_desc = block_->FindVar(outvar);
+      if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) {
+        var->GetMutable<framework::LoDTensor>();
+      } else {
+        LOG(ERROR) << "tracer doesn't support yet";
+      }
+    }
+  }
+  grad_op_desc_->InferShape(*block_);
+  grad_op_desc_->InferVarType(block_);
+  std::unique_ptr<framework::OperatorBase> opbase =
+      framework::OpRegistry::CreateOp(*grad_op_desc_);
+
+  opbase->Run(*scope, platform::CPUPlace());
+
+  // `ret` matches exactly with `input_vars_` of forward op.
+  std::vector<Variable*> ret;
+  for (size_t i = 0; i < input_vars_->size(); ++i) {
+    bool found = false;
+    for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) {
+      Variable* var = scope->FindVar(outvar);
+      VarBase* origin_var = (*input_vars_)[i];
+      std::string orig_var = grad_to_var_->at(outvar);
+      PADDLE_ENFORCE(origin_var->var_desc_->Name() == orig_var);
+      VLOG(3) << "apply grad " << outvar << " with origin " << orig_var;
+      origin_var->ApplyGrad(scope, var);
+      found = true;
+      ret.push_back(var);
+      // TODO(panyx0718): There might be another outvar with the same name.
+      // In that case, it doesn't matter the first one or the second one is
+      // used.
+      break;
+    }
+    if (!found) {
+      ret.push_back(nullptr);
+    }
+  }
+  return ret;
+}
+
+void VarBase::RunBackward(framework::Scope* scope) {
+  grads_ = CreateVariable(framework::GradVarName(var_desc_->Name()),
+                          var_->Get<framework::LoDTensor>().dims(), 1.0, scope,
+                          false);
+  if (!pre_op_) return;
+  Autograd(scope).RunBackward(this);
+}
+
+}  // namespace imperative
+}  // namespace paddle
diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h
new file mode 100644
index 0000000000000000000000000000000000000000..85a71ca83d21ed2595ddbe684300a46c05fed3af
--- /dev/null
+++ b/paddle/fluid/imperative/layer.h
@@ -0,0 +1,102 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/op_desc.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/var_desc.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace imperative {
+
+class OpBase;
+
+class VarBase {
+ public:
+  VarBase()
+      : pre_op_(nullptr),
+        pre_op_out_idx_(-1),
+        var_desc_(nullptr),
+        var_(nullptr),
+        grads_(nullptr) {}
+
+  virtual ~VarBase() {}
+
+  void ApplyGrad(framework::Scope* scope, framework::Variable* grad);
+
+  void RunBackward(framework::Scope* scope);
+
+  framework::LoDTensor& Grad();
+
+  OpBase* pre_op_;
+  int pre_op_out_idx_;
+
+  framework::VarDesc* var_desc_;
+  framework::Variable* var_;
+  framework::Variable* grads_;
+};
+
+class OpBase {
+ public:
+  OpBase()
+      : input_vars_(new std::vector<VarBase*>()),
+        output_vars_(new std::vector<VarBase*>()),
+        pre_ops_(new std::vector<OpBase*>()),
+        pre_ops_out_idx_(new std::vector<int>()),
+        op_desc_(nullptr),
+        grad_op_desc_(nullptr) {}
+
+  virtual ~OpBase() {
+    delete input_vars_;
+    delete output_vars_;
+
+    delete pre_ops_;
+    delete pre_ops_out_idx_;
+
+    if (grad_op_desc_) delete grad_op_desc_;
+    if (grad_to_var_) delete grad_to_var_;
+  }
+
+  std::vector<framework::Variable*> ApplyGrad(framework::Scope* scope);
+
+  std::vector<VarBase*>* input_vars_;
+  std::vector<VarBase*>* output_vars_;
+  std::vector<OpBase*>* pre_ops_;
+  std::vector<int>* pre_ops_out_idx_;
+  framework::OpDesc* op_desc_;
+
+  framework::OpDesc* grad_op_desc_;
+  std::unordered_map<std::string, std::string>* grad_to_var_;
+  framework::BlockDesc* block_;
+};
+
+class Layer {
+ public:
+  virtual ~Layer() {}
+
+  virtual std::vector<VarBase> Forward(const std::vector<VarBase>& inputs) {
+    std::vector<VarBase> vars;
+    return vars;
+  }
+
+  virtual void Backward() { LOG(ERROR) << "To support customize"; }
+};
+
+}  // namespace imperative
+}  // namespace paddle
diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f64f9e72c4a23528948183b909d65e90783a4463
--- /dev/null
+++ b/paddle/fluid/imperative/tracer.cc
@@ -0,0 +1,19 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/imperative/tracer.h"
+
+namespace paddle {
+namespace imperative {}  // namespace imperative
+}  // namespace paddle
diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h
new file mode 100644
index 0000000000000000000000000000000000000000..433d07c0e5aa0986ab1e9fe349ef865d2851c0c0
--- /dev/null
+++ b/paddle/fluid/imperative/tracer.h
@@ -0,0 +1,128 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/op_desc.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/imperative/engine.h"
+#include "paddle/fluid/imperative/layer.h"
+
+namespace paddle {
+namespace imperative {
+
+void CreateGradOp(const framework::OpDesc& op_desc,
+                  const std::unordered_set<std::string>& no_grad_set,
+                  const std::vector<framework::BlockDesc*>& grad_sub_block,
+                  framework::OpDesc** grad_op_desc,
+                  std::unordered_map<std::string, std::string>* grad_to_var) {
+  std::vector<std::unique_ptr<framework::OpDesc>> grad_op_descs =
+      framework::OpInfoMap::Instance()
+          .Get(op_desc.Type())
+          .GradOpMaker()(op_desc, no_grad_set, grad_to_var, grad_sub_block);
+  PADDLE_ENFORCE(grad_op_descs.size() == 1, "Only support 1 grad op now.");
+  // TODO(panyx0718): Leak?
+  *grad_op_desc = grad_op_descs[0].release();
+}
+
+class Tracer {
+ public:
+  explicit Tracer(framework::BlockDesc* root_block) : root_block_(root_block) {
+    root_scope_ = new framework::Scope();
+    scopes_[root_block_] = root_scope_;
+  }
+
+  virtual ~Tracer() { delete root_scope_; }
+
+  void Trace(OpBase* op, const std::vector<VarBase*>& inputs,
+             const std::vector<VarBase*>& outputs,
+             framework::BlockDesc* block) {
+    framework::Scope* scope = GetScope(block);
+    framework::OpDesc* op_desc = op->op_desc_;
+    VLOG(3) << "tracer tracing " << op_desc->Type();
+    op_desc->InferShape(*block);
+    op_desc->InferVarType(block);
+    std::unique_ptr<framework::OperatorBase> op_base =
+        framework::OpRegistry::CreateOp(*op_desc);
+
+    *op->input_vars_ = inputs;
+    for (VarBase* input : inputs) {
+      const std::string vname = input->var_desc_->Name();
+      framework::Variable* var = scope->Var(vname);
+      input->var_ = var;
+      if (!var->IsInitialized()) {
+        framework::VarDesc* var_desc = block->FindVar(vname);
+        if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) {
+          var->GetMutable<framework::LoDTensor>();
+        } else {
+          LOG(ERROR) << "tracer doesn't support yet";
+        }
+      }
+      if (input->pre_op_) {
+        op->pre_ops_->push_back(input->pre_op_);
+        op->pre_ops_out_idx_->push_back(input->pre_op_out_idx_);
+      } else {
+        op->pre_ops_->push_back(nullptr);
+      }
+    }
+
+    *op->output_vars_ = outputs;
+    for (size_t i = 0; i < outputs.size(); ++i) {
+      const std::string vname = outputs[i]->var_desc_->Name();
+      framework::Variable* var = scope->Var(vname);
+      if (!var->IsInitialized()) {
+        framework::VarDesc* var_desc = block->FindVar(vname);
+        if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) {
+          var->GetMutable<framework::LoDTensor>();
+        } else {
+          LOG(ERROR) << "tracer doesn't support yet";
+        }
+      }
+      outputs[i]->var_ = var;
+      outputs[i]->pre_op_ = op;
+      outputs[i]->pre_op_out_idx_ = i;
+    }
+    op_base->Run(*scope, platform::CPUPlace());
+    framework::OpDesc* grad_op_desc;
+    auto grad_to_var = new std::unordered_map<std::string, std::string>();
+    CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var);
+    op->grad_op_desc_ = grad_op_desc;
+    op->grad_to_var_ = grad_to_var;
+    op->block_ = block;
+  }
+
+  framework::Scope* GetScope(framework::BlockDesc* block) {
+    if (scopes_.find(block) != scopes_.end()) {
+      return scopes_.at(block);
+    }
+    framework::BlockDesc* parent_block = block->ParentBlock();
+    PADDLE_ENFORCE(scopes_.find(parent_block) != scopes_.end());
+    framework::Scope* scope = &scopes_[parent_block]->NewScope();
+    scopes_[block] = scope;
+    return scope;
+  }
+
+ private:
+  std::map<framework::BlockDesc*, framework::Scope*> scopes_;
+  framework::BlockDesc* root_block_;
+  framework::Scope* root_scope_;
+};
+
+}  // namespace imperative
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/analysis_pass.h b/paddle/fluid/inference/analysis/analysis_pass.h
index 299f235a74ae0ffb663be61079607d8ac1105a97..d5a972fab3beae4d4e2e512d1ccda3f0b8356682 100644
--- a/paddle/fluid/inference/analysis/analysis_pass.h
+++ b/paddle/fluid/inference/analysis/analysis_pass.h
@@ -46,8 +46,6 @@ class AnalysisPass {
  protected:
   // User should implement these.
   virtual void RunImpl(Argument* argument) = 0;
-
-  Argument* argument_{nullptr};
 };
 
 }  // namespace analysis
diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h
index 21203e2d9f4e4cd22ea49ea7b6808aff07e70eff..83d411eecf6d706615243fd78cb7e4330d904fc1 100644
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -103,6 +103,7 @@ struct Argument {
   // Model specified with program and parameters files.
   DECL_ARGUMENT_FIELD(model_program_path, ModelProgramPath, std::string);
   DECL_ARGUMENT_FIELD(model_params_path, ModelParamsPath, std::string);
+  DECL_ARGUMENT_FIELD(model_from_memory, ModelFromMemory, bool);
 
   // The overall graph to work on.
   DECL_ARGUMENT_UNIQUE_FIELD(main_graph, MainGraph, framework::ir::Graph);
@@ -115,6 +116,10 @@ struct Argument {
   DECL_ARGUMENT_FIELD(ir_analysis_passes, IrAnalysisPasses,
                       std::vector<std::string>);
 
+  // Pass a set of op types to enable its mkldnn kernel
+  DECL_ARGUMENT_FIELD(mkldnn_enabled_op_types, MKLDNNEnabledOpTypes,
+                      std::unordered_set<std::string>);
+
   DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool);
   DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int);
   DECL_ARGUMENT_FIELD(use_tensorrt, UseTensorRT, bool);
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc
index fce5e1cac92064a320179243380ea02b2c5d7838..51bca8039d4531536cd7a3c39ef8a27f1a5412a1 100644
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -63,6 +63,11 @@ void IRPassManager::CreatePasses(Argument *argument,
       pass->Set("graph_viz_path", new std::string(std::move(dot_file_path)));
       pass_num++;
     }
+    if (pass_name == "mkldnn_placement_pass") {
+      pass->Set("mkldnn_enabled_op_types",
+                new std::unordered_set<std::string>(
+                    argument->mkldnn_enabled_op_types()));
+    }
 
     if (pass_name == "tensorrt_subgraph_pass") {
       PADDLE_ENFORCE(argument->tensorrt_node_teller_valid());
diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
index c6b7c05f784b7c44fe30dd69529fe48405538ab6..4ffe5f575c232ccfc0089cb86e28737e56b32f94 100644
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -178,11 +178,12 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
     output_mapping.push_back(output_name_map[name]);
   }
 
-  *block_desc.Proto()->mutable_vars() =
-      const_cast<framework::ProgramDesc *>(&graph->program())
-          ->Proto()
-          ->blocks(0)
-          .vars();
+  auto *vars = block_desc.Proto()->mutable_vars();
+  for (framework::ir::Node *node : graph->Nodes()) {
+    if (node->IsVar() && node->Var()) {
+      *vars->Add() = *node->Var()->Proto();
+    }
+  }
   PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(),
                  "the block has no var-desc");
   PADDLE_ENFORCE(!output_mapping.empty());
diff --git a/paddle/fluid/inference/analysis/passes/CMakeLists.txt b/paddle/fluid/inference/analysis/passes/CMakeLists.txt
index a30c27b1183a75de8c0bb50ef3617d747b239fae..d3ea511d8f4d8cbec1be57633391f00e29a3e6e9 100644
--- a/paddle/fluid/inference/analysis/passes/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/passes/CMakeLists.txt
@@ -1,6 +1,7 @@
 cc_library(ir_graph_build_pass SRCS ir_graph_build_pass.cc DEPS analysis_pass argument ir_pass_manager)
 cc_library(ir_analysis_pass SRCS ir_analysis_pass.cc DEPS analysis_pass argument ir_pass_manager)
-cc_library(analysis_passes SRCS passes.cc DEPS ir_graph_build_pass ir_analysis_pass)
+cc_library(ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_pass.cc DEPS analysis_pass argument ir_pass_manager)
+cc_library(analysis_passes SRCS passes.cc DEPS ir_graph_build_pass ir_analysis_pass ir_params_sync_among_devices_pass)
 
 set(analysis_deps ${analysis_deps}
         ir_graph_build_pass
diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc
index 108cb6f74b1208395a4faabdf6184152c300d244..c3a2b3ca1d3b09e71921fde0b0bad8d195aaa38f 100644
--- a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc
@@ -61,6 +61,7 @@ void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) {
 void IrAnalysisComposePass::ApplyIrPasses(Argument *argument) {
   std::vector<std::string> passes({
       "ir_graph_build_pass", "ir_analysis_pass",
+      "ir_params_sync_among_devices_pass",
   });
   for (const auto &pass : passes) {
     VLOG(2) << "Run pass " << pass;
diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc
index d5e0d90de1da8e54e2411c266f7a8c09c33b0336..c6e923c00484f01f17550ae2926dabcadc0c3ac6 100644
--- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc
@@ -36,12 +36,7 @@ void IrGraphBuildPass::RunImpl(Argument *argument) {
   // so that the parameters will on the same device, or they will keep copying
   // between difference devices.
   platform::Place place;
-  if (argument->use_gpu()) {
-    PADDLE_ENFORCE(argument->gpu_device_id_valid());
-    place = platform::CUDAPlace(argument->gpu_device_id());
-  } else {
-    place = platform::CPUPlace();
-  }
+  place = platform::CPUPlace();
 
   if (argument->model_dir_valid()) {
     auto program =
@@ -49,9 +44,10 @@ void IrGraphBuildPass::RunImpl(Argument *argument) {
     argument->SetMainProgram(program.release());
   } else if (argument->model_program_path_valid() &&
              argument->model_params_path_valid()) {
-    auto program =
-        LoadModel(argument->model_program_path(), argument->model_params_path(),
-                  argument->scope_ptr(), place);
+    auto program = LoadModel(
+        argument->model_program_path(), argument->model_params_path(),
+        argument->scope_ptr(), place,
+        argument->model_from_memory_valid() && argument->model_from_memory());
     argument->SetMainProgram(program.release());
   } else {
     PADDLE_THROW(
@@ -73,9 +69,14 @@ std::unique_ptr<framework::ProgramDesc> IrGraphBuildPass::LoadModel(
 
 std::unique_ptr<framework::ProgramDesc> IrGraphBuildPass::LoadModel(
     const std::string &program_path, const std::string &params_path,
-    framework::Scope *scope, const platform::Place &place) {
+    framework::Scope *scope, const platform::Place &place,
+    bool model_from_memory) {
   framework::Executor exe(place);
-  return Load(&exe, scope, program_path, params_path);
+  if (!model_from_memory) {
+    return Load(&exe, scope, program_path, params_path);
+  } else {
+    return LoadFromMemory(&exe, scope, program_path, params_path);
+  }
 }
 
 std::string IrGraphBuildPass::repr() const { return "ir-graph-build-pass"; }
diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h
index 271e64fce579bc9001b1dd632576571cec949752..adbde0433fad28b006b18b47c8fd0a8946d21a98 100644
--- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h
+++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h
@@ -24,7 +24,7 @@ namespace inference {
 namespace analysis {
 
 /*
- * Load program and parameter to memory from the disk.
+ * Load program and parameter to memory from the disk or directly from memory.
  */
 class IrGraphBuildPass : public AnalysisPass {
  public:
@@ -38,7 +38,8 @@ class IrGraphBuildPass : public AnalysisPass {
       const platform::Place &place);
   std::unique_ptr<framework::ProgramDesc> LoadModel(
       const std::string &program_path, const std::string &params_path,
-      framework::Scope *scope, const platform::Place &place);
+      framework::Scope *scope, const platform::Place &place,
+      bool model_from_memory);
 
   std::string model_binary_str_;
 };
diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8be2d3ac0b105e50fe619a720929dedaacb75537
--- /dev/null
+++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
@@ -0,0 +1,74 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h"
+#include "paddle/fluid/framework/data_layout.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
+  PADDLE_ENFORCE(argument->scope_valid());
+  PADDLE_ENFORCE(argument->use_gpu_valid());
+
+  platform::Place place;
+
+  // The parameters are on the cpu, therefore, synchronization is not necessary.
+  if (!argument->use_gpu()) return;
+
+  LOG(INFO) << "Sync params from CPU to GPU";
+
+  PADDLE_ENFORCE(argument->gpu_device_id_valid());
+  place = platform::CUDAPlace(argument->gpu_device_id());
+
+  auto *scope = argument->scope_ptr();
+  std::vector<std::string> all_vars = scope->LocalVarNames();
+
+  // We get all the vars from local_scope instead of the ProgramDesc.
+  // Because there exists the case that new parameter variables are not added to
+  // the program in the analysis pass.
+  for (auto &var_name : all_vars) {
+    auto *var = scope->FindLocalVar(var_name);
+    PADDLE_ENFORCE(var != nullptr);
+    if (var->IsType<framework::LoDTensor>() ||
+        var->IsType<framework::Tensor>()) {
+      auto *t = var->GetMutable<framework::LoDTensor>();
+
+      platform::CPUPlace cpu_place;
+      framework::LoDTensor temp_tensor;
+      temp_tensor.Resize(t->dims());
+      temp_tensor.mutable_data<float>(cpu_place);
+
+      // Copy the parameter data to a tmp tensor.
+      TensorCopySync(*t, cpu_place, &temp_tensor);
+      // Reallocation the space on GPU
+      t->mutable_data<float>(place);
+
+      // Copy parameter data to newly allocated GPU space.
+      TensorCopySync(temp_tensor, place, t);
+    }
+  }
+}
+
+std::string IrParamsSyncAmongDevicesPass::repr() const {
+  return "ir-params-sync-among-devices-pass";
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..a95f460df6f9636fc17a5cf76920f5f459385120
--- /dev/null
+++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h
@@ -0,0 +1,39 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/inference/analysis/analysis_pass.h"
+#include "paddle/fluid/platform/place.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+/*
+ * Sync parameter from CPU to GPU.
+ */
+class IrParamsSyncAmongDevicesPass : public AnalysisPass {
+ public:
+  void RunImpl(Argument *argument) override;
+  std::string repr() const override;
+};
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/passes/passes.cc b/paddle/fluid/inference/analysis/passes/passes.cc
index 2ef515f45f2483df8d1238b4758d6729d0299ce9..9245e32cee28473c21e2acbc1c64165d8b475d3b 100644
--- a/paddle/fluid/inference/analysis/passes/passes.cc
+++ b/paddle/fluid/inference/analysis/passes/passes.cc
@@ -16,6 +16,7 @@
 #include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc"
 #include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h"
 #include "paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h"
+#include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h"
 
 namespace paddle {
 namespace inference {
@@ -27,6 +28,9 @@ PassRegistry::PassRegistry() {
                   std::unique_ptr<AnalysisPass>(new IrGraphBuildPass));
   passes_.emplace("ir_analysis_compose_pass",
                   std::unique_ptr<AnalysisPass>(new IrAnalysisComposePass));
+  passes_.emplace(
+      "ir_params_sync_among_devices_pass",
+      std::unique_ptr<AnalysisPass>(new IrParamsSyncAmongDevicesPass));
 }
 
 }  // namespace analysis
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index dd75f0d9a65404908667d873786160ddaa73fa57..dcefdd92f5157dce7426f2f3e4a2bc053ce24775 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -49,10 +49,15 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
   cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_;
   // fields from this.
   enable_ir_optim = other.enable_ir_optim;
+  // For mkldnn
+  use_mkldnn_ = other.use_mkldnn_;
+  mkldnn_enabled_op_types_ = other.mkldnn_enabled_op_types_;
+
   use_feed_fetch_ops = other.use_feed_fetch_ops;
   use_tensorrt_ = other.use_tensorrt_;
   tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_;
   tensorrt_workspace_size_ = other.tensorrt_workspace_size_;
+  model_from_memory_ = other.model_from_memory_;
 
   if (use_gpu) {
     pass_builder_.reset(new GpuPassStrategy(
@@ -76,10 +81,16 @@ contrib::AnalysisConfig::AnalysisConfig(contrib::AnalysisConfig &&other) {
   cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_;
   // fields from this.
   enable_ir_optim = other.enable_ir_optim;
+  // For mkldnn
+  use_mkldnn_ = other.use_mkldnn_;
+  mkldnn_enabled_op_types_ = other.mkldnn_enabled_op_types_;
+
   use_feed_fetch_ops = other.use_feed_fetch_ops;
   use_tensorrt_ = other.use_tensorrt_;
   tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_;
   tensorrt_workspace_size_ = other.tensorrt_workspace_size_;
+  model_from_memory_ = other.model_from_memory_;
+
   pass_builder_ = std::move(other.pass_builder_);
 }
 
@@ -102,4 +113,13 @@ void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size,
   pass_builder()->InsertPass(1, "tensorrt_subgraph_pass");
 }
 
+void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer,
+                                             size_t prog_buffer_size,
+                                             const char *param_buffer,
+                                             size_t param_buffer_size) {
+  prog_file = std::string(prog_buffer, prog_buffer + prog_buffer_size);
+  param_file = std::string(param_buffer, param_buffer + param_buffer_size);
+  model_from_memory_ = true;
+}
+
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 1862f61f0f4b94c9fa9636e876e943113d9aebd4..be51e7fc1f01c5fc4a48c7f32db15bb82a5ddc07 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -190,9 +190,13 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
   }
   VLOG(3) << "predict cost: " << timer.toc() << "ms";
 
-  // Fix TensorArray reuse not cleaned bug.
-  tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get());
-  tensor_array_batch_cleaner_.ResetTensorArray();
+  // All the containers in the scope will be hold in inference, but the
+  // operators assume that the container will be reset after each batch.
+  // Here is a bugfix, collect all the container variables, and reset then to a
+  // bool; the next time, the operator will call MutableData and construct a new
+  // container again, so that the container will be empty for each batch.
+  tensor_array_batch_cleaner_.CollectNoTensorVars(sub_scope_);
+  tensor_array_batch_cleaner_.ResetNoTensorVars();
   return true;
 }
 
@@ -304,6 +308,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
 
   argument_.SetUseGPU(config_.use_gpu);
   argument_.SetGPUDeviceId(config_.device);
+  argument_.SetModelFromMemory(config_.model_from_memory_);
   // Analyze inference_program
   if (!config_.model_dir.empty()) {
     argument_.SetModelDir(config_.model_dir);
@@ -322,6 +327,10 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
     argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_);
   }
 
+  if (config_.use_mkldnn_) {
+    argument_.SetMKLDNNEnabledOpTypes(config_.mkldnn_enabled_op_types_);
+  }
+
   auto passes = config_.pass_builder()->AllPasses();
   if (!config_.enable_ir_optim) passes.clear();
   argument_.SetIrAnalysisPasses(passes);
@@ -417,7 +426,7 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
 bool AnalysisPredictor::ZeroCopyRun() {
   executor_->Run();
   // Fix TensorArray reuse not cleaned bug.
-  tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get());
+  tensor_array_batch_cleaner_.CollectTensorArrays(sub_scope_);
   tensor_array_batch_cleaner_.ResetTensorArray();
   return true;
 }
@@ -444,20 +453,24 @@ bool AnalysisPredictor::LoadProgramDesc() {
     return false;
   }
 
-  std::string pb_content;
-  // Read binary
-  std::ifstream fin(filename, std::ios::in | std::ios::binary);
-  PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", filename);
-  fin.seekg(0, std::ios::end);
-
-  pb_content.resize(fin.tellg());
-  fin.seekg(0, std::ios::beg);
-  fin.read(&(pb_content.at(0)), pb_content.size());
-  fin.close();
-
   // Create ProgramDesc
   framework::proto::ProgramDesc proto;
-  proto.ParseFromString(pb_content);
+  if (!config_.model_from_memory()) {
+    std::string pb_content;
+    // Read binary
+    std::ifstream fin(filename, std::ios::in | std::ios::binary);
+    PADDLE_ENFORCE(static_cast<bool>(fin.is_open()), "Cannot open file %s",
+                   filename);
+    fin.seekg(0, std::ios::end);
+    pb_content.resize(fin.tellg());
+    fin.seekg(0, std::ios::beg);
+    fin.read(&(pb_content.at(0)), pb_content.size());
+    fin.close();
+
+    proto.ParseFromString(pb_content);
+  } else {
+    proto.ParseFromString(config_.prog_file);
+  }
   inference_program_.reset(new framework::ProgramDesc(proto));
   return true;
 }
@@ -465,6 +478,7 @@ bool AnalysisPredictor::LoadProgramDesc() {
 bool AnalysisPredictor::LoadParameters() {
   PADDLE_ENFORCE_NOT_NULL(inference_program_.get(),
                           "The inference program should be loaded first.");
+
   const auto &global_block = inference_program_->MutableBlock(0);
 
   // create a temporary program to load parameters.
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index 74369e886692fef3172d24c637b03a5bcf81a6c2..4c5b412a2c1717b8edbb17c238caaa11aeccebd3 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -154,9 +154,9 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
   }
   VLOG(3) << "predict cost: " << timer.toc() << "ms";
 
-  // Fix TensorArray reuse not cleaned bug.
-  tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get());
-  tensor_array_batch_cleaner_.ResetTensorArray();
+  // For some other vector like containers not cleaned after each batch.
+  tensor_array_batch_cleaner_.CollectNoTensorVars(scope_.get());
+  tensor_array_batch_cleaner_.ResetNoTensorVars();
   return true;
 }
 
diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
index 8fb464c0f5443f116815b14324f6cbc966dc6482..ec93729cd2b379dc2ac39b51df6799b74c8529b6 100644
--- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
+++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
@@ -79,6 +79,16 @@ link_directories("${PADDLE_LIB}/third_party/install/gflags/lib")
 link_directories("${PADDLE_LIB}/third_party/install/xxhash/lib")
 link_directories("${PADDLE_LIB}/paddle/lib")
 
+if (NOT WIN32)
+    set(NGRAPH_PATH "${PADDLE_LIB}/third_party/install/ngraph")
+    if(EXISTS ${NGRAPH_PATH})
+        include(GNUInstallDirs)
+        include_directories("${NGRAPH_PATH}/include")
+        link_directories("${NGRAPH_PATH}/${CMAKE_INSTALL_LIBDIR}")
+        set(NGRAPH_LIB ${NGRAPH_PATH}/${CMAKE_INSTALL_LIBDIR}/libngraph${CMAKE_SHARED_LIBRARY_SUFFIX})
+    endif()
+endif()
+
 add_executable(${DEMO_NAME} ${DEMO_NAME}.cc)
 
 if(WITH_MKL)
@@ -106,7 +116,7 @@ endif()
 if (NOT WIN32)
 set(EXTERNAL_LIB "-lrt -ldl -lpthread")
 set(DEPS ${DEPS}
-    ${MATH_LIB} ${MKLDNN_LIB}
+    ${MATH_LIB} ${MKLDNN_LIB} ${NGRAPH_LIB}
     glog gflags protobuf snappystream snappy z xxhash
     ${EXTERNAL_LIB})
 else()
diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh
index ff718077c1ba6b10fe87aac10d84f96a23ad6bba..a94ccfa92439a735e101c7e5709909abea062ff8 100755
--- a/paddle/fluid/inference/api/demo_ci/run.sh
+++ b/paddle/fluid/inference/api/demo_ci/run.sh
@@ -54,6 +54,9 @@ mkdir -p build
 cd build
 
 for WITH_STATIC_LIB in ON OFF; do
+# TODO(Superjomn) reopen this
+# something wrong with the TensorArray reset.
+:<<D
   # -----simple_on_word2vec-----
   rm -rf *
   cmake .. -DPADDLE_LIB=${inference_install_dir} \
@@ -74,6 +77,7 @@ for WITH_STATIC_LIB in ON OFF; do
       fi
     done
   fi
+D
   # ---------vis_demo---------
   rm -rf *
   cmake .. -DPADDLE_LIB=${inference_install_dir} \
diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.cc b/paddle/fluid/inference/api/details/reset_tensor_array.cc
index 4ae6c6dc9f44650c1c62f5be5448864d817513b1..569a487328e2f1febe2ca5014b232dbd51d28079 100644
--- a/paddle/fluid/inference/api/details/reset_tensor_array.cc
+++ b/paddle/fluid/inference/api/details/reset_tensor_array.cc
@@ -46,5 +46,28 @@ void TensorArrayBatchCleaner::ResetTensorArray() {
   }
 }
 
+void TensorArrayBatchCleaner::CollectNoTensorVars(framework::Scope *scope) {
+  if (no_tensor_flag_) {
+    for (auto &var_name : scope->LocalVarNames()) {
+      auto *var = scope->FindVar(var_name);
+      if (!var->IsInitialized()) continue;
+      if (!valid_types_.count(var->Type())) {
+        no_tensor_vars_.insert(var);
+      }
+    }
+
+    for (auto *kid : scope->kids()) {
+      CollectTensorArrays(kid);
+    }
+    no_tensor_flag_ = false;  // Only collect one time.
+  }
+}
+
+void TensorArrayBatchCleaner::ResetNoTensorVars() {
+  for (auto *var : no_tensor_vars_) {
+    var->Clear();
+  }
+}
+
 }  // namespace details
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.h b/paddle/fluid/inference/api/details/reset_tensor_array.h
index a39449ff0e67786815dfb8d2d30d79dcdba757d7..6a5ea64de66fcac44117d0d8f7798e8875703ec6 100644
--- a/paddle/fluid/inference/api/details/reset_tensor_array.h
+++ b/paddle/fluid/inference/api/details/reset_tensor_array.h
@@ -14,9 +14,11 @@
 
 #pragma once
 
+#include <unordered_set>
 #include <vector>
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/variable.h"
 
 namespace paddle {
 namespace details {
@@ -24,13 +26,28 @@ namespace details {
 // Clean the TensorArray each batch to make the behavior the same with the
 // training phase.
 struct TensorArrayBatchCleaner {
+  TensorArrayBatchCleaner() {
+    valid_types_.insert(typeid(framework::Tensor));
+    valid_types_.insert(typeid(framework::LoDTensor));
+  }
+  // Collect the variables that are not Tensor or LoDTensor, and reset them to a
+  // bool(trick), because some of them are containers, and some operators just
+  // keep inserting new items without clearing the containers first; So the
+  // memory grow larger and larger in inference service deployed online.
+  void CollectNoTensorVars(framework::Scope *scope);
+  void ResetNoTensorVars();
+
   // Fix the tensor array not clear in the inference scenarios.
   void CollectTensorArrays(framework::Scope *scope);
   void ResetTensorArray();
 
  private:
   bool flag_{true};
+  bool no_tensor_flag_{true};
   std::vector<framework::LoDTensorArray *> arrays_;
+
+  std::unordered_set<std::type_index> valid_types_;
+  std::unordered_set<framework::Variable *> no_tensor_vars_;
 };
 
 }  // namespace details
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index a09bd1cac2aa31b2ecee9b0f77d2b777104f1161..f05b9832da55f10b34eb2df914e443a478e5a4a4 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -16,6 +16,7 @@
 #include <cassert>
 #include <memory>
 #include <string>
+#include <unordered_set>
 #include <vector>
 
 // Here we include some header files with relative paths, for that in deploy,
@@ -52,18 +53,26 @@ struct AnalysisConfig : public NativeConfig {
   bool use_tensorrt() const { return use_tensorrt_; }
 
   void EnableMKLDNN();
-  // NOTE this is just for internal development, please not use it.
-  // NOT stable yet.
   bool use_mkldnn() const { return use_mkldnn_; }
+  void SetMKLDNNOp(std::unordered_set<std::string> op_list) {
+    mkldnn_enabled_op_types_ = op_list;
+  }
+
+  // Specify the memory buffer of program and parameter
+  void SetModelBuffer(const char* prog_buffer, size_t prog_buffer_size,
+                      const char* program_buffer, size_t program_buffer_size);
+  bool model_from_memory() const { return model_from_memory_; }
 
   friend class ::paddle::AnalysisPredictor;
 
  protected:
   bool use_tensorrt_{false};
   bool use_mkldnn_{false};
+  std::unordered_set<std::string> mkldnn_enabled_op_types_;
   int tensorrt_workspace_size_;
   int tensorrt_max_batchsize_;
   std::unique_ptr<PassStrategy> pass_builder_;
+  bool model_from_memory_{false};
 };
 
 // Configurations for Anakin engine.
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h
index 12e3a6f42e14010feedbbb5d8f8a98f60cea4556..bc5139a7e54eaf7133ea96ae3b36915a236a2c5e 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.h
+++ b/paddle/fluid/inference/api/paddle_pass_builder.h
@@ -98,9 +98,10 @@ class CpuPassStrategy : public PassStrategy {
     passes_.insert(passes_.begin(), "mkldnn_placement_pass");
 
     for (auto &pass :
-         std::vector<std::string>({"depthwise_conv_mkldnn_pass",  //
-                                   "conv_bias_mkldnn_fuse_pass",  //
-                                   "conv_relu_mkldnn_fuse_pass",  //
+         std::vector<std::string>({"depthwise_conv_mkldnn_pass",    //
+                                   "conv_bias_mkldnn_fuse_pass",    //
+                                   "conv3d_bias_mkldnn_fuse_pass",  //
+                                   "conv_relu_mkldnn_fuse_pass",    //
                                    "conv_elementwise_add_mkldnn_fuse_pass"})) {
       passes_.push_back(pass);
     }
@@ -116,12 +117,8 @@ class CpuPassStrategy : public PassStrategy {
 class GpuPassStrategy : public PassStrategy {
  public:
   GpuPassStrategy() : PassStrategy({}) {
-    // TODO(NHZlX) Problem with Data synchronization between GPU and CPU
-    // When running in GPU mode, the parameters are all on GPU. But the
-    // opearations of "conv_bn_fuse_pass" are on CPU.
     passes_.assign({
-        "infer_clean_graph_pass",
-        // "infer_clean_graph_pass", "conv_bn_fuse_pass",
+        "infer_clean_graph_pass", "conv_bn_fuse_pass",
     });
   }
 
diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc
index 31f43bfdcaafb18c611d86ef26fd9de118562799..24d15f12f9cd4a9280cd316bd727fdbccb831b9b 100644
--- a/paddle/fluid/inference/io.cc
+++ b/paddle/fluid/inference/io.cc
@@ -69,7 +69,8 @@ bool IsPersistable(const framework::VarDesc* var) {
 void LoadPersistables(framework::Executor* executor, framework::Scope* scope,
                       const framework::ProgramDesc& main_program,
                       const std::string& dirname,
-                      const std::string& param_filename) {
+                      const std::string& param_filename,
+                      bool model_from_memory = false) {
   const framework::BlockDesc& global_block = main_program.Block(0);
 
   framework::ProgramDesc* load_program = new framework::ProgramDesc();
@@ -108,6 +109,7 @@ void LoadPersistables(framework::Executor* executor, framework::Scope* scope,
     op->SetType("load_combine");
     op->SetOutput("Out", paramlist);
     op->SetAttr("file_path", {param_filename});
+    op->SetAttr("model_from_memory", {model_from_memory});
     op->CheckAttrs();
   }
 
@@ -130,16 +132,17 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
                  "model version %ld is not supported.",
                  main_program->Version());
 
-  LoadPersistables(executor, scope, *main_program, dirname, "");
+  // model_from_memory is false in seperate parameters.
+  LoadPersistables(executor, scope, *main_program, dirname, "",
+                   false /* model_from_memory */);
   return main_program;
 }
 
 std::unique_ptr<framework::ProgramDesc> Load(
     framework::Executor* executor, framework::Scope* scope,
     const std::string& prog_filename, const std::string& param_filename) {
-  std::string model_filename = prog_filename;
   std::string program_desc_str;
-  ReadBinaryFile(model_filename, &program_desc_str);
+  ReadBinaryFile(prog_filename, &program_desc_str);
 
   std::unique_ptr<framework::ProgramDesc> main_program(
       new framework::ProgramDesc(program_desc_str));
@@ -147,7 +150,22 @@ std::unique_ptr<framework::ProgramDesc> Load(
                  "model version %ld is not supported.",
                  main_program->Version());
 
-  LoadPersistables(executor, scope, *main_program, "", param_filename);
+  LoadPersistables(executor, scope, *main_program, "", param_filename,
+                   false /* model_from_memory */);
+  return main_program;
+}
+
+std::unique_ptr<framework::ProgramDesc> LoadFromMemory(
+    framework::Executor* executor, framework::Scope* scope,
+    const std::string& prog_buffer, const std::string& param_buffer) {
+  std::unique_ptr<framework::ProgramDesc> main_program(
+      new framework::ProgramDesc(prog_buffer));
+  PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()),
+                 "model version %ld is not supported.",
+                 main_program->Version());
+
+  LoadPersistables(executor, scope, *main_program, "", param_buffer,
+                   true /* model_filename */);
   return main_program;
 }
 
diff --git a/paddle/fluid/inference/io.h b/paddle/fluid/inference/io.h
index ab492577c1476abee30d6dd1c740394391e5a93a..317ef9d93acf3af174cb44da6099425fff1418eb 100644
--- a/paddle/fluid/inference/io.h
+++ b/paddle/fluid/inference/io.h
@@ -30,7 +30,8 @@ void Init(const std::vector<std::string> argv);
 void LoadPersistables(framework::Executor* executor, framework::Scope* scope,
                       const framework::ProgramDesc& main_program,
                       const std::string& dirname,
-                      const std::string& param_filename);
+                      const std::string& param_filename,
+                      bool model_from_memory);
 
 std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
                                              framework::Scope* scope,
@@ -41,6 +42,10 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
                                              const std::string& prog_filename,
                                              const std::string& param_filename);
 
+std::unique_ptr<framework::ProgramDesc> LoadFromMemory(
+    framework::Executor* executor, framework::Scope* scope,
+    const std::string& prog_buffer, const std::string& param_buffer);
+
 // Save the variables from a scope to disk.
 void SaveVars(const framework::Scope& scope,
               const std::vector<std::string>& vars, const std::string& dirname,
diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
index 343fd3f7c5aed6931fc215445c17d3ed7074368e..1d0d83d1f368f879878a4df8b2eefae0bc89423d 100644
--- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
@@ -109,8 +109,12 @@ class Pool2dOpConverter : public OpConverter {
     }
 
     if (pool_type == "max") {
-      nvinfer1::DimsHW pre_pad(paddings[0], paddings[1]);
-      nvinfer1::DimsHW post_pad(paddings[0], paddings[1]);
+      // Under ceil mode, the pre_pad and post_pad are used to
+      // record the the padding size. In some ceil mode cases,
+      // we do not need padding, so we initialize the two vars to 0.
+
+      nvinfer1::DimsHW pre_pad(0, 0);
+      nvinfer1::DimsHW post_pad(0, 0);
       if (ceil_mode) {
         // If ceil mode is true, we will pad the appropriate size to the input.
         DealCeilMode(input_shape, ksize, strides, paddings, &pre_pad, &post_pad,
diff --git a/paddle/fluid/inference/tensorrt/convert/test_prelu_op.cc b/paddle/fluid/inference/tensorrt/convert/test_prelu_op.cc
index 453f222f1f1e3f3b9ee8fa7bd49f4cab2286e7ea..b086c910d38a243d98315f2d6eb82ecc0ec5c06d 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_prelu_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_prelu_op.cc
@@ -90,5 +90,4 @@ TEST(prelu_op, test_scalar) {
 }  // namespace inference
 }  // namespace paddle
 
-// USE_OP(prelu);
-USE_CPU_ONLY_OP(prelu);
+USE_OP(prelu);
diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
index e822785ad6f4f6f67b72141f3e7b04aefa72e58b..95443e813327c1247ac530c4d2e68b3607ff0e73 100644
--- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
@@ -1,4 +1,4 @@
 nv_library(tensorrt_plugin
            SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu prelu_op_plugin.cu
            avg_pool_op_plugin.cu
-           DEPS enforce tensorrt_engine)
+           DEPS enforce tensorrt_engine prelu)
diff --git a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu
index e8f4254402a5d8a5e6c5a2384bf9fbe48341956e..3075e87ea6d719a3f49d14c8c4b8015f7d688a50 100644
--- a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu
@@ -14,92 +14,16 @@
 
 #include <stdio.h>
 #include <cassert>
+#include <vector>
 #include "glog/logging.h"
 #include "paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h"
+#include "paddle/fluid/operators/math/prelu.h"
 
 namespace paddle {
 namespace inference {
 namespace tensorrt {
 namespace plugin {
 
-static const int CUDA_NUM_THREADS = 1024;
-static const int CUDA_MAX_NUM_BLOCKS = 65535;
-inline static int GET_NUM_BLOCKS(const int N) {
-  return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
-}
-
-__global__ void PReluChannelWiseKernel(const float *input, const float *alpha,
-                                       float *output, int channel,
-                                       size_t spatial_size) {
-  size_t offset = blockIdx.x * spatial_size;
-  const float *in = input + offset;
-  float *out = output + offset;
-  float scale = alpha[blockIdx.x % channel];
-
-  for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) {
-    float x = in[i];
-    out[i] = (x > 0) ? x : scale * x;
-  }
-}
-
-__global__ void PReluElementWiseKernel(const float *input, const float *alpha,
-                                       float *output, size_t spatial_size) {
-  size_t offset = blockIdx.x * spatial_size;
-  const float *in = input + offset;
-  const float *scale = alpha + offset;
-  float *out = output + offset;
-
-  for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) {
-    float x = in[i];
-    out[i] = (x > 0) ? x : scale[i] * x;
-  }
-}
-
-__global__ void PReluScalarKernel(const float *input, const float *alpha,
-                                  float *output, size_t spatial_size) {
-  size_t offset = blockIdx.x * spatial_size;
-  const float *in = input + offset;
-  float scale = *alpha;
-  float *out = output + offset;
-
-  for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) {
-    float x = in[i];
-    out[i] = (x > 0) ? x : scale * x;
-  }
-}
-
-static inline void PReluChannelWise(cudaStream_t stream, const float *input,
-                                    const float *alpha, float *output,
-                                    int batch_size,
-                                    const nvinfer1::Dims &dims) {
-  size_t unroll = batch_size * dims.d[0];
-  size_t spatial_size = dims.d[1] * dims.d[2];
-  CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
-  PReluChannelWiseKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
-      input, alpha, output, dims.d[0], spatial_size);
-}
-
-static inline void PReluElementWise(cudaStream_t stream, const float *input,
-                                    const float *alpha, float *output,
-                                    int batch_size,
-                                    const nvinfer1::Dims &dims) {
-  size_t unroll = batch_size * dims.d[0];
-  size_t spatial_size = dims.d[1] * dims.d[2];
-  CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
-  PReluElementWiseKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
-      input, alpha, output, spatial_size);
-}
-
-static inline void PReluScalar(cudaStream_t stream, const float *input,
-                               const float *alpha, float *output,
-                               int batch_size, const nvinfer1::Dims &dims) {
-  size_t unroll = batch_size * dims.d[0];
-  size_t spatial_size = dims.d[1] * dims.d[2];
-  CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
-  PReluScalarKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
-      input, alpha, output, spatial_size);
-}
-
 nvinfer1::Dims PReluPlugin::getOutputDimensions(int index,
                                                 const nvinfer1::Dims *inputDims,
                                                 int nbInputs) {
@@ -110,19 +34,31 @@ nvinfer1::Dims PReluPlugin::getOutputDimensions(int index,
   return output_dims;
 }
 
-int PReluPlugin::enqueue(int batchSize, const void *const *inputs,
+int PReluPlugin::enqueue(int batch_size, const void *const *inputs,
                          void **outputs, void *workspace, cudaStream_t stream) {
   // input dims is CHW.
   const auto &input_dims = this->getInputDims(0);
   const float *input = reinterpret_cast<const float *>(inputs[0]);
   const float *alpha = reinterpret_cast<const float *>(alpha_.get().values);
   float *output = reinterpret_cast<float **>(outputs)[0];
+
+  std::vector<int> input_shape;
+  input_shape.push_back(batch_size);
+  for (int i = 0; i < input_dims.nbDims; i++) {
+    input_shape.push_back(input_dims.d[i]);
+  }
+
   if (mode_ == "channel") {
-    PReluChannelWise(stream, input, alpha, output, batchSize, input_dims);
+    operators::math::PreluChannelWiseDirectCUDAFunctor<float>
+        prelu_channel_wise;
+    prelu_channel_wise(stream, input, alpha, output, input_shape);
   } else if (mode_ == "element") {
-    PReluElementWise(stream, input, alpha, output, batchSize, input_dims);
+    operators::math::PreluElementWiseDirectCUDAFunctor<float>
+        prelu_element_wise;
+    prelu_element_wise(stream, input, alpha, output, input_shape);
   } else {
-    PReluScalar(stream, input, alpha, output, batchSize, input_dims);
+    operators::math::PreluScalarDirectCUDAFunctor<float> prelu_scalar;
+    prelu_scalar(stream, input, alpha, output, input_shape);
   }
   return cudaGetLastError() != cudaSuccess;
 }
diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index 7dc88d9dd052c59aaa59b7802ee5a38ea9d89bc6..8a4bc04b67879918c6ac8d1b40dae68a107034d4 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(INFERENCE_EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor)
+set(INFERENCE_EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor benchmark)
 
 if(WITH_GPU AND TENSORRT_FOUND)
     set(INFERENCE_EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} analysis ${analysis_deps} ir_pass_manager analysis_predictor)
@@ -46,11 +46,18 @@ set(RNN2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn2")
 download_model_and_data(${RNN2_INSTALL_DIR} "rnn2_model.tar.gz" "rnn2_data.txt.tar.gz")
 inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2_tester.cc)
 
-# DAM
+# normal DAM
 set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam")
 download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz")
 inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc)
 
+# small DAM
+set(DAM_SMALL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_dam")
+download_model_and_data(${DAM_SMALL_INSTALL_DIR} "dam_small_model.tar.gz" "dam_small_data.txt.tar.gz")
+inference_analysis_test(test_analyzer_small_dam SRCS analyzer_dam_tester.cc
+        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
+        ARGS --infer_model=${DAM_SMALL_INSTALL_DIR}/model --infer_data=${DAM_SMALL_INSTALL_DIR}/data.txt --max_turn_num=1)
+
 # chinese_ner
 set(CHINESE_NER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/chinese_ner")
 download_model_and_data(${CHINESE_NER_INSTALL_DIR} "chinese_ner_model.tar.gz" "chinese_ner-data.txt.tar.gz")
diff --git a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
index b369cba5c8b3f8aadd1123d6b7345fad6e47bd0f..227e2ff45873fded45899146b97a7bee0c8ad763 100644
--- a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
@@ -14,38 +14,54 @@
 
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 
+DEFINE_int32(max_turn_num, 9,
+             "The max turn number: 1 for the small and 9 for the normal.");
+
 namespace paddle {
 namespace inference {
 using contrib::AnalysisConfig;
-#define MAX_TURN_NUM 9
-#define MAX_TURN_LEN 50
+
+constexpr int32_t kMaxTurnLen = 50;
+
 static std::vector<float> result_data;
 
 struct DataRecord {
-  std::vector<std::vector<int64_t>>
-      turns[MAX_TURN_NUM];  // turns data : MAX_TURN_NUM
-  std::vector<std::vector<float>>
-      turns_mask[MAX_TURN_NUM];                // turns mask data : MAX_TURN_NUM
-  std::vector<std::vector<int64_t>> response;  // response data : 1
+  std::vector<std::vector<int64_t>> *turns;
+  std::vector<std::vector<float>> *turns_mask;
+  std::vector<std::vector<int64_t>> response;     // response data : 1
   std::vector<std::vector<float>> response_mask;  // response mask data : 1
   size_t batch_iter{0};
   size_t batch_size{1};
   size_t num_samples;  // total number of samples
-  DataRecord() = default;
+
+  DataRecord() {
+    turns = new std::vector<std::vector<
+        int64_t>>[FLAGS_max_turn_num];  // turns data : FLAGS_max_turn_num
+    turns_mask = new std::vector<std::vector<
+        float>>[FLAGS_max_turn_num];  // turns mask data : FLAGS_max_turn_num
+  }
+
   explicit DataRecord(const std::string &path, int batch_size = 1)
-      : batch_size(batch_size) {
+      : DataRecord() {
+    this->batch_size = batch_size;
     Load(path);
   }
+
+  ~DataRecord() {
+    delete[] turns;
+    delete[] turns_mask;
+  }
+
   DataRecord NextBatch() {
     DataRecord data;
     size_t batch_end = batch_iter + batch_size;
     // NOTE skip the final batch, if no enough data is provided.
     if (batch_end <= response.size()) {
-      for (int i = 0; i < MAX_TURN_NUM; ++i) {
+      for (int i = 0; i < FLAGS_max_turn_num; ++i) {
         data.turns[i].assign(turns[i].begin() + batch_iter,
                              turns[i].begin() + batch_end);
       }
-      for (int i = 0; i < MAX_TURN_NUM; ++i) {
+      for (int i = 0; i < FLAGS_max_turn_num; ++i) {
         data.turns_mask[i].assign(turns_mask[i].begin() + batch_iter,
                                   turns_mask[i].begin() + batch_end);
       }
@@ -60,6 +76,7 @@ struct DataRecord {
     batch_iter += batch_size;
     return data;
   }
+
   void Load(const std::string &path) {
     std::ifstream file(path);
     std::string line;
@@ -69,30 +86,30 @@ struct DataRecord {
       num_lines++;
       std::vector<std::string> data;
       split(line, ',', &data);
-      CHECK_EQ(data.size(), (size_t)(2 * MAX_TURN_NUM + 3));
+      CHECK_EQ(data.size(), (size_t)(2 * FLAGS_max_turn_num + 3));
       // load turn data
-      std::vector<int64_t> turns_tmp[MAX_TURN_NUM];
-      for (int i = 0; i < MAX_TURN_NUM; ++i) {
+      std::vector<int64_t> turns_tmp[FLAGS_max_turn_num];
+      for (int i = 0; i < FLAGS_max_turn_num; ++i) {
         split_to_int64(data[i], ' ', &turns_tmp[i]);
         turns[i].push_back(std::move(turns_tmp[i]));
       }
       // load turn_mask data
-      std::vector<float> turns_mask_tmp[MAX_TURN_NUM];
-      for (int i = 0; i < MAX_TURN_NUM; ++i) {
-        split_to_float(data[MAX_TURN_NUM + i], ' ', &turns_mask_tmp[i]);
+      std::vector<float> turns_mask_tmp[FLAGS_max_turn_num];
+      for (int i = 0; i < FLAGS_max_turn_num; ++i) {
+        split_to_float(data[FLAGS_max_turn_num + i], ' ', &turns_mask_tmp[i]);
         turns_mask[i].push_back(std::move(turns_mask_tmp[i]));
       }
       // load response data
       std::vector<int64_t> response_tmp;
-      split_to_int64(data[2 * MAX_TURN_NUM], ' ', &response_tmp);
+      split_to_int64(data[2 * FLAGS_max_turn_num], ' ', &response_tmp);
       response.push_back(std::move(response_tmp));
       // load response_mask data
       std::vector<float> response_mask_tmp;
-      split_to_float(data[2 * MAX_TURN_NUM + 1], ' ', &response_mask_tmp);
+      split_to_float(data[2 * FLAGS_max_turn_num + 1], ' ', &response_mask_tmp);
       response_mask.push_back(std::move(response_mask_tmp));
       // load result data
       float result_tmp;
-      result_tmp = std::stof(data[2 * MAX_TURN_NUM + 2]);
+      result_tmp = std::stof(data[2 * FLAGS_max_turn_num + 2]);
       result_data.push_back(result_tmp);
     }
     num_samples = num_lines;
@@ -101,8 +118,8 @@ struct DataRecord {
 
 void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
                    int batch_size) {
-  PaddleTensor turns_tensor[MAX_TURN_NUM];
-  PaddleTensor turns_mask_tensor[MAX_TURN_NUM];
+  PaddleTensor turns_tensor[FLAGS_max_turn_num];
+  PaddleTensor turns_mask_tensor[FLAGS_max_turn_num];
   PaddleTensor response_tensor;
   PaddleTensor response_mask_tensor;
   std::string turn_pre = "turn_";
@@ -110,16 +127,16 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
 
   auto one_batch = data->NextBatch();
   int size = one_batch.response[0].size();
-  CHECK_EQ(size, MAX_TURN_LEN);
+  CHECK_EQ(size, kMaxTurnLen);
   // turn tensor assignment
-  for (int i = 0; i < MAX_TURN_NUM; ++i) {
+  for (int i = 0; i < FLAGS_max_turn_num; ++i) {
     turns_tensor[i].name = turn_pre + std::to_string(i);
     turns_tensor[i].shape.assign({batch_size, size, 1});
     turns_tensor[i].dtype = PaddleDType::INT64;
     TensorAssignData<int64_t>(&turns_tensor[i], one_batch.turns[i]);
   }
   // turn mask tensor assignment
-  for (int i = 0; i < MAX_TURN_NUM; ++i) {
+  for (int i = 0; i < FLAGS_max_turn_num; ++i) {
     turns_mask_tensor[i].name = turn_mask_pre + std::to_string(i);
     turns_mask_tensor[i].shape.assign({batch_size, size, 1});
     turns_mask_tensor[i].dtype = PaddleDType::FLOAT32;
@@ -137,10 +154,10 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
   TensorAssignData<float>(&response_mask_tensor, one_batch.response_mask);
 
   // Set inputs.
-  for (int i = 0; i < MAX_TURN_NUM; ++i) {
+  for (int i = 0; i < FLAGS_max_turn_num; ++i) {
     input_slots->push_back(std::move(turns_tensor[i]));
   }
-  for (int i = 0; i < MAX_TURN_NUM; ++i) {
+  for (int i = 0; i < FLAGS_max_turn_num; ++i) {
     input_slots->push_back(std::move(turns_mask_tensor[i]));
   }
   input_slots->push_back(std::move(response_tensor));
@@ -171,10 +188,16 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
 }
 
 // Easy for profiling independently.
-TEST(Analyzer_dam, profile) {
+void profile(bool use_mkldnn = false) {
   contrib::AnalysisConfig cfg;
   SetConfig(&cfg);
 
+  if (use_mkldnn) {
+    cfg.EnableMKLDNN();
+    std::unordered_set<std::string> op_list = {"conv3d"};
+    cfg.SetMKLDNNOp(op_list);
+  }
+
   std::vector<PaddleTensor> outputs;
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
@@ -192,6 +215,11 @@ TEST(Analyzer_dam, profile) {
   }
 }
 
+TEST(Analyzer_dam, profile) { profile(); }
+#ifdef PADDLE_WITH_MKLDNN
+TEST(Analyzer_dam, profile_mkldnn) { profile(true /* use_mkldnn */); }
+#endif
+
 // Check the fuse status
 TEST(Analyzer_dam, fuse_statis) {
   contrib::AnalysisConfig cfg;
@@ -202,14 +230,17 @@ TEST(Analyzer_dam, fuse_statis) {
   auto fuse_statis = GetFuseStatis(
       static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
   ASSERT_TRUE(fuse_statis.count("fc_fuse"));
-  EXPECT_EQ(fuse_statis.at("fc_fuse"), 317);
-  EXPECT_EQ(num_ops, 2020);
 }
 
 // Compare result of NativeConfig and AnalysisConfig
-TEST(Analyzer_dam, compare) {
-  contrib::AnalysisConfig cfg;
+void compare(bool use_mkldnn = false) {
+  AnalysisConfig cfg;
   SetConfig(&cfg);
+  if (use_mkldnn) {
+    cfg.EnableMKLDNN();
+    std::unordered_set<std::string> op_list = {"conv3d"};
+    cfg.SetMKLDNNOp(op_list);
+  }
 
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
@@ -218,5 +249,10 @@ TEST(Analyzer_dam, compare) {
       reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
 }
 
+TEST(Analyzer_dam, compare) { compare(); }
+#ifdef PADDLE_WITH_MKLDNN
+TEST(Analyzer_dam, compare_mkldnn) { compare(true /* use_mkldnn */); }
+#endif
+
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
index 3a5f844de3cae7eb9b6e3555c5219c6cf8ee1919..66d85420c5701b1bf308b6850465beb6d8a0b703 100644
--- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
@@ -93,9 +93,17 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
   }
 }
 
-void SetConfig(contrib::AnalysisConfig *cfg) {
-  cfg->prog_file = FLAGS_infer_model + "/__model__";
-  cfg->param_file = FLAGS_infer_model + "/param";
+void SetConfig(contrib::AnalysisConfig *cfg, bool memory_load = false) {
+  if (memory_load) {
+    std::string buffer_prog, buffer_param;
+    ReadBinaryFile(FLAGS_infer_model + "/__model__", &buffer_prog);
+    ReadBinaryFile(FLAGS_infer_model + "/param", &buffer_param);
+    cfg->SetModelBuffer(&buffer_prog[0], buffer_prog.size(), &buffer_param[0],
+                        buffer_param.size());
+  } else {
+    cfg->prog_file = FLAGS_infer_model + "/__model__";
+    cfg->param_file = FLAGS_infer_model + "/param";
+  }
   cfg->use_gpu = false;
   cfg->device = 0;
   cfg->specify_input_name = true;
@@ -114,9 +122,9 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
 }
 
 // Easy for profiling independently.
-TEST(Analyzer_Chinese_ner, profile) {
+void profile(bool memory_load = false) {
   contrib::AnalysisConfig cfg;
-  SetConfig(&cfg);
+  SetConfig(&cfg, memory_load);
   std::vector<PaddleTensor> outputs;
 
   std::vector<std::vector<PaddleTensor>> input_slots_all;
@@ -138,6 +146,12 @@ TEST(Analyzer_Chinese_ner, profile) {
   }
 }
 
+TEST(Analyzer_Chinese_ner, profile) { profile(); }
+
+TEST(Analyzer_Chinese_ner, profile_memory_load) {
+  profile(true /* memory_load */);
+}
+
 // Check the fuse status
 TEST(Analyzer_Chinese_ner, fuse_statis) {
   contrib::AnalysisConfig cfg;
diff --git a/paddle/fluid/inference/tests/api/config_printer.h b/paddle/fluid/inference/tests/api/config_printer.h
index 4231eef7220735d0b80eb1adc951c55ff7378f1b..7046bce303e2bd46197ab512ae273500b9af88bf 100644
--- a/paddle/fluid/inference/tests/api/config_printer.h
+++ b/paddle/fluid/inference/tests/api/config_printer.h
@@ -49,8 +49,6 @@ std::ostream &operator<<(std::ostream &os, const NativeConfig &config) {
   os << GenSpaces(num_spaces) << "device: " << config.device << "\n";
   os << GenSpaces(num_spaces)
      << "fraction_of_gpu_memory: " << config.fraction_of_gpu_memory << "\n";
-  os << GenSpaces(num_spaces) << "prog_file: " << config.prog_file << "\n";
-  os << GenSpaces(num_spaces) << "param_file: " << config.param_file << "\n";
   os << GenSpaces(num_spaces)
      << "specify_input_name: " << config.specify_input_name << "\n";
   os << GenSpaces(num_spaces)
@@ -65,6 +63,13 @@ std::ostream &operator<<(std::ostream &os,
   os << GenSpaces(num_spaces) << "contrib::AnalysisConfig {\n";
   num_spaces++;
   os << *reinterpret_cast<const NativeConfig *>(&config);
+  if (!config.model_from_memory()) {
+    os << GenSpaces(num_spaces) << "prog_file: " << config.prog_file << "\n";
+    os << GenSpaces(num_spaces) << "param_file: " << config.param_file << "\n";
+  } else {
+    os << GenSpaces(num_spaces)
+       << "prog_file and param_file: load from memory \n";
+  }
   os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.enable_ir_optim
      << "\n";
   os << GenSpaces(num_spaces)
diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h
index d572ea0177c1e398229a02718ca31cc78a7059ef..8209a049f4614fe31c22c4e83c1968411b749b49 100644
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -30,8 +30,10 @@
 #include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/tests/api/config_printer.h"
 #include "paddle/fluid/inference/tests/test_helper.h"
+#include "paddle/fluid/inference/utils/benchmark.h"
 #include "paddle/fluid/platform/profiler.h"
 
+DEFINE_string(model_name, "", "model name");
 DEFINE_string(infer_model, "", "model path");
 DEFINE_string(infer_data, "", "data file");
 DEFINE_int32(batch_size, 1, "batch size.");
@@ -40,6 +42,8 @@ DEFINE_bool(test_all_data, false, "Test the all dataset in data file.");
 DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
 DEFINE_bool(use_analysis, true,
             "Running the inference program in analysis mode.");
+DEFINE_bool(record_benchmark, false,
+            "Record benchmark after profiling the model");
 
 DECLARE_bool(profile);
 DECLARE_int32(paddle_num_threads);
@@ -192,8 +196,16 @@ void TestOneThreadPrediction(
         predictor->Run(inputs[j], outputs, batch_size);
       }
     }
-    PrintTime(batch_size, num_times, 1, 0, run_timer.toc() / num_times,
-              inputs.size());
+
+    double latency = run_timer.toc() / num_times;
+    PrintTime(batch_size, num_times, 1, 0, latency, inputs.size());
+    if (FLAGS_record_benchmark) {
+      Benchmark benchmark;
+      benchmark.SetName(FLAGS_model_name);
+      benchmark.SetBatchSize(batch_size);
+      benchmark.SetLatency(latency);
+      benchmark.PersistToFile("benchmark_record.txt");
+    }
   }
 }
 
diff --git a/paddle/fluid/inference/tests/api/trt_models_tester.cc b/paddle/fluid/inference/tests/api/trt_models_tester.cc
index ef612ce6148329c33f194842945bb5438afcf645..9eb3fb5da1065f14d9ad1c3520f6415fbadfdca1 100644
--- a/paddle/fluid/inference/tests/api/trt_models_tester.cc
+++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc
@@ -135,6 +135,9 @@ TEST(TensorRT_resnext50, compare) {
 
 TEST(TensorRT_resnext50, profile) {
   std::string model_dir = FLAGS_infer_model + "/resnext50";
+  // Set FLAGS_record_benchmark to true to record benchmark to file.
+  // FLAGS_record_benchmark=true;
+  FLAGS_model_name = "resnext50";
   profile(model_dir, /* use_analysis */ true, FLAGS_use_tensorrt);
 }
 
diff --git a/paddle/fluid/inference/utils/CMakeLists.txt b/paddle/fluid/inference/utils/CMakeLists.txt
index 2104e4ac7222258ee025bd5acd60b1db251df654..cfb80fe6ec11a55a887c7552ec4e6a8a0c6a2fce 100644
--- a/paddle/fluid/inference/utils/CMakeLists.txt
+++ b/paddle/fluid/inference/utils/CMakeLists.txt
@@ -1,2 +1,7 @@
 cc_library(benchmark SRCS benchmark.cc DEPS enforce)
 cc_test(test_benchmark SRCS benchmark_tester.cc DEPS benchmark)
+cc_binary(visualizer SRCS visualizer.cc DEPS analysis
+    paddle_pass_builder ir_pass_manager pass graph_viz_pass analysis_passes)
+if(WIN32)
+  target_link_libraries(visualizer shlwapi)
+endif(WIN32)
diff --git a/paddle/fluid/inference/utils/benchmark.cc b/paddle/fluid/inference/utils/benchmark.cc
index 021edc2de5e90023fcd1431dd2025450e7462bd9..0bd526bcac2d9ceda95730dc3c5210aed8ccfb5c 100644
--- a/paddle/fluid/inference/utils/benchmark.cc
+++ b/paddle/fluid/inference/utils/benchmark.cc
@@ -30,10 +30,10 @@ std::string Benchmark::SerializeToString() const {
   ss << '\n';
 
   ss << name_ << "\t";
-  ss << batch_size_ << "\t";
+  ss << batch_size_ << "\t\t";
   ss << num_threads_ << "\t";
   ss << latency_ << "\t";
-  ss << 1000 / latency_;
+  ss << 1000.0 / latency_;
   ss << '\n';
   return ss.str();
 }
diff --git a/paddle/fluid/inference/utils/benchmark.h b/paddle/fluid/inference/utils/benchmark.h
index 80e8f77adb4ff2cc81a2a3dd0c44e4e304800122..76a3dd2c2992ebdf2528c539b3d161f558b34a08 100644
--- a/paddle/fluid/inference/utils/benchmark.h
+++ b/paddle/fluid/inference/utils/benchmark.h
@@ -11,9 +11,11 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
+#pragma once
 
 #include <fstream>
 #include <iostream>
+#include <string>
 
 namespace paddle {
 namespace inference {
@@ -31,8 +33,8 @@ struct Benchmark {
   bool use_gpu() const { return use_gpu_; }
   void SetUseGpu() { use_gpu_ = true; }
 
-  int latency() const { return latency_; }
-  void SetLatency(int x) { latency_ = x; }
+  float latency() const { return latency_; }
+  void SetLatency(float x) { latency_ = x; }
 
   const std::string& name() const { return name_; }
   void SetName(const std::string& name) { name_ = name; }
@@ -43,7 +45,7 @@ struct Benchmark {
  private:
   bool use_gpu_{false};
   int batch_size_{0};
-  int latency_;
+  float latency_;
   int num_threads_{1};
   std::string name_;
 };
diff --git a/paddle/fluid/inference/utils/visualizer.cc b/paddle/fluid/inference/utils/visualizer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7c0dd64dea88e51b24c4bc04818d633ee0d2f722
--- /dev/null
+++ b/paddle/fluid/inference/utils/visualizer.cc
@@ -0,0 +1,92 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/utils/visualizer.h"
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+#include <fstream>
+#include <memory>
+#include "paddle/fluid/framework/ir/graph_viz_pass.h"
+#include "paddle/fluid/inference/analysis/analyzer.h"
+#include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h"
+#include "paddle/fluid/platform/init.h"
+
+DEFINE_string(model_dir, "", "model directory");
+DEFINE_string(model_program_path, "", "model program path");
+DEFINE_string(model_params_path, "", "model params path");
+
+using paddle::inference::analysis::Argument;
+
+namespace paddle {
+namespace inference {
+namespace utils {
+
+void Visualizer::SetArgument(Argument *argument) { argument_ = argument; }
+
+bool Visualizer::Run() {
+  paddle::framework::InitDevices(false);
+  paddle::inference::analysis::Analyzer().Run(argument_);
+  return true;
+}
+
+}  // namespace utils
+}  // namespace inference
+}  // namespace paddle
+
+// Generate a dot file describing the structure of graph.
+// To use this tool, run command: ./visualizer [options...]
+// Options:
+//     --model_dir: the directory of model
+//     --model_program_path: the path of program
+//     --model_params_path: the path of params
+int main(int argc, char *argv[]) {
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
+  google::InitGoogleLogging(argv[0]);
+
+  paddle::inference::analysis::Argument argument;
+  argument.SetUseGPU(false);
+  argument.SetUseTensorRT(false);
+
+  if (FLAGS_model_dir.empty()) {
+    if (FLAGS_model_program_path.empty() || FLAGS_model_params_path.empty()) {
+      LOG(ERROR) << "Please set model_dir"
+                    " or model_program_path and model_params_path";
+      return -1;
+    } else {
+      argument.SetModelProgramPath(FLAGS_model_program_path);
+      argument.SetModelParamsPath(FLAGS_model_params_path);
+    }
+  } else {
+    argument.SetModelDir(FLAGS_model_dir);
+  }
+
+  // Only 1 pass, default filename is 0_ir_origin.dot
+  // For more details, looking for paddle::inference::analysis::IRPassManager
+  argument.SetIrAnalysisPasses({"infer_clean_graph_pass", "graph_viz_pass"});
+
+  std::unique_ptr<paddle::framework::Scope> scope{
+      new paddle::framework::Scope()};
+  argument.SetScopeNotOwned(
+      const_cast<paddle::framework::Scope *>(scope.get()));
+
+  paddle::inference::utils::Visualizer visualizer;
+  visualizer.SetArgument(&argument);
+  visualizer.Run();
+
+  return 0;
+}
+
+USE_PASS(infer_clean_graph_pass);
+USE_PASS(graph_viz_pass);
+USE_PASS(graph_to_program_pass);
diff --git a/paddle/fluid/inference/utils/visualizer.h b/paddle/fluid/inference/utils/visualizer.h
new file mode 100644
index 0000000000000000000000000000000000000000..be532f92cf60e06094bfcf8cc2be85085795fcf4
--- /dev/null
+++ b/paddle/fluid/inference/utils/visualizer.h
@@ -0,0 +1,42 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include "paddle/fluid/inference/analysis/argument.h"
+
+namespace paddle {
+namespace inference {
+namespace utils {
+
+using paddle::inference::analysis::Argument;
+
+class Visualizer final {
+ public:
+  Visualizer() = default;
+  ~Visualizer() = default;
+  Visualizer(const Visualizer &) = delete;
+  Visualizer &operator=(const Visualizer &) = delete;
+
+  void SetArgument(Argument *);
+  bool Run();
+
+ private:
+  Argument *argument_;
+};
+
+}  // namespace utils
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/legacy_allocator.cc b/paddle/fluid/memory/allocation/legacy_allocator.cc
index 26e2038a534c18d2b7ab77adf33846803dcffcf5..64aa63ffe9705d75e70c8d9d9cbc433dd6358596 100644
--- a/paddle/fluid/memory/allocation/legacy_allocator.cc
+++ b/paddle/fluid/memory/allocation/legacy_allocator.cc
@@ -14,11 +14,13 @@
 
 #include "paddle/fluid/memory/allocation/legacy_allocator.h"
 #include <string>
+#include <vector>
 #include "glog/logging.h"
 #include "paddle/fluid/memory/detail/buddy_allocator.h"
 #include "paddle/fluid/memory/detail/system_allocator.h"
 #include "paddle/fluid/platform/gpu_info.h"
 #include "paddle/fluid/string/printf.h"
+#include "paddle/fluid/string/split.h"
 
 DEFINE_bool(init_allocated_mem, false,
             "It is a mistake that the values of the memory allocated by "
@@ -86,7 +88,7 @@ struct NaiveAllocator {
 
 template <>
 void *Alloc<platform::CPUPlace>(const platform::CPUPlace &place, size_t size) {
-  VLOG(1) << "Allocate " << size << " bytes on " << platform::Place(place);
+  VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
   void *p = GetCPUBuddyAllocator()->Alloc(size);
   if (FLAGS_init_allocated_mem) {
     memset(p, 0xEF, size);
@@ -97,7 +99,7 @@ void *Alloc<platform::CPUPlace>(const platform::CPUPlace &place, size_t size) {
 
 template <>
 void Free<platform::CPUPlace>(const platform::CPUPlace &place, void *p) {
-  VLOG(1) << "Free pointer=" << p << " on " << platform::Place(place);
+  VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
   GetCPUBuddyAllocator()->Free(p);
 }
 
@@ -110,19 +112,21 @@ size_t Used<platform::CPUPlace>(const platform::CPUPlace &place) {
 BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) {
   static std::once_flag init_flag;
   static detail::BuddyAllocator **a_arr = nullptr;
+  static std::vector<int> devices;
 
   std::call_once(init_flag, [gpu_id]() {
-    int gpu_num = platform::GetCUDADeviceCount();
-    PADDLE_ENFORCE(gpu_id < gpu_num, "gpu_id:%d should < gpu_num:%d", gpu_id,
-                   gpu_num);
+    devices = platform::GetSelectedDevices();
+    int gpu_num = devices.size();
 
     a_arr = new BuddyAllocator *[gpu_num];
-    for (int i = 0; i < gpu_num; i++) {
+    for (size_t i = 0; i < devices.size(); ++i) {
+      int dev_id = devices[i];
       a_arr[i] = nullptr;
-      platform::SetDeviceId(i);
-      a_arr[i] = new BuddyAllocator(
-          std::unique_ptr<detail::SystemAllocator>(new detail::GPUAllocator(i)),
-          platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
+      platform::SetDeviceId(dev_id);
+      a_arr[i] = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
+                                        new detail::GPUAllocator(dev_id)),
+                                    platform::GpuMinChunkSize(),
+                                    platform::GpuMaxChunkSize());
 
       VLOG(10) << "\n\nNOTE: each GPU device use "
                << FLAGS_fraction_of_gpu_memory_to_use * 100
@@ -134,7 +138,9 @@ BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) {
   });
 
   platform::SetDeviceId(gpu_id);
-  return a_arr[gpu_id];
+  auto pos = std::distance(devices.begin(),
+                           std::find(devices.begin(), devices.end(), gpu_id));
+  return a_arr[pos];
 }
 #endif
 
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index de4f23515d8591f28b80ad00322365f8cdce768b..257bfc0a3f926d20abc4647b27e8e9cc2c49e014 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -37,7 +37,13 @@ if (WITH_GPU)
     SET(OP_HEADER_DEPS ${OP_HEADER_DEPS} cub)
 endif()
 
-register_operators(EXCLUDES warpctc_op conv_fusion_op DEPS ${OP_HEADER_DEPS})
+SET(OP_PREFETCH_DEPS "")
+if (WITH_DISTRIBUTE)
+    SET(OP_PREFETCH_DEPS ${OP_PREFETCH_DEPS} parameter_prefetch)
+endif()
+
+register_operators(EXCLUDES warpctc_op conv_fusion_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS})
+
 
 # warpctc_op needs cudnn 7 above
 if (WITH_GPU AND NOT WIN32)
@@ -64,7 +70,7 @@ endif()
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions)
 if (WITH_GPU)
-  set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv)
+  set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu)
 endif()
 
 # FIXME(typhoonzero): operator deps may not needed.
diff --git a/paddle/fluid/operators/activation_mkldnn_op.cc b/paddle/fluid/operators/activation_mkldnn_op.cc
index 64649b1a5e471a30f435e2b1c1a9db03d35dbd8a..e16b6f78d16ce29cc493c4c795c7fe97a4bf2550 100644
--- a/paddle/fluid/operators/activation_mkldnn_op.cc
+++ b/paddle/fluid/operators/activation_mkldnn_op.cc
@@ -100,8 +100,9 @@ void eltwise_forward(const framework::ExecutionContext &ctx,
   const T *x_data = x->data<T>();
   T *y_data = y->mutable_data<T>(ctx.GetPlace());
 
-  PADDLE_ENFORCE(x->dims().size() == 2 || x->dims().size() == 4,
-                 "Input dim must be with 2 or 4");
+  PADDLE_ENFORCE(
+      x->dims().size() == 2 || x->dims().size() == 3 || x->dims().size() == 4,
+      "Input dim must be with 2, 3 or 4");
 
   std::vector<int> src_tz = framework::vectorize2int(x->dims());
 
diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index 832245371e0b1966000ec0252a58ca02193332a7..9c5b8604f40ae56c463b54c71623feb61bd8d297 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -76,8 +76,8 @@ framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx,
   }
 #endif
   return framework::OpKernelType(
-      framework::ToDataType(ctx.Input<framework::Tensor>(name)->type()),
-      ctx.GetPlace(), layout, library);
+      framework::GetDataTypeOfVar(ctx.InputVar(name)), ctx.GetPlace(), layout,
+      library);
 }
 
 class ActivationOp : public framework::OperatorWithKernel {
diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h
index a0f8c5c14c48cb1e2be60b53a2198e30b050b33d..c7df3ea58a91579e35ff0d486516271a6daf054f 100644
--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
@@ -41,6 +41,12 @@ static std::unordered_set<std::string> InplaceOpSet = {
     "floor",   "reciprocal", "relu6", "soft_relu", "hard_sigmoid",
 };
 
+/* The following operator can be used to process SelectedRows, because the
+ * output of those operator for zero is zero too.
+ */
+static std::unordered_set<std::string> CanBeUsedBySelectedRows = {
+    "abs", "abs_grad", "square", "square_grad", "sqrt", "sqrt_grad"};
+
 static bool IsInplace(std::string op) { return InplaceOpSet.count(op); }
 
 template <typename DeviceContext, typename Functor>
@@ -50,16 +56,38 @@ class ActivationKernel
   using T = typename Functor::ELEMENT_TYPE;
 
   void Compute(const framework::ExecutionContext& context) const override {
-    auto& X = detail::Ref(context.Input<framework::Tensor>("X"),
-                          "Cannot get input tensor X, variable name = %s",
-                          context.op().Input("X"));
-
-    auto& Out = detail::Ref(context.Output<framework::Tensor>("Out"),
-                            "Cannot get output tensor Out, variable name = %s",
-                            context.op().Output("Out"));
-    Out.mutable_data<T>(context.GetPlace());
+    auto x_var = context.InputVar("X");
+    auto out_var = context.OutputVar("Out");
+    PADDLE_ENFORCE(x_var != nullptr,
+                   "Cannot get input Variable X, variable name = %s",
+                   context.op().Input("X"));
+    PADDLE_ENFORCE(out_var != nullptr,
+                   "Cannot get output Variable Out, variable name = %s",
+                   context.op().Output("Out"));
+
+    framework::Tensor X, *Out;
+
+    if (CanBeUsedBySelectedRows.count(context.op().Type())) {
+      X = detail::Ref(
+          paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_var),
+          "Cannot get input Tensor X, variable name = %s",
+          context.op().Input("X"));
+      Out = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(
+          out_var);
+    } else {
+      X = detail::Ref(context.Input<framework::Tensor>("X"),
+                      "Cannot get input Tensor X, variable name = %s",
+                      context.op().Input("X"));
+      Out = context.Output<framework::Tensor>("Out");
+    }
+
+    PADDLE_ENFORCE(Out != nullptr,
+                   "Cannot get output tensor Out, variable name = %s",
+                   context.op().Output("Out"));
+
+    Out->mutable_data<T>(context.GetPlace());
     auto x = framework::EigenVector<T>::Flatten(X);
-    auto out = framework::EigenVector<T>::Flatten(Out);
+    auto out = framework::EigenVector<T>::Flatten(*Out);
     auto* place =
         context.template device_context<DeviceContext>().eigen_device();
     Functor functor;
@@ -78,14 +106,54 @@ class ActivationGradKernel
  public:
   using T = typename Functor::ELEMENT_TYPE;
   void Compute(const framework::ExecutionContext& context) const override {
-    auto* Out = context.Input<framework::Tensor>("Out");
-    auto* dOut =
-        context.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* dX = context.Output<framework::Tensor>(framework::GradVarName("X"));
+    auto out_var = context.InputVar("Out");
+    auto out_grad_var = context.InputVar(framework::GradVarName("Out"));
+    auto x_grad_var = context.OutputVar(framework::GradVarName("X"));
+    PADDLE_ENFORCE(out_var != nullptr,
+                   "Cannot get input Variable Out, variable name = %s",
+                   context.op().Input("Out"));
+    PADDLE_ENFORCE(out_grad_var != nullptr,
+                   "Cannot get input Variable %s, variable name = %s",
+                   framework::GradVarName("Out"),
+                   context.op().Input(framework::GradVarName("Out")));
+    PADDLE_ENFORCE(x_grad_var != nullptr,
+                   "Cannot get output Variable %s, variable name = %s",
+                   framework::GradVarName("X"),
+                   context.op().Output(framework::GradVarName("X")));
+
+    framework::Tensor Out, dOut, *dX;
+    if (CanBeUsedBySelectedRows.count(context.op().Type())) {
+      Out = detail::Ref(
+          paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*out_var),
+          "Cannot get input Tensor Out, variable name = %s",
+          context.op().Input("Out"));
+      dOut =
+          detail::Ref(paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(
+                          *out_grad_var),
+                      "Cannot get input Tensor %s, variable name = %s",
+                      framework::GradVarName("Out"),
+                      context.op().Input(framework::GradVarName("Out")));
+      dX = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(
+          x_grad_var);
+    } else {
+      Out = detail::Ref(context.Input<framework::Tensor>("Out"),
+                        "Cannot get input Tensor Out, variable name = %s",
+                        context.op().Input("Out"));
+      dOut = detail::Ref(
+          context.Input<framework::Tensor>(framework::GradVarName("Out")),
+          "Cannot get input Tensor %s, variable name = %s",
+          framework::GradVarName("Out"),
+          context.op().Input(framework::GradVarName("Out")));
+      dX = context.Output<framework::Tensor>(framework::GradVarName("X"));
+    }
+    PADDLE_ENFORCE(dX != nullptr,
+                   "Cannot get output tensor %s, variable name = %s",
+                   framework::GradVarName("X"),
+                   context.op().Output(framework::GradVarName("X")));
     dX->mutable_data<T>(context.GetPlace());
 
-    auto dout = framework::EigenVector<T>::Flatten(*dOut);
-    auto out = framework::EigenVector<T>::Flatten(*Out);
+    auto dout = framework::EigenVector<T>::Flatten(dOut);
+    auto out = framework::EigenVector<T>::Flatten(Out);
     auto dx = framework::EigenVector<T>::Flatten(*dX);
     auto* place =
         context.template device_context<DeviceContext>().eigen_device();
@@ -96,8 +164,19 @@ class ActivationGradKernel
     }
     bool inplace = functor.Inplace();
     if (!inplace) {
-      auto* X = context.Input<framework::Tensor>("X");
-      auto x = framework::EigenVector<T>::Flatten(*X);
+      auto x_var = context.InputVar("X");
+      PADDLE_ENFORCE(x_var != nullptr,
+                     "Cannot get input tensor X, variable name = %s",
+                     context.op().Input("X"));
+      framework::Tensor X;
+      if (CanBeUsedBySelectedRows.count(context.op().Type())) {
+        X = detail::Ref(
+            paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_var));
+      } else {
+        X = detail::Ref(context.Input<framework::Tensor>("X"));
+      }
+
+      auto x = framework::EigenVector<T>::Flatten(X);
       functor(*place, x, out, dout, dx);
     } else {
       VLOG(10) << " Inplace activation ";
@@ -222,23 +301,22 @@ template <typename T>
 struct GeluFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Out>
   void operator()(Device d, X x, Out out) const {
-    auto temp =
-        ((x * static_cast<T>(M_SQRT1_2)).erf()).template cast<T>().eval();
+    auto temp = (x * static_cast<T>(M_SQRT1_2)).erf();
     out.device(d) = x * static_cast<T>(0.5) * (static_cast<T>(1) + temp);
   }
 };
 
 template <typename T>
 struct GeluGradFunctor : BaseActivationFunctor<T> {
-  bool Inplace() const { return IsInplace("gelu"); }
   template <typename Device, typename X, typename Out, typename dOut,
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    auto temp = (static_cast<T>(0.5 * M_2_SQRTPI * M_SQRT1_2) * x *
-                 ((-static_cast<T>(0.5) * x.square()).exp()))
-                    .template cast<T>()
-                    .eval();
-    dx.device(d) = dout * (out / x + temp);
+    auto first = static_cast<T>(0.5) *
+                 (static_cast<T>(1) + ((x * static_cast<T>(M_SQRT1_2)).erf()));
+
+    auto second = static_cast<T>(0.5 * M_2_SQRTPI * M_SQRT1_2) * x *
+                  (-static_cast<T>(0.5) * x.square()).exp();
+    dx.device(d) = dout * (first + second);
   }
 };
 
diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc
index 9b943440a869e213db4ed761cfe7c508bc5e94ae..75fc59125f21901b6781315eb3d7dba36b7f11f2 100644
--- a/paddle/fluid/operators/attention_lstm_op.cc
+++ b/paddle/fluid/operators/attention_lstm_op.cc
@@ -231,10 +231,10 @@ use lstm_x_t as input and compute as standard LSTM.
 template <typename T>
 inline void bias_relu(const int n, const T* x, const T* bias, T* y) {
   if (bias) {
-    math::vec_add_bias<T, platform::jit::avx>(n, *bias, x, y);
-    math::vec_relu<T, platform::jit::avx>(n, y, y);
+    math::vec_add_bias<T, platform::avx>(n, *bias, x, y);
+    math::vec_relu<T, platform::avx>(n, y, y);
   } else {
-    math::vec_relu<T, platform::jit::avx>(n, x, y);
+    math::vec_relu<T, platform::avx>(n, x, y);
   }
 }
 
@@ -245,8 +245,8 @@ inline void vec_softmax(const int n, const T* x, T* y) {
   for (int i = 1; i < n; ++i) {
     scalar = scalar < x[i] ? x[i] : scalar;
   }
-  math::vec_add_bias<T, platform::jit::avx>(n, -scalar, x, y);  // sub
-  math::vec_exp<T>(n, y, y);                                    // exp
+  math::vec_add_bias<T, platform::avx>(n, -scalar, x, y);  // sub
+  math::vec_exp<T>(n, y, y);                               // exp
   // sum
   scalar = T(0);
   for (int i = 0; i < n; ++i) {
@@ -302,13 +302,13 @@ class AttentionLSTMKernel : public framework::OpKernel<T> {
     auto& act_gate_str = ctx.Attr<std::string>("gate_activation");
     auto& act_cell_str = ctx.Attr<std::string>("cell_activation");
     auto& act_cand_str = ctx.Attr<std::string>("candidate_activation");
-    if (platform::jit::MayIUse(platform::jit::avx)) {
-      math::VecActivations<T, platform::jit::avx> act_functor;
+    if (platform::MayIUse(platform::avx)) {
+      math::VecActivations<T, platform::avx> act_functor;
       act_gate = act_functor(act_gate_str);
       act_cell = act_functor(act_cell_str);
       act_cand = act_functor(act_cand_str);
     } else {
-      math::VecActivations<T, platform::jit::isa_any> act_functor;
+      math::VecActivations<T, platform::isa_any> act_functor;
       act_gate = act_functor(act_gate_str);
       act_cell = act_functor(act_cell_str);
       act_cand = act_functor(act_cand_str);
diff --git a/paddle/fluid/operators/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/batch_norm_mkldnn_op.cc
index de641cb08e4cc3322cc8387d873f2aaab279e1dd..bddca232e6c8a2a7fde998877006e37ee6d3d0dc 100644
--- a/paddle/fluid/operators/batch_norm_mkldnn_op.cc
+++ b/paddle/fluid/operators/batch_norm_mkldnn_op.cc
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #include "mkldnn.hpp"
 #include "paddle/fluid/operators/batch_norm_op.h"
-#include "paddle/fluid/platform/mkldnn_helper.h"
+#include "paddle/fluid/platform/mkldnn_reuse.h"
 
 namespace paddle {
 namespace operators {
@@ -146,7 +146,9 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     const float epsilon = ctx.Attr<float>("epsilon");
     const float momentum = ctx.Attr<float>("momentum");
     const bool is_test = ctx.Attr<bool>("is_test");
+    const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
     const bool fuse_with_relu = ctx.Attr<bool>("fuse_with_relu");
+    bool global_stats = is_test || use_global_stats;
 
     const auto *x = ctx.Input<Tensor>("X");
     const auto *mean = ctx.Input<Tensor>("Mean");
@@ -177,13 +179,14 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     T *batch_mean_data = nullptr;
     T *batch_variance_data = nullptr;
 
-    if (!is_test) {
+    if (!global_stats) {
       batch_mean_data = batch_mean->mutable_data<T>(ctx.GetPlace());
       batch_variance_data = batch_variance->mutable_data<T>(ctx.GetPlace());
     }
 
-    auto propagation = is_test == true ? mkldnn::prop_kind::forward_scoring
-                                       : mkldnn::prop_kind::forward_training;
+    auto propagation = global_stats == true
+                           ? mkldnn::prop_kind::forward_scoring
+                           : mkldnn::prop_kind::forward_training;
 
     auto src_tz = paddle::framework::vectorize2int(x->dims());
     auto scale_tz = paddle::framework::vectorize2int(scale->dims());
@@ -199,7 +202,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
                     shift->data<T>() + ic, &scaleshift_data);
 
     unsigned flags = mkldnn::use_scale_shift;
-    if (is_test) flags |= mkldnn::use_global_stats;
+    if (global_stats) flags |= mkldnn::use_global_stats;
     if (fuse_with_relu) flags |= mkldnn::fuse_bn_relu;
 
     // create mkldnn memory from input x tensor
@@ -208,7 +211,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
 
     // keys for backward pass
     const std::string key = BatchNormMKLDNNHandler::GetHash(
-        src_tz, epsilon, flags, is_test, input_format,
+        src_tz, epsilon, flags, global_stats, input_format,
         ctx.op().Output("SavedMean"));
     const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd";
 
@@ -239,7 +242,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
         batch_norm_fwd_pd->dst_primitive_desc().desc(), y_data);
 
     std::shared_ptr<batch_norm_fwd> batch_norm_p;
-    if (is_test) {
+    if (global_stats) {
       // create mkldnn memory for stats (as input)
       std::shared_ptr<memory> mean_memory =
           handler.AcquireMeanMemoryFromPrimitive(to_void_cast(mean_data));
@@ -269,7 +272,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     pipeline.push_back(*batch_norm_p);
     mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
 
-    if (!is_test) {
+    if (!global_stats) {
       // mkldnn only compute stats for current batch
       // so we need compute momentum stats via Eigen lib
       EigenVectorArrayMap<T> batch_mean_e(batch_mean_data, ic);
diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc
index 2463c939bc5d19500ba36ba3c73db176bb82c62a..f66813989c64737a4b41e3f653d9ca654be72dd6 100644
--- a/paddle/fluid/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
@@ -159,6 +159,14 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<bool>("fuse_with_relu",
                   "(bool, default false) Only used in mkldnn kernel")
         .SetDefault(false);
+    AddAttr<bool>("use_global_stats",
+                  "(bool, default false) Whether to use global mean and "
+                  "variance. In inference or test mode, set use_global_stats "
+                  "to true or is_test true. the behavior is equivalent. "
+                  "In train mode, when setting use_global_stats True, the "
+                  "global mean and variance are also used during train time, "
+                  "the BN acts as scaling and shiffting.")
+        .SetDefault(false);
     AddComment(R"DOC(
 Batch Normalization.
 
@@ -190,6 +198,10 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
     const float epsilon = ctx.Attr<float>("epsilon");
     const float momentum = ctx.Attr<float>("momentum");
     const bool is_test = ctx.Attr<bool>("is_test");
+    const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
+
+    bool global_stats = is_test || use_global_stats;
+
     const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
     const DataLayout data_layout =
         framework::StringToDataLayout(data_layout_str);
@@ -217,7 +229,7 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
     saved_mean->mutable_data<T>(ctx.GetPlace());
     saved_variance->mutable_data<T>(ctx.GetPlace());
 
-    if (!is_test) {
+    if (!global_stats) {
       // saved_xx is use just in this batch of data
       EigenVectorArrayMap<T> saved_mean_e(
           saved_mean->mutable_data<T>(ctx.GetPlace()), C);
@@ -234,7 +246,7 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
       if ((N * sample_size) == 1) {
         LOG(WARNING) << "Only 1 element in normalization dimension, "
                      << "we skip the batch norm calculation, let y = x.";
-        framework::TensorCopySync(*x, ctx.GetPlace(), y);
+        framework::TensorCopy(*x, ctx.GetPlace(), y);
         return;
       }
 
@@ -277,7 +289,7 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
 
     // use SavedMean and SavedVariance to do normalize
     Eigen::Array<T, Eigen::Dynamic, 1> inv_std(C);
-    if (is_test) {
+    if (global_stats) {
       ConstEigenVectorArrayMap<T> var_arr(
           ctx.Input<Tensor>("Variance")->data<T>(), C);
       inv_std = (var_arr + epsilon).sqrt().inverse();
@@ -289,8 +301,8 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
       inv_std = saved_inv_std;
     }
     ConstEigenVectorArrayMap<T> mean_arr(
-        is_test ? ctx.Input<Tensor>("Mean")->data<T>()
-                : ctx.Output<Tensor>("SavedMean")->data<T>(),
+        global_stats ? ctx.Input<Tensor>("Mean")->data<T>()
+                     : ctx.Output<Tensor>("SavedMean")->data<T>(),
         C);
 
     //   ((x - est_mean) * (inv_var) * scale + bias
@@ -336,15 +348,27 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
   void InferShape(framework::InferShapeContext *ctx) const override {
     // check input
     PADDLE_ENFORCE(ctx->HasInput("X"));
-    PADDLE_ENFORCE(ctx->HasInput("Scale"), "");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), "");
-    PADDLE_ENFORCE(ctx->HasInput("SavedMean"), "");
-    PADDLE_ENFORCE(ctx->HasInput("SavedVariance"), "");
+    PADDLE_ENFORCE(ctx->HasInput("Scale"), "Input(scale) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
+                   "Input(Y@GRAD) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("SavedMean"),
+                   "Input(SavedMean) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("SavedVariance"),
+                   "Input(SavedVariance) should not be null");
 
     // check output
     PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), "");
-    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Scale")), "");
-    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Bias")), "");
+    if (ctx->HasOutput(framework::GradVarName("Scale"))) {
+      PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Bias")),
+                     "Output(Scale@GRAD) and Output(Bias@GRAD) should not be "
+                     "null at same time");
+    }
+    const bool use_global_stats = ctx->Attrs().Get<bool>("use_global_stats");
+    if (use_global_stats) {
+      PADDLE_ENFORCE(!ctx->Attrs().Get<bool>("use_mkldnn"),
+                     "Using global stats during training is not supported "
+                     "in gradient op kernel of batch_norm_mkldnn_op now.");
+    }
 
     const auto x_dims = ctx->GetInputDim("X");
     const DataLayout data_layout = framework::StringToDataLayout(
@@ -354,8 +378,10 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
                                           : x_dims[x_dims.size() - 1]);
 
     ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
-    ctx->SetOutputDim(framework::GradVarName("Scale"), {C});
-    ctx->SetOutputDim(framework::GradVarName("Bias"), {C});
+    if (ctx->HasOutput(framework::GradVarName("Scale"))) {
+      ctx->SetOutputDim(framework::GradVarName("Scale"), {C});
+      ctx->SetOutputDim(framework::GradVarName("Bias"), {C});
+    }
   }
 
  protected:
@@ -405,6 +431,8 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
     // SavedVariance have been reverted in forward operator
     const auto *saved_inv_variance = ctx.Input<Tensor>("SavedVariance");
     const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
+    const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
+    const float epsilon = ctx.Attr<float>("epsilon");
     const DataLayout data_layout =
         framework::StringToDataLayout(data_layout_str);
 
@@ -419,38 +447,60 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
                                           : x_dims[x_dims.size() - 1]);
     const int sample_size = x->numel() / N / C;
 
-    ConstEigenVectorArrayMap<T> scale_arr(scale->data<T>(), C);
-    ConstEigenVectorArrayMap<T> mean_arr(saved_mean->data<T>(), C);
-    ConstEigenVectorArrayMap<T> inv_var_arr(saved_inv_variance->data<T>(), C);
-
     // init output
     auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
     auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
 
     d_x->mutable_data<T>(ctx.GetPlace());
-    d_scale->mutable_data<T>(ctx.GetPlace());
-    d_bias->mutable_data<T>(ctx.GetPlace());
+
+    const T *mean_data = saved_mean->data<T>();
+    const T *inv_var_data = saved_inv_variance->data<T>();
+    Tensor inv_var_tensor;
+    if (use_global_stats) {
+      const auto *running_mean = ctx.Input<Tensor>("Mean");
+      const auto *running_variance = ctx.Input<Tensor>("Variance");
+      mean_data = running_mean->data<T>();
+      T *running_inv_var_data = inv_var_tensor.mutable_data<T>(ctx.GetPlace());
+      EigenVectorArrayMap<T> inv_var_tmp(running_inv_var_data, C);
+      ConstEigenVectorArrayMap<T> var_arr(running_variance->data<T>(), C);
+
+      inv_var_tmp = (var_arr + epsilon).sqrt().inverse().eval();
+      inv_var_data = running_inv_var_data;
+    }
+
+    ConstEigenVectorArrayMap<T> scale_arr(scale->data<T>(), C);
+    ConstEigenVectorArrayMap<T> mean_arr(mean_data, C);
+    ConstEigenVectorArrayMap<T> inv_var_arr(inv_var_data, C);
+
+    T *d_bias_data = nullptr;
+    T *d_scale_data = nullptr;
+    if (d_scale && d_bias) {
+      d_scale->mutable_data<T>(ctx.GetPlace());
+      d_bias->mutable_data<T>(ctx.GetPlace());
+      d_bias_data = d_bias->mutable_data<T>(ctx.GetPlace());
+      d_scale_data = d_scale->mutable_data<T>(ctx.GetPlace());
+    }
 
     // d_bias = np.sum(d_y, axis=0)
     // d_scale = np.sum((X - mean) / inv_std * dy, axis=0)
     // d_x = (1. / N) * scale * inv_var * (N * d_y - np.sum(d_y, axis=0)
     //   - (X - mean) * inv_var * inv_var * np.sum(d_y * (X - mean), axis=0))
+    EigenVectorArrayMap<T> d_bias_arr(d_bias_data, C);
+    EigenVectorArrayMap<T> d_scale_arr(d_scale_data, C);
 
-    EigenVectorArrayMap<T> d_bias_arr(d_bias->mutable_data<T>(ctx.GetPlace()),
-                                      C);
-    EigenVectorArrayMap<T> d_scale_arr(d_scale->mutable_data<T>(ctx.GetPlace()),
-                                       C);
-
-    d_bias_arr.setZero();
-    d_scale_arr.setZero();
+    if (d_scale && d_bias) {
+      d_bias_arr.setZero();
+      d_scale_arr.setZero();
+    }
 
-    if ((N * sample_size) == 1) {
-      framework::TensorCopySync(*d_y, ctx.GetPlace(), d_x);
+    if ((N * sample_size) == 1 && !use_global_stats) {
+      framework::TensorCopy(*d_y, ctx.GetPlace(), d_x);
       return;
     }
 
-    const auto scale_inv_var_nhw = scale_arr * inv_var_arr / (N * sample_size);
+    int scale_coefff = use_global_stats ? 1 : N * sample_size;
+    const auto scale_inv_var_nhw = scale_arr * inv_var_arr / scale_coefff;
 
     switch (data_layout) {
       case DataLayout::kNCHW: {
@@ -460,19 +510,29 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
                                  sample_size, N * C);
         d_x_arr.setZero();
 
-        for (int nc = 0; nc < N * C; ++nc) {
-          int c = nc % C;
-          d_bias_arr(c) += d_y_arr.col(nc).sum();
-          d_scale_arr(c) +=
-              ((x_arr.col(nc) - mean_arr(c)) * inv_var_arr(c) * d_y_arr.col(nc))
-                  .sum();
+        if (d_scale && d_bias) {
+          for (int nc = 0; nc < N * C; ++nc) {
+            int c = nc % C;
+            d_bias_arr(c) += d_y_arr.col(nc).sum();
+            d_scale_arr(c) += ((x_arr.col(nc) - mean_arr(c)) * inv_var_arr(c) *
+                               d_y_arr.col(nc))
+                                  .sum();
+          }
         }
-        for (int nc = 0; nc < N * C; ++nc) {
-          int c = nc % C;
-          d_x_arr.col(nc) +=
-              scale_inv_var_nhw(c) *
-              (d_y_arr.col(nc) * N * sample_size - d_bias_arr(c) -
-               (x_arr.col(nc) - mean_arr[c]) * d_scale_arr(c) * inv_var_arr(c));
+        if (!use_global_stats) {
+          for (int nc = 0; nc < N * C; ++nc) {
+            int c = nc % C;
+            d_x_arr.col(nc) +=
+                scale_inv_var_nhw(c) *
+                (d_y_arr.col(nc) * N * sample_size - d_bias_arr(c) -
+                 (x_arr.col(nc) - mean_arr[c]) * d_scale_arr(c) *
+                     inv_var_arr(c));
+          }
+        } else {
+          for (int nc = 0; nc < N * C; ++nc) {
+            int c = nc % C;
+            d_x_arr.col(nc) += scale_inv_var_nhw(c) * d_y_arr.col(nc);
+          }
         }
         break;
       }
@@ -488,15 +548,27 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
         const auto d_y_mul_x_minus_mean_row_sum =
             (d_y_arr * x_minus_mean).rowwise().sum();
         const auto inv_var_sqr = inv_var_arr * inv_var_arr;
-        for (int nhw = 0; nhw < N * sample_size; ++nhw) {
-          d_bias_arr += d_y_arr.col(nhw);
-          d_scale_arr +=
-              (x_arr.col(nhw) - mean_arr) * inv_var_arr * d_y_arr.col(nhw);
-          d_x_arr.col(nhw) +=
-              scale_inv_var_nhw *
-              (d_y_arr.col(nhw) * N * sample_size - d_y_row_sum -
-               x_minus_mean.col(nhw) * inv_var_sqr *
-                   d_y_mul_x_minus_mean_row_sum);
+
+        if (d_scale && d_bias) {
+          for (int nhw = 0; nhw < N * sample_size; ++nhw) {
+            d_bias_arr += d_y_arr.col(nhw);
+            d_scale_arr +=
+                (x_arr.col(nhw) - mean_arr) * inv_var_arr * d_y_arr.col(nhw);
+          }
+        }
+
+        if (!use_global_stats) {
+          for (int nhw = 0; nhw < N * sample_size; ++nhw) {
+            d_x_arr.col(nhw) +=
+                scale_inv_var_nhw *
+                (d_y_arr.col(nhw) * N * sample_size - d_y_row_sum -
+                 x_minus_mean.col(nhw) * inv_var_sqr *
+                     d_y_mul_x_minus_mean_row_sum);
+          }
+        } else {
+          for (int nhw = 0; nhw < N * sample_size; ++nhw) {
+            d_x_arr.col(nhw) += scale_inv_var_nhw * d_y_arr.col(nhw);
+          }
         }
         break;
       }
@@ -522,6 +594,10 @@ class BatchNormGradMaker : public framework::SingleGradOpDescMaker {
     op->SetInput("SavedMean", Output("SavedMean"));
     op->SetInput("SavedVariance", Output("SavedVariance"));
 
+    // used when setting use_global_stats True during training
+    op->SetInput("Mean", Output("MeanOut"));
+    op->SetInput("Variance", Output("VarianceOut"));
+
     op->SetAttrMap(Attrs());
 
     op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
diff --git a/paddle/fluid/operators/batch_norm_op.cu.cc b/paddle/fluid/operators/batch_norm_op.cu
similarity index 57%
rename from paddle/fluid/operators/batch_norm_op.cu.cc
rename to paddle/fluid/operators/batch_norm_op.cu
index aaed335c905c0d80cd519afc5fecb06af73fcfe7..1c45746a92ad057a97d9f65aa256df616fc37f3d 100644
--- a/paddle/fluid/operators/batch_norm_op.cu.cc
+++ b/paddle/fluid/operators/batch_norm_op.cu
@@ -12,9 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/batch_norm_op.h"
+#include <algorithm>
 #include <cfloat>
+#include <string>
+#include <vector>
+#include "cub/cub.cuh"
 #include "paddle/fluid/framework/data_layout.h"
+#include "paddle/fluid/operators/batch_norm_op.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/cudnn_helper.h"
 #include "paddle/fluid/platform/float16.h"
@@ -59,6 +63,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
     double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
     const float momentum = ctx.Attr<float>("momentum");
     const bool is_test = ctx.Attr<bool>("is_test");
+    const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
     const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
     const DataLayout data_layout =
         framework::StringToDataLayout(data_layout_str);
@@ -121,7 +126,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
     auto handle = dev_ctx.cudnn_handle();
 
     // Now, depending on whether we are running test or not, we have two paths.
-    if (is_test) {
+    if (is_test || use_global_stats) {
       // only when test we use input to do computation.
       const auto *est_mean = ctx.Input<Tensor>("Mean");
       const auto *est_var = ctx.Input<Tensor>("Variance");
@@ -163,7 +168,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
       if ((N * H * W * D) == 1) {
         LOG(WARNING) << "Only 1 element in normalization dimension, "
                      << "we skip the batch norm calculation, let y = x.";
-        framework::TensorCopySync(*x, ctx.GetPlace(), y);
+        framework::TensorCopy(*x, ctx.GetPlace(), y);
       } else {
         double this_factor = 1. - momentum;
 
@@ -191,6 +196,58 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
   }
 };
 
+template <typename T, framework::DataLayout layout>
+static __global__ void KeBNBackwardData(const T *dy,
+                                        const BatchNormParamType<T> *scale,
+                                        const BatchNormParamType<T> *variance,
+                                        const double epsilon, const int C,
+                                        const int HxW, const int num, T *dx) {
+  int gid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  for (int i = gid; i < num; i += stride) {
+    const int c = layout == framework::DataLayout::kNCHW ? i / HxW % C : i % C;
+    BatchNormParamType<T> inv_var = 1.0 / sqrt(variance[c] + epsilon);
+    dx[i] = static_cast<T>(static_cast<BatchNormParamType<T>>(dy[i]) *
+                           scale[c] * inv_var);
+  }
+}
+
+template <typename T, int BlockDim, framework::DataLayout layout>
+static __global__ void KeBNBackwardScaleBias(
+    const T *dy, const T *x, const BatchNormParamType<T> *mean,
+    const BatchNormParamType<T> *variance, const double epsilon, const int N,
+    const int C, const int HxW, BatchNormParamType<T> *dscale,
+    BatchNormParamType<T> *dbias) {
+  const int outer_size = C;
+  const int inner_size = N * HxW;
+  typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage ds_storage;
+  __shared__ typename BlockReduce::TempStorage db_storage;
+
+  for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
+    BatchNormParamType<T> ds_sum = static_cast<BatchNormParamType<T>>(0);
+    BatchNormParamType<T> db_sum = static_cast<BatchNormParamType<T>>(0);
+
+    BatchNormParamType<T> inv_var_i = 1.0 / sqrt(variance[i] + epsilon);
+    BatchNormParamType<T> mean_i = mean[i];
+    for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
+      const int index = layout == framework::DataLayout::kNCHW
+                            ? (j / HxW * C + i) * HxW + j % HxW
+                            : j * outer_size + i;
+      ds_sum += static_cast<BatchNormParamType<T>>(dy[index]) *
+                (static_cast<BatchNormParamType<T>>(x[index]) - mean_i);
+      db_sum += static_cast<BatchNormParamType<T>>(dy[index]);
+    }
+    ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum());
+    db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum());
+    if (threadIdx.x == 0) {
+      dscale[i] = ds_sum * inv_var_i;
+      dbias[i] = db_sum;
+    }
+    __syncthreads();
+  }
+}
+
 template <typename T>
 class BatchNormGradKernel<platform::CUDADeviceContext, T>
     : public framework::OpKernel<T> {
@@ -200,6 +257,8 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
                    "It must use CUDAPlace.");
     double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
     const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
+    const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
+
     const DataLayout data_layout =
         framework::StringToDataLayout(data_layout_str);
     const auto *x = ctx.Input<Tensor>("X");
@@ -219,42 +278,13 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
     auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
 
     d_x->mutable_data<T>(ctx.GetPlace());
-    d_scale->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-    d_bias->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-
-    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    if ((N * H * W * D) == 1) {
-      framework::TensorCopySync(*d_y, ctx.GetPlace(), d_x);
-      math::SetConstant<platform::CUDADeviceContext, BatchNormParamType<T>>
-          functor;
-      functor(dev_ctx, d_scale, static_cast<BatchNormParamType<T>>(0));
-      functor(dev_ctx, d_bias, static_cast<BatchNormParamType<T>>(0));
-      return;
+    if (d_scale && d_bias) {
+      d_scale->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+      d_bias->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
     }
-
     PADDLE_ENFORCE_EQ(scale->dims().size(), 1UL);
     PADDLE_ENFORCE_EQ(scale->dims()[0], C);
 
-    // ------------------- cudnn descriptors ---------------------
-    cudnnTensorDescriptor_t data_desc_;
-    cudnnTensorDescriptor_t bn_param_desc_;
-    cudnnBatchNormMode_t mode_;
-
-    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
-    CUDNN_ENFORCE(
-        platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
-    if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
-      LOG(ERROR) << "Provided epsilon is smaller than "
-                 << "CUDNN_BN_MIN_EPSILON. Setting it to "
-                 << "CUDNN_BN_MIN_EPSILON instead.";
-    }
-    epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
-#if CUDNN_VERSION_MIN(7, 0, 0)
-    mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
-#else
-    mode_ = CUDNN_BATCHNORM_SPATIAL;
-#endif
-
     std::vector<int> dims;
     std::vector<int> strides;
     if (data_layout == DataLayout::kNCHW) {
@@ -264,34 +294,114 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
       dims = {N, C, H, W, D};
       strides = {H * W * C * D, 1, W * D * C, D * C, C};
     }
-    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
-        data_desc_, CudnnDataType<T>::type,
-        x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
-    CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor(
-        bn_param_desc_, data_desc_, mode_));
-
-    const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
-    const auto *saved_var = ctx.Input<Tensor>("SavedVariance");
-    const void *saved_mean_data =
-        saved_mean->template data<BatchNormParamType<T>>();
-    const void *saved_var_data =
-        saved_var->template data<BatchNormParamType<T>>();
-
-    CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationBackward(
-        dev_ctx.cudnn_handle(), mode_, CudnnDataType<T>::kOne(),
-        CudnnDataType<T>::kZero(), CudnnDataType<T>::kOne(),
-        CudnnDataType<T>::kZero(), data_desc_, x->template data<T>(),
-        data_desc_, d_y->template data<T>(), data_desc_,
-        d_x->template mutable_data<T>(ctx.GetPlace()), bn_param_desc_,
-        scale->template data<BatchNormParamType<T>>(),
-        d_scale->template mutable_data<BatchNormParamType<T>>(ctx.GetPlace()),
-        d_bias->template mutable_data<BatchNormParamType<T>>(ctx.GetPlace()),
-        epsilon, saved_mean_data, saved_var_data));
 
-    // clean when exit.
-    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
-    CUDNN_ENFORCE(
-        platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
+    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    if (!use_global_stats) {
+      if ((N * H * W * D) == 1) {
+        framework::TensorCopy(*d_y, ctx.GetPlace(), d_x);
+        math::SetConstant<platform::CUDADeviceContext, BatchNormParamType<T>>
+            functor;
+        functor(dev_ctx, d_scale, static_cast<BatchNormParamType<T>>(0));
+        functor(dev_ctx, d_bias, static_cast<BatchNormParamType<T>>(0));
+        return;
+      }
+
+      // ------------------- cudnn descriptors ---------------------
+      cudnnTensorDescriptor_t data_desc_;
+      cudnnTensorDescriptor_t bn_param_desc_;
+      cudnnBatchNormMode_t mode_;
+
+      CUDNN_ENFORCE(
+          platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
+      CUDNN_ENFORCE(
+          platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
+      if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
+        LOG(ERROR) << "Provided epsilon is smaller than "
+                   << "CUDNN_BN_MIN_EPSILON. Setting it to "
+                   << "CUDNN_BN_MIN_EPSILON instead.";
+      }
+      epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
+#if CUDNN_VERSION_MIN(7, 0, 0)
+      mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
+#else
+      mode_ = CUDNN_BATCHNORM_SPATIAL;
+#endif
+
+      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+          data_desc_, CudnnDataType<T>::type,
+          x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
+      CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor(
+          bn_param_desc_, data_desc_, mode_));
+
+      const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
+      const auto *saved_var = ctx.Input<Tensor>("SavedVariance");
+      const void *saved_mean_data =
+          saved_mean->template data<BatchNormParamType<T>>();
+      const void *saved_var_data =
+          saved_var->template data<BatchNormParamType<T>>();
+
+      CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationBackward(
+          dev_ctx.cudnn_handle(), mode_, CudnnDataType<T>::kOne(),
+          CudnnDataType<T>::kZero(), CudnnDataType<T>::kOne(),
+          CudnnDataType<T>::kZero(), data_desc_, x->template data<T>(),
+          data_desc_, d_y->template data<T>(), data_desc_,
+          d_x->template mutable_data<T>(ctx.GetPlace()), bn_param_desc_,
+          scale->template data<BatchNormParamType<T>>(),
+          d_scale->template mutable_data<BatchNormParamType<T>>(ctx.GetPlace()),
+          d_bias->template mutable_data<BatchNormParamType<T>>(ctx.GetPlace()),
+          epsilon, saved_mean_data, saved_var_data));
+
+      // clean when exit.
+      CUDNN_ENFORCE(
+          platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
+      CUDNN_ENFORCE(
+          platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
+    } else {
+      const auto *running_mean = ctx.Input<Tensor>("Mean");
+      const auto *running_var = ctx.Input<Tensor>("Variance");
+
+      const auto *running_mean_data =
+          running_mean->template data<BatchNormParamType<T>>();
+      const auto *running_var_data =
+          running_var->template data<BatchNormParamType<T>>();
+
+      const int num = x->numel();
+      const int block = 512;
+      int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
+      const int max_blocks = std::max(max_threads / block, 1);
+      int grid1 = (num + block - 1) / block;
+      int grid2 = std::min(C, max_blocks);
+
+      if (data_layout == framework::DataLayout::kNCHW) {
+        if (d_x) {
+          KeBNBackwardData<T, framework::DataLayout::kNCHW><<<
+              grid1, block, 0, dev_ctx.stream()>>>(
+              d_y->data<T>(), scale->data<BatchNormParamType<T>>(),
+              running_var_data, epsilon, C, H * W, num, d_x->data<T>());
+        }
+        if (d_scale && d_bias) {
+          KeBNBackwardScaleBias<T, block, framework::DataLayout::kNCHW><<<
+              grid2, block, 0, dev_ctx.stream()>>>(
+              d_y->data<T>(), x->data<T>(), running_mean_data, running_var_data,
+              epsilon, C, H * W, num, d_scale->data<BatchNormParamType<T>>(),
+              d_bias->data<BatchNormParamType<T>>());
+        }
+      } else {
+        if (d_x) {
+          KeBNBackwardData<T, framework::DataLayout::kNHWC><<<
+              grid1, block, 0, dev_ctx.stream()>>>(
+              d_y->data<T>(), scale->data<BatchNormParamType<T>>(),
+              running_var_data, epsilon, C, H * W, num, d_x->data<T>());
+        }
+        if (d_scale && d_bias) {
+          KeBNBackwardScaleBias<T, block, framework::DataLayout::kNCHW><<<
+              grid2, block, 0, dev_ctx.stream()>>>(
+              d_y->data<T>(), x->data<T>(), running_mean_data, running_var_data,
+              epsilon, C, H * W, num, d_scale->data<BatchNormParamType<T>>(),
+              d_bias->data<BatchNormParamType<T>>());
+        }
+      }
+    }
   }
 };
 
diff --git a/paddle/fluid/operators/bilinear_tensor_product_op.cu b/paddle/fluid/operators/bilinear_tensor_product_op.cu
index 9426ffbe174c7daf9f24525f5f7ca12d986042f4..c2b4f69e6854522b91dfd9fb5f738c0e5ffc77b1 100644
--- a/paddle/fluid/operators/bilinear_tensor_product_op.cu
+++ b/paddle/fluid/operators/bilinear_tensor_product_op.cu
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/bilinear_tensor_product_op.h"
 
 namespace ops = paddle::operators;
diff --git a/paddle/fluid/operators/bpr_loss_op.cc b/paddle/fluid/operators/bpr_loss_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9258d7c7e83122149c7cbc42e4a4bdd84903ce67
--- /dev/null
+++ b/paddle/fluid/operators/bpr_loss_op.cc
@@ -0,0 +1,145 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/bpr_loss_op.h"
+
+namespace paddle {
+namespace operators {
+
+class BprLossOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) should be not null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto label_dims = ctx->GetInputDim("Label");
+    int rank = x_dims.size();
+    PADDLE_ENFORCE_EQ(rank, label_dims.size(),
+                      "Input(X) and Input(Label) shall have the same rank.");
+    PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1),
+                      framework::slice_ddim(label_dims, 0, rank - 1),
+                      "Input(X) and Input(Label) shall have the same shape "
+                      "except the last dimension.");
+
+    auto y_dims = x_dims;
+    y_dims[rank - 1] = 1;
+    ctx->SetOutputDim("Y", y_dims);
+    ctx->ShareLoD("X", /*->*/ "Y");
+  }
+
+ protected:
+  // Explicitly set that the data type of computation kernel of Seq-bpr
+  // is determined by its input "X".
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
+        platform::CPUPlace());
+  }
+};
+
+class BprLossGradientOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
+                   "Input(Y@GRAD) shoudl be not null.");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
+                   "Output(X@GRAD) should be not null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto label_dims = ctx->GetInputDim("Label");
+    auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y"));
+    int rank = x_dims.size();
+    PADDLE_ENFORCE_EQ(dy_dims.size(), rank,
+                      "Input(Y@Grad) and Input(X) should have the same rank.");
+    PADDLE_ENFORCE_EQ(label_dims.size(), rank,
+                      "Input(Label) and Input(X) should have the same rank.");
+    PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1),
+                      framework::slice_ddim(label_dims, 0, rank - 1),
+                      "The Input(X) and Input(Label) should have the same "
+                      "shape except the last dimension.");
+    PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1),
+                      framework::slice_ddim(dy_dims, 0, rank - 1),
+                      "The Input(X) and Input(Y@Grad) should have the same "
+                      "shape except the last dimension.");
+    PADDLE_ENFORCE_EQ(dy_dims[rank - 1], 1,
+                      "The last dimension of Input(Y@Grad) should be 1.");
+    PADDLE_ENFORCE_EQ(label_dims[rank - 1], 1,
+                      " the last dimension of Input(Label) should be 1.");
+    ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
+    ctx->ShareLoD("X", framework::GradVarName("X"));
+  }
+
+ protected:
+  // Explicitly set that the data type of computation kernel of cross_entropy
+  // is determined by its input "X".
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
+        platform::CPUPlace());
+  }
+};
+
+class BprLossOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "(Tensor, default Tensor<float>), a tensor whose last dimension "
+             "size is equal to the number of classes. This input is a "
+             "real number.");
+    AddInput(
+        "Label",
+        "(Tensor), the tensor which represents the ground truth. It has the "
+        "same shape with 'X' except the last dimension. the last dimension "
+        "size is 1.");
+    AddOutput("Y",
+              "(Tensor, default Tensor<float>), a tensor whose shape is same "
+              "with 'X' except that the last dimension size is 1. It "
+              "represents the sequence bpr loss.");
+    AddComment(R"DOC(
+Bayesian Personalized Ranking Loss Operator.
+
+This operator belongs to pairwise ranking loss. Label is the desired item.
+The loss at a given point in one session is defined as:
+$Y[i] = -\frac{1}{N_{i}} * \sum_{j=0}^{N_{i}}\log(\sigma(X[i, Label[i]]-X[i, j]))$
+
+Learn more details by reading paper <session-based recommendations with recurrent
+neural networks>(https://arxiv.org/abs/1511.06939)
+
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+using CPUCtx = paddle::platform::CPUDeviceContext;
+
+REGISTER_OPERATOR(bpr_loss, ops::BprLossOp, ops::BprLossOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(bpr_loss_grad, ops::BprLossGradientOp);
+REGISTER_OP_CPU_KERNEL(bpr_loss, ops::BprLossOpKernel<CPUCtx, float>,
+                       ops::BprLossOpKernel<CPUCtx, double>);
+REGISTER_OP_CPU_KERNEL(bpr_loss_grad,
+                       ops::BprLossGradientOpKernel<CPUCtx, float>,
+                       ops::BprLossGradientOpKernel<CPUCtx, double>);
diff --git a/paddle/fluid/operators/bpr_loss_op.h b/paddle/fluid/operators/bpr_loss_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..e223be7af82146e7c69c7c5aab8f08d0fe0d1710
--- /dev/null
+++ b/paddle/fluid/operators/bpr_loss_op.h
@@ -0,0 +1,118 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/platform/for_range.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+/*Todo:
+ *Find a way to adapt TolerableValue, using blas or eigen.
+ */
+template <typename T>
+struct TolerableValue {
+  HOSTDEVICE T operator()(const T& x) const {
+    PADDLE_ASSERT(std::is_floating_point<T>::value);
+    const T kApproInf = 1e20;
+    if (x == INFINITY) return kApproInf;
+    if (x == -INFINITY) return -kApproInf;
+    return x;
+  }
+};
+
+template <typename DeviceContext, typename T>
+class BprLossOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<Tensor>("X");
+    auto* label = ctx.Input<Tensor>("Label");
+    auto* y = ctx.Output<Tensor>("Y");
+    y->mutable_data<T>(ctx.GetPlace());
+    int rank = x->dims().size();
+
+    Tensor x_2d = framework::ReshapeToMatrix(*x, rank - 1);
+    Tensor labels_2d = framework::ReshapeToMatrix(*label, rank - 1);
+    Tensor y_2d = framework::ReshapeToMatrix(*y, rank - 1);
+
+    const framework::Tensor* logits = &x_2d;
+    const framework::Tensor* labels = &labels_2d;
+    framework::Tensor* out = &y_2d;
+
+    const int step_size = logits->dims()[0];
+    const int class_num = logits->dims()[1];
+    const T* logits_data = logits->data<T>();
+    T* loss_data = out->data<T>();
+
+    const int64_t* label_data = labels->data<int64_t>();
+    for (int i = 0; i < step_size; ++i) {
+      int lbl_pos = label_data[i];
+      PADDLE_ENFORCE_GE(lbl_pos, 0);
+      PADDLE_ENFORCE_LT(lbl_pos, class_num);
+      int index_pos = i * class_num + lbl_pos;
+      T sum = static_cast<T>(0);
+      for (int j = 0; j < class_num; j++) {
+        if (j == lbl_pos) continue;
+        int index_neg = i * class_num + j;
+        sum += TolerableValue<T>()(-std::log(
+            1.0f + TolerableValue<T>()(std::exp(logits_data[index_neg] -
+                                                logits_data[index_pos]))));
+      }
+      loss_data[i] = -sum / (class_num - 1);
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class BprLossGradientOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<Tensor>("X");
+    auto* dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
+    auto* label = ctx.Input<Tensor>("Label");
+    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+
+    const int step_size = x->dims()[0];
+    const int num_classes = x->dims()[1];
+    T* dx_data = dx->mutable_data<T>(ctx.GetPlace());
+    const T* dy_data = dy->data<T>();
+    const T* x_data = x->data<T>();
+    const int64_t* label_data = label->data<int64_t>();
+
+    for (size_t sample_id = 0; sample_id < step_size; sample_id++) {
+      for (size_t x_offset = sample_id * num_classes;
+           x_offset < (sample_id + 1) * num_classes; x_offset++) {
+        dx_data[x_offset] = static_cast<T>(0);
+      }
+      auto p_index = sample_id * num_classes + label_data[sample_id];
+      for (size_t ni = 0; ni < num_classes; ni++) {
+        if (label_data[sample_id] == ni) continue;
+        auto n_index = sample_id * num_classes + ni;
+        auto grad_ = -dy_data[sample_id] /
+                     ((num_classes - 1) *
+                      (1.0f + TolerableValue<T>()(std::exp(x_data[p_index] -
+                                                           x_data[n_index]))));
+        dx_data[p_index] += grad_;
+        dx_data[n_index] -= grad_;
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/concat_mkldnn_op.cc b/paddle/fluid/operators/concat_mkldnn_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7ad674056f0d753d79408a11eff1aca47a84998a
--- /dev/null
+++ b/paddle/fluid/operators/concat_mkldnn_op.cc
@@ -0,0 +1,152 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <memory>
+#include "paddle/fluid/operators/concat_op.h"
+#include "paddle/fluid/platform/mkldnn_helper.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::DataLayout;
+using framework::Tensor;
+using mkldnn::memory;
+using mkldnn::primitive;
+using mkldnn::concat;
+using mkldnn::stream;
+using platform::to_void_cast;
+
+static void EnforceLayouts(const std::vector<const Tensor*> inputs) {
+  for (auto* input : inputs) {
+    const bool is_layout_correct = input->layout() == DataLayout::kMKLDNN;
+    const bool is_format_defined =
+        input->format() != memory::format::format_undef;
+    PADDLE_ENFORCE(is_layout_correct && is_format_defined,
+                   "Wrong layout/format set for Input tensor");
+  }
+}
+
+static memory::primitive_desc CreateMemPrimDesc(const Tensor& input,
+                                                const mkldnn::engine& engine) {
+  constexpr auto data_type = mkldnn::memory::f32;
+  const auto dims = paddle::framework::vectorize2int(input.dims());
+  const auto format = input.format();
+  auto description = memory::desc(dims, data_type, format);
+  auto mem_prim_desc = memory::primitive_desc(description, engine);
+  return mem_prim_desc;
+}
+
+static mkldnn::memory::format GetDstMemFormat(
+    const concat::primitive_desc& concat_pd) {
+  return (memory::format)concat_pd.dst_primitive_desc().desc().data.format;
+}
+
+static platform::CPUPlace GetCpuPlace(
+    const paddle::framework::ExecutionContext& ctx) {
+  auto place = ctx.GetPlace();
+  PADDLE_ENFORCE(paddle::platform::is_cpu_place(place),
+                 "It must use CPUPlace.");
+  return boost::get<platform::CPUPlace>(place);
+}
+
+static const mkldnn::engine& GetMKLDNNEngine(
+    const paddle::framework::ExecutionContext& ctx) {
+  auto& dev_ctx = ctx.template device_context<platform::MKLDNNDeviceContext>();
+  return dev_ctx.GetEngine();
+}
+
+template <typename T>
+class ConcatPrimitiveFactory {
+ public:
+  concat::primitive_desc CreateConcatPrimDescriptor(
+      const std::vector<const Tensor*> multi_input, Tensor* output,
+      int concat_axis, const mkldnn::engine& mkldnn_engine) {
+    CreateSourcesDescriptors(multi_input, mkldnn_engine);
+    auto dst_desc = CreateDstMemDescriptor(output);
+    return concat::primitive_desc(dst_desc, concat_axis, srcs_pd);
+  }
+
+  concat CreateConcatPrimitive(const concat::primitive_desc& concat_pd,
+                               Tensor* output, platform::CPUPlace place) {
+    CreateSourcePrimitiveAts();
+    dst_mem = CreateDstMemory(concat_pd, output, place);
+    return concat(concat_pd, inputs, dst_mem.get());
+  }
+
+ private:
+  memory::desc CreateDstMemDescriptor(Tensor* output) {
+    auto dst_dims = paddle::framework::vectorize2int(output->dims());
+    return memory::desc(dst_dims, platform::MKLDNNGetDataType<T>(),
+                        memory::format::any);
+  }
+
+  mkldnn::memory CreateDstMemory(const concat::primitive_desc& concat_pd,
+                                 Tensor* output, platform::CPUPlace place) {
+    return memory(concat_pd.dst_primitive_desc(),
+                  output->mutable_data<T>(place));
+  }
+
+  void CreateSourcesDescriptors(const std::vector<const Tensor*> multi_input,
+                                const mkldnn::engine& mkldnn_engine) {
+    for (size_t i = 0; i < multi_input.size(); i++) {
+      auto mem_prim_desc = CreateMemPrimDesc(*multi_input[i], mkldnn_engine);
+      srcs_pd.push_back(mem_prim_desc);
+      srcs.push_back(
+          memory(mem_prim_desc, to_void_cast(multi_input[i]->data<T>())));
+    }
+  }
+
+  void CreateSourcePrimitiveAts() {
+    inputs.reserve(srcs.size());
+    for (size_t i = 0; i < srcs.size(); i++) {
+      inputs.push_back(srcs[i]);
+    }
+  }
+
+ private:
+  std::vector<memory::primitive_desc> srcs_pd;
+  std::vector<memory> srcs;
+  std::vector<primitive::at> inputs;
+  boost::optional<memory> dst_mem;  // TODO(mgallus): change to std::optional
+};                                  // upon introduction of C++17 to paddle
+
+template <typename T>
+class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
+    auto place = GetCpuPlace(ctx);
+    const auto& mkldnn_engine = GetMKLDNNEngine(ctx);
+
+    auto multi_input = ctx.MultiInput<Tensor>("X");
+    EnforceLayouts(multi_input);
+    Tensor* output = ctx.Output<Tensor>("Out");
+    int64_t concat_axis = static_cast<int64_t>(ctx.Attr<int>("axis"));
+
+    ConcatPrimitiveFactory<T> prim_creator;
+    auto concat_pd = prim_creator.CreateConcatPrimDescriptor(
+        multi_input, output, static_cast<int>(concat_axis), mkldnn_engine);
+    auto concat = prim_creator.CreateConcatPrimitive(concat_pd, output, place);
+    stream(stream::kind::eager).submit({concat}).wait();
+
+    output->set_layout(DataLayout::kMKLDNN);
+    output->set_format(GetDstMemFormat(concat_pd));
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_KERNEL(concat, MKLDNN, ::paddle::platform::CPUPlace,
+                   ops::ConcatMKLDNNOpKernel<float>)
diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc
index 57817da71adfd80faad29a48b05ba2f326de6c07..194f9cf5033a3a73afeb8e92ddbdcc7b316fcd35 100644
--- a/paddle/fluid/operators/concat_op.cc
+++ b/paddle/fluid/operators/concat_op.cc
@@ -13,10 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/concat_op.h"
-
 #include <string>
 #include <vector>
 
+#ifdef PADDLE_WITH_MKLDNN
+#include <paddle/fluid/platform/mkldnn_helper.h>
+#endif
+
 namespace paddle {
 namespace operators {
 using framework::Tensor;
@@ -59,6 +62,22 @@ class ConcatOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim("Out", out_dims);
     ctx->ShareLoD("X", /*->*/ "Out");
   }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    auto input_data_type =
+        framework::GetDataTypeOfVar(ctx.MultiInputVar("X")[0]);
+
+#ifdef PADDLE_WITH_MKLDNN
+    if (platform::CanMKLDNNBeUsed(ctx)) {
+      return framework::OpKernelType(input_data_type, ctx.GetPlace(),
+                                     framework::DataLayout::kMKLDNN,
+                                     framework::LibraryType::kMKLDNN);
+    }
+#endif
+    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+  }
 };
 
 class ConcatOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -66,6 +85,10 @@ class ConcatOpMaker : public framework::OpProtoAndCheckerMaker {
   void Make() override {
     AddInput("X", "Input tensors of concat operator.").AsDuplicable();
     AddOutput("Out", "Output tensor of concat operator.");
+    AddAttr<bool>(
+        "use_mkldnn",
+        "(bool, default false) Indicates if MKL-DNN kernel will be used")
+        .SetDefault(false);
     AddAttr<int>("axis",
                  "The axis along which the input tensors will be concatenated.")
         .SetDefault(0);
diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc
index 6c1b2f329a59e1b27caad2996308b33b3a72de1d..5ab0918c486cc56c7d55f24f4952a013044971ee 100644
--- a/paddle/fluid/operators/controlflow/while_op.cc
+++ b/paddle/fluid/operators/controlflow/while_op.cc
@@ -32,6 +32,20 @@ static constexpr char kStepScopes[] = "StepScopes";
 static constexpr char kX[] = "X";
 static constexpr char kXGRAD[] = "X@GRAD";
 static constexpr char kOutputs[] = "Out";
+static constexpr char kSkipEagerDeletionVars[] = "skip_eager_deletion_vars";
+
+namespace {  // NOLINT
+static std::string GetSkipEagerDeletionVarsDebugString(
+    const std::vector<std::string> &vars) {
+  std::string str = "Skip " + std::to_string(vars.size()) +
+                    " var(s) in eager deletion mode: ";
+  for (auto &var : vars) {
+    str.append(var);
+    str.push_back(' ');
+  }
+  return str;
+}
+}  // NOLINT
 
 class WhileOp : public framework::OperatorBase {
  public:
@@ -59,7 +73,10 @@ class WhileOp : public framework::OperatorBase {
                    "Condition of while op must in CPU memory.");
 
     bool is_test = Attr<bool>("is_test");
-    auto ctx = executor.Prepare(*program, block->ID());
+    auto &skip_vars = Attr<std::vector<std::string>>(kSkipEagerDeletionVars);
+    VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars);
+
+    auto ctx = executor.Prepare(*program, block->ID(), skip_vars);
     while (cond.data<bool>()[0]) {
       auto &current_scope = scope.NewScope();
       step_scopes->push_back(&current_scope);
@@ -96,6 +113,10 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker {
                   "(bool, default false) Set to true for inference only, false "
                   "for training. Some layers may run faster when this is true.")
         .SetDefault(false);
+    AddAttr<std::vector<std::string>>(kSkipEagerDeletionVars,
+                                      "Vars that would skip eager deletion."
+                                      "Users should not set this manually.")
+        .SetDefault(std::vector<std::string>());
     AddComment(R"DOC(
 )DOC");
   }
@@ -119,7 +140,10 @@ class WhileGradOp : public framework::OperatorBase {
     framework::Executor executor(dev_place);
     auto *block = Attr<framework::BlockDesc *>(kStepBlock);
     auto *program = block->Program();
-    auto ctx = executor.Prepare(*program, block->ID());
+
+    auto &skip_vars = Attr<std::vector<std::string>>(kSkipEagerDeletionVars);
+    VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars);
+    auto ctx = executor.Prepare(*program, block->ID(), skip_vars);
 
     auto *step_scopes =
         scope.FindVar(Input(kStepScopes))->GetMutable<StepScopeVar>();
@@ -341,6 +365,8 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker {
     // while operator could be renamed.
     while_grad->SetAttr("original_output_grad", output_grads_list);
 
+    while_grad->SetAttr(kSkipEagerDeletionVars, std::vector<std::string>());
+
     return std::unique_ptr<framework::OpDesc>(while_grad);
   }
 };
diff --git a/paddle/fluid/operators/conv_fusion_op.cu.cc b/paddle/fluid/operators/conv_fusion_op.cu.cc
index 2c09ee7394ad605f7a324d021ce0468a79bb71ca..3235ad52b999e1ca3f992034781edaab9921a300 100644
--- a/paddle/fluid/operators/conv_fusion_op.cu.cc
+++ b/paddle/fluid/operators/conv_fusion_op.cu.cc
@@ -110,11 +110,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
 
     auto x_dims = framework::vectorize(input->dims());
     auto f_dims = framework::vectorize(filter->dims());
-    if (activation == "identity") {
-      // Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM algo is
-      // enabled with CUDNN_ACTIVATION_IDENTITY in cuDNN lib.
-      algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
-    } else if (!exhaustive_search) {
+    if (!exhaustive_search) {
       CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
           handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
           cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
@@ -165,18 +161,42 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
     PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit,
                       "workspace_size to be allocated exceeds the limit");
 
-    // ------------------- cudnn conv+bias+act forward --------------------
-    ScalingParamType<T> alpha1 = 1.0f;
-    ScalingParamType<T> alpha2 = residual ? 1.0f : 0.0f;
-    auto cudnn_func = [&](void* cudnn_workspace) {
-      CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward(
-          handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc,
-          filter_data, cudnn_conv_desc, algo, cudnn_workspace,
-          workspace_size_in_bytes, &alpha2, cudnn_output_desc, residual_data,
-          cudnn_bias_desc, bias_data, cudnn_act_desc, cudnn_output_desc,
+    if ((activation == "identity") &&
+        (algo != CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) &&
+        (!residual)) {
+      // Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM algo is
+      // enabled with CUDNN_ACTIVATION_IDENTITY in cuDNN lib.
+      // But test in some case, the speed is slower, change to use
+      // cudnnConvolutionForward and cudnnAddTensor
+      // ------------- cudnn conv forward and bias add ---------------------
+      ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
+      auto cudnn_func = [&](void* cudnn_workspace) {
+        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
+            handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc,
+            filter_data, cudnn_conv_desc, algo, cudnn_workspace,
+            workspace_size_in_bytes, &beta, cudnn_output_desc, output_data));
+      };
+      workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
+      CUDNN_ENFORCE(platform::dynload::cudnnAddTensor(
+          handle, &alpha, cudnn_bias_desc, bias_data, &alpha, cudnn_output_desc,
           output_data));
-    };
-    workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
+    } else {
+      if (activation == "identity") {
+        algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
+      }
+      // ------------------- cudnn conv+bias+act forward --------------------
+      ScalingParamType<T> alpha1 = 1.0f;
+      ScalingParamType<T> alpha2 = residual ? 1.0f : 0.0f;
+      auto cudnn_func = [&](void* cudnn_workspace) {
+        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward(
+            handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc,
+            filter_data, cudnn_conv_desc, algo, cudnn_workspace,
+            workspace_size_in_bytes, &alpha2, cudnn_output_desc, residual_data,
+            cudnn_bias_desc, bias_data, cudnn_act_desc, cudnn_output_desc,
+            output_data));
+      };
+      workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
+    }
   }
 };
 #endif
diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc
index 9e2e2cf818000d9181447a0aa6b4ac4878781f35..154ff2bb209bb8f932c06caa319223ccf3314767 100644
--- a/paddle/fluid/operators/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_mkldnn_op.cc
@@ -15,7 +15,7 @@
 #include "paddle/fluid/framework/data_layout_transform.h"
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/operators/conv_op.h"
-#include "paddle/fluid/platform/mkldnn_helper.h"
+#include "paddle/fluid/platform/mkldnn_reuse.h"
 
 namespace paddle {
 namespace operators {
@@ -28,258 +28,45 @@ using mkldnn::stream;
 using platform::to_void_cast;
 using platform::GetMKLDNNFormat;
 
-class ConvMKLDNNHandler : public platform::MKLDNNHandler {
- public:
-  ConvMKLDNNHandler(
-      std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd,
-      const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine,
-      const std::string& base_key)
-      : platform::MKLDNNHandler(dev_ctx, engine, base_key) {
-    conv_pd_ = conv_pd;
-  }
-
-  ConvMKLDNNHandler(
-      std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd,
-      std::shared_ptr<mkldnn::convolution_backward_data::primitive_desc>
-          conv_bwd_data_pd,
-      std::shared_ptr<mkldnn::convolution_backward_weights::primitive_desc>
-          conv_bwd_weights_pd,
-      const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine,
-      const std::string& base_key)
-      : platform::MKLDNNHandler(dev_ctx, engine, base_key),
-        conv_pd_(conv_pd),
-        conv_bwd_weights_pd_(conv_bwd_weights_pd),
-        conv_bwd_data_pd_(conv_bwd_data_pd) {
-    // If we are in Grad operatgor then update a key with BWD suffix to
-    // distinguish from FWD memory primitives
-    key_ += "-BWD";
-  }
-
-  size_t GetDstMemorySize() const {
-    return conv_pd_->dst_primitive_desc().get_size();
-  }
-
-  mkldnn::memory::format GetDstFormat() const {
-    return static_cast<mkldnn::memory::format>(
-        conv_pd_->dst_primitive_desc().desc().data.format);
-  }
-
-  size_t GetDiffWeightsMemorySize() const {
-    return conv_bwd_weights_pd_->diff_weights_primitive_desc().get_size();
-  }
-
-  size_t GetDiffSourceMemorySize() const {
-    return conv_bwd_data_pd_->diff_src_primitive_desc().get_size();
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireSrcMemoryFromWeightsPrimitive(
-      const std::shared_ptr<mkldnn::memory> user_memory_p,
-      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
-    auto src_pd = conv_bwd_weights_pd_->src_primitive_desc();
-    auto user_pd = user_memory_p->get_primitive_desc();
-    return this->AcquireMemory(src_pd, user_pd, user_memory_p,
-                               "@weights-src_mem_p", pipeline);
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireDiffDstMemoryFromWeightsPrimitive(
-      const std::shared_ptr<mkldnn::memory> user_memory_p,
-      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
-    auto diff_dst_pd = conv_bwd_weights_pd_->diff_dst_primitive_desc();
-    auto user_pd = user_memory_p->get_primitive_desc();
-    return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p,
-                               "@weights-diff_dst_mem_p", pipeline);
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireDiffWeightsMemoryFromWeightsPrimitive(
-      void* ptr) {
-    return this->AcquireMemoryFromPrimitive(
-        conv_bwd_weights_pd_->diff_weights_primitive_desc(), ptr,
-        "@diff_weights_mem_p");
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireDiffDstMemoryFromDataPrimitive(
-      const std::shared_ptr<mkldnn::memory> user_memory_p,
-      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
-    auto diff_dst_pd = conv_bwd_data_pd_->diff_dst_primitive_desc();
-    auto user_pd = user_memory_p->get_primitive_desc();
-    return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p,
-                               "@data-diff_dst_mem_p", pipeline);
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireWeightsMemoryFromDataPrimitive(
-      const std::shared_ptr<mkldnn::memory> user_weights_memory_p,
-      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
-    auto weights_pd = conv_bwd_data_pd_->weights_primitive_desc();
-    auto user_pd = user_weights_memory_p->get_primitive_desc();
-    return this->AcquireMemory(weights_pd, user_pd, user_weights_memory_p,
-                               "@data-weights_mem_p", pipeline);
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireResidualDataMemory(
-      const mkldnn::memory::desc& md, void* ptr) {
-    return this->AcquireMemory(md, ptr, "@user_residual_data_mem_p");
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireDstMemoryFromResidualDataMemory(
-      const std::shared_ptr<mkldnn::memory>& user_residual_memory_p,
-      void* dst_ptr,
-      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
-    return this->AcquireMemory(user_residual_memory_p,
-                               this->AcquireDstMemoryFromPrimitive(dst_ptr),
-                               "@residual_data_mem_p", pipeline);
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireDiffSrcMemoryFromDataPrimitive(
-      void* ptr) {
-    return this->AcquireMemoryFromPrimitive(
-        conv_bwd_data_pd_->diff_src_primitive_desc(), ptr, "@diff_src_mem_p");
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireDstMemoryFromPrimitive(void* ptr) {
-    return this->AcquireMemoryFromPrimitive(conv_pd_->dst_primitive_desc(), ptr,
-                                            "@dst_mem_p");
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireSrcMemoryFromPrimitive(
-      const std::shared_ptr<mkldnn::memory> user_memory_p,
-      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
-    auto src_pd = conv_pd_->src_primitive_desc();
-    auto user_pd = user_memory_p->get_primitive_desc();
-    return this->AcquireMemory(src_pd, user_pd, user_memory_p, "@src_mem_p",
-                               pipeline);
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireWeightsMemoryFromPrimitive(
-      const std::shared_ptr<mkldnn::memory> user_weights_memory_p,
-      std::vector<mkldnn::primitive>& pipeline,  // NOLINT
-      bool is_persistent = false) {
-    auto user_weights_pd = user_weights_memory_p->get_primitive_desc();
-    auto weights_pd = conv_pd_->weights_primitive_desc();
-    return this->AcquireMemory(weights_pd, user_weights_pd,
-                               user_weights_memory_p, "@weights_mem_p",
-                               pipeline, is_persistent);
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireBiasMemoryFromPrimitive(
-      const std::shared_ptr<mkldnn::memory> user_bias_memory_p,
-      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
-    auto user_bias_pd = user_bias_memory_p->get_primitive_desc();
-    auto bias_pd = conv_pd_->bias_primitive_desc();
-    return this->AcquireMemory(bias_pd, user_bias_pd, user_bias_memory_p,
-                               "@bias_mem_p", pipeline);
-  }
-
-  std::shared_ptr<mkldnn::convolution_forward> AcquireConvolution(
-      std::shared_ptr<mkldnn::memory> src_memory_p,
-      std::shared_ptr<mkldnn::memory> weights_memory_p,
-      std::shared_ptr<mkldnn::memory> dst_memory_p) {
-    auto prim_key = key_ + "@conv_p";
-    auto conv_p = std::static_pointer_cast<mkldnn::convolution_forward>(
-        dev_ctx_.GetBlob(prim_key));
-    PADDLE_ENFORCE((conv_p != nullptr) || (is_reusing_ == false),
-                   "Fail to find convolution primitive in device context");
-    if (conv_p == nullptr) {
-      conv_p = std::make_shared<mkldnn::convolution_forward>(
-          *conv_pd_, *(src_memory_p), *(weights_memory_p.get()),
-          *(dst_memory_p.get()));
-
-      dev_ctx_.SetBlob(prim_key, conv_p);
-    } else {
-      is_reusing_ = true;
-    }
-    return conv_p;
-  }
-
-  std::shared_ptr<mkldnn::convolution_forward> AcquireConvolution(
-      std::shared_ptr<mkldnn::memory> src_memory_p,
-      std::shared_ptr<mkldnn::memory> weights_memory_p,
-      std::shared_ptr<mkldnn::memory> bias_memory_p,
-      std::shared_ptr<mkldnn::memory> dst_memory_p) {
-    auto prim_key = key_ + "@conv_p";
-    auto conv_p = std::static_pointer_cast<mkldnn::convolution_forward>(
-        dev_ctx_.GetBlob(prim_key));
-    PADDLE_ENFORCE((conv_p != nullptr) || (is_reusing_ == false),
-                   "Fail to find convolution primitive in device context");
-    if (conv_p == nullptr) {
-      conv_p = std::make_shared<mkldnn::convolution_forward>(
-          *conv_pd_, *(src_memory_p), *(weights_memory_p.get()),
-          *(bias_memory_p.get()), *(dst_memory_p.get()));
-
-      dev_ctx_.SetBlob(prim_key, conv_p);
-    } else {
-      is_reusing_ = true;
-    }
-    return conv_p;
-  }
-
-  std::shared_ptr<mkldnn::convolution_backward_weights>
-  AcquireConvolutionBackwardWeights(
-      std::shared_ptr<mkldnn::memory> src_memory_p,
-      std::shared_ptr<mkldnn::memory> diff_dst_memory_p,
-      std::shared_ptr<mkldnn::memory> diff_weights_memory_p) {
-    auto prim_key = key_ + "@conv_bwd_weights_p";
-    auto conv_bwd_weights_p =
-        std::static_pointer_cast<mkldnn::convolution_backward_weights>(
-            dev_ctx_.GetBlob(prim_key));
-    PADDLE_ENFORCE(
-        (conv_bwd_weights_p != nullptr) || (is_reusing_ == false),
-        "Fail to find convolution bwd weights primitive in device context");
-    if (conv_bwd_weights_p == nullptr) {
-      // create backward conv primitive for weights
-      conv_bwd_weights_p =
-          std::make_shared<mkldnn::convolution_backward_weights>(
-              *conv_bwd_weights_pd_, *src_memory_p, *diff_dst_memory_p,
-              *diff_weights_memory_p);
-      dev_ctx_.SetBlob(prim_key, conv_bwd_weights_p);
-    } else {
-      is_reusing_ = true;
-    }
-    return conv_bwd_weights_p;
-  }
-
-  std::shared_ptr<mkldnn::convolution_backward_data>
-  AcquireConvolutionBackwardData(
-      std::shared_ptr<mkldnn::memory> diff_dst_memory_p,
-      std::shared_ptr<mkldnn::memory> weights_memory_p,
-      std::shared_ptr<mkldnn::memory> diff_src_memory_p) {
-    auto prim_key = key_ + "@conv_bwd_data_p";
-    auto conv_bwd_data_p =
-        std::static_pointer_cast<mkldnn::convolution_backward_data>(
-            dev_ctx_.GetBlob(prim_key));
-    PADDLE_ENFORCE(
-        (conv_bwd_data_p != nullptr) || (is_reusing_ == false),
-        "Fail to find convolution bwd data primitive in device context");
-    if (conv_bwd_data_p == nullptr) {
-      conv_bwd_data_p = std::make_shared<mkldnn::convolution_backward_data>(
-          *conv_bwd_data_pd_, *diff_dst_memory_p, *weights_memory_p,
-          *diff_src_memory_p);
-      dev_ctx_.SetBlob(prim_key, conv_bwd_data_p);
+inline void GetWeightsTz(std::vector<int>& weights_tz, int groups,  // NOLINT
+                         bool is_conv3d) {
+  if (groups > 1) {
+    if (is_conv3d) {
+      int output = weights_tz[0];
+      int input = weights_tz[1];
+      int dimension = weights_tz[2];
+      int height = weights_tz[3];
+      int width = weights_tz[4];
+      weights_tz.resize(6);
+      weights_tz[0] = groups;
+      weights_tz[1] = output / groups;
+      weights_tz[2] = input;
+      weights_tz[3] = dimension;
+      weights_tz[4] = height;
+      weights_tz[5] = width;
     } else {
-      is_reusing_ = true;
+      int output = weights_tz[0];
+      int input = weights_tz[1];
+      int height = weights_tz[2];
+      int width = weights_tz[3];
+      weights_tz.resize(5);
+      weights_tz[0] = groups;
+      weights_tz[1] = output / groups;
+      weights_tz[2] = input;
+      weights_tz[3] = height;
+      weights_tz[4] = width;
     }
-    return conv_bwd_data_p;
   }
-
-  // Generate keys for storing/retriving primitives for this operator
-  // TODO(jczaja): Make hashing function more optimial
-  static std::string GetHash(memory::dims& input_dims,     // NOLINT
-                             memory::dims& weights_dims,   // NOLINT
-                             std::vector<int>& strides,    // NOLINT
-                             std::vector<int>& paddings,   // NOLINT
-                             std::vector<int>& dilations,  // NOLINT
-                             int groups, const std::string& suffix) {
-    return dims2str(input_dims) + dims2str(weights_dims) + dims2str(strides) +
-           dims2str(paddings) + dims2str(dilations) + std::to_string(groups) +
-           suffix;
+}
+
+inline mkldnn::memory::format GetWeightsFormat(mkldnn::memory::format format,
+                                               int groups, bool is_conv3d) {
+  if (is_conv3d) {
+    return (groups == 1) ? format : mkldnn::memory::format::goidhw;
+  } else {
+    return (groups == 1) ? format : mkldnn::memory::format::goihw;
   }
-
- private:
-  std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd_;
-  std::shared_ptr<mkldnn::convolution_backward_weights::primitive_desc>
-      conv_bwd_weights_pd_;
-  std::shared_ptr<mkldnn::convolution_backward_data::primitive_desc>
-      conv_bwd_data_pd_;
-};
+}
 
 template <typename T>
 class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
@@ -305,10 +92,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN &&
                        filter->format() != memory::format::format_undef,
                    "Wrong layout/format set for Filter tensor");
-    PADDLE_ENFORCE(input->dims().size() == 4,
-                   "Input must be with 4 dimensions, i.e. NCHW");
-    PADDLE_ENFORCE(filter->dims().size() == 4,
-                   "Filter must be with 4 dimensions, i.e. OIHW");
+    PADDLE_ENFORCE(input->dims().size() == 4 || input->dims().size() == 5,
+                   "Input must be with 4 or 5 dimensions, i.e. NCHW or NCDHW");
+    PADDLE_ENFORCE(filter->dims().size() == 4 || filter->dims().size() == 5,
+                   "Filter must be with 4 or 5 dimensions, i.e. OIHW or OIDHW");
     if (bias) {
       PADDLE_ENFORCE(bias->layout() == DataLayout::kMKLDNN &&
                          bias->format() != memory::format::format_undef,
@@ -324,9 +111,13 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     bool fuse_residual_conn = ctx.Attr<bool>("fuse_residual_connection");
     int groups = ctx.Attr<int>("groups");
 
+    bool is_conv3d = strides.size() == 3U;
     // TODO(tpatejko): add support for dilation
     PADDLE_ENFORCE(
-        dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1,
+        is_conv3d
+            ? dilations.size() == 3 && dilations[0] == 1 && dilations[1] == 1 &&
+                  dilations[2] == 1
+            : dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1,
         "dilation in convolution is not implemented yet");
 
     const T* input_data = input->data<T>();
@@ -336,33 +127,25 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     std::vector<int> weights_tz =
         paddle::framework::vectorize2int(filter->dims());
     int g = std::max(groups, 1);
-    if (g > 1) {
-      int o = weights_tz[0];
-      int i = weights_tz[1];
-      int h = weights_tz[2];
-      int w = weights_tz[3];
-      weights_tz.resize(5);
-      weights_tz[0] = g;
-      weights_tz[1] = o / g;
-      weights_tz[2] = i;
-      weights_tz[3] = h;
-      weights_tz[4] = w;
-    }
+    GetWeightsTz(weights_tz, g, is_conv3d);
     std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
 
     // Get unique name for storing MKLDNN primitives
-    const std::string key = ConvMKLDNNHandler::GetHash(
+    const std::string key = platform::ConvMKLDNNHandler::GetHash(
         src_tz, weights_tz, strides, paddings, dilations, groups,
         ctx.op().Output("Output"));
     const std::string key_conv_pd = key + "@conv_pd";
 
     std::vector<primitive> pipeline;
 
+    auto src_format = input->format();
+    mkldnn::memory::format weights_format =
+        GetWeightsFormat(filter->format(), g, is_conv3d);
+
     auto user_src_md = platform::MKLDNNMemDesc(
-        {src_tz}, platform::MKLDNNGetDataType<T>(), input->format());
+        {src_tz}, platform::MKLDNNGetDataType<T>(), src_format);
     auto user_weights_md = platform::MKLDNNMemDesc(
-        {weights_tz}, platform::MKLDNNGetDataType<T>(),
-        (g == 1) ? filter->format() : mkldnn::memory::format::goihw);
+        {weights_tz}, platform::MKLDNNGetDataType<T>(), weights_format);
 
     /* create memory descriptor for convolution without specified format
      * ('any') which lets a primitive (convolution in this case) choose
@@ -372,10 +155,16 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto chosen_memory_format =
         platform::data_format_to_memory_format(data_format);
 
+    if (is_conv3d) {
+      chosen_memory_format =
+          platform::MKLDNNFormatForSize(src_tz.size(), chosen_memory_format);
+    }
+    weights_format = GetWeightsFormat(chosen_memory_format, g, is_conv3d);
+
     auto src_md = platform::MKLDNNMemDesc(
         src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
     auto weights_md = platform::MKLDNNMemDesc(
-        weights_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
+        weights_tz, platform::MKLDNNGetDataType<T>(), weights_format);
     std::vector<int> bias_tz;  // TODO(mgallus): avoid empty vector creation.
                                // Currently used whenever bias is != nullptr.
     auto dst_md = platform::MKLDNNMemDesc(
@@ -400,7 +189,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     // Save conv_pd/src_memory/weights_memory for backward pass
     if (!is_test) dev_ctx.SetBlob(key_conv_pd, conv_pd);
 
-    ConvMKLDNNHandler handler(conv_pd, dev_ctx, mkldnn_engine, key);
+    platform::ConvMKLDNNHandler handler(conv_pd, dev_ctx, mkldnn_engine, key);
 
     // create mkldnn memory from input tensors (data/weights)
     auto user_src_memory_p =
@@ -516,8 +305,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
                        const mkldnn::engine& engine, const bool fuse_relu,
                        const bool fuse_residual_conn,
                        mkldnn::prop_kind fwd_prop_kind) const {
-    memory::dims stride_dims = {strides[0], strides[1]};
-    memory::dims padding_dims = {paddings[0], paddings[1]};
+    memory::dims stride_dims = strides;
+    memory::dims padding_dims = paddings;
 
     auto conv_desc = mkldnn::convolution_forward::desc(
         fwd_prop_kind, mkldnn::convolution_direct, src, weights, dst,
@@ -541,8 +330,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
                        const mkldnn::engine& engine, const bool fuse_relu,
                        const bool fuse_residual_conn,
                        mkldnn::prop_kind fwd_prop_kind) const {
-    memory::dims stride_dims = {strides[0], strides[1]};
-    memory::dims padding_dims = {paddings[0], paddings[1]};
+    memory::dims stride_dims = strides;
+    memory::dims padding_dims = paddings;
 
     auto conv_desc = mkldnn::convolution_forward::desc(
         fwd_prop_kind, mkldnn::convolution_direct, src, weights, bias, dst,
@@ -602,6 +391,7 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
     int groups = ctx.Attr<int>("groups");
 
+    bool is_conv3d = strides.size() == 3U;
     const T* input_data = input->data<T>();
     const T* filter_data = filter->data<T>();
     const T* output_grad_data = output_grad->data<T>();
@@ -611,23 +401,29 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
     std::vector<int> weights_tz =
         paddle::framework::vectorize2int(filter->dims());
+    int g = std::max(groups, 1);
+    GetWeightsTz(weights_tz, g, is_conv3d);
     std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
 
+    auto src_format = input->format();
+    mkldnn::memory::format weights_format =
+        GetWeightsFormat(filter->format(), g, is_conv3d);
+
     // Get an unique name from "argument" name of "Output" variable
     // as well as attributes of primitive to be created
     // This name will be used as key when saving info into device context
-    const std::string key =
-        ConvMKLDNNHandler::GetHash(src_tz, weights_tz, strides, paddings,
-                                   dilations, groups, ctx.op().Input("Output"));
+    const std::string key = platform::ConvMKLDNNHandler::GetHash(
+        src_tz, weights_tz, strides, paddings, dilations, groups,
+        ctx.op().Input("Output"));
 
     const std::string key_conv_pd = key + "@conv_pd";
     std::vector<primitive> pipeline;
 
     // Create user memory descriptors
     auto user_src_md = platform::MKLDNNMemDesc(
-        {src_tz}, platform::MKLDNNGetDataType<T>(), input->format());
+        {src_tz}, platform::MKLDNNGetDataType<T>(), src_format);
     auto user_weights_md = platform::MKLDNNMemDesc(
-        {weights_tz}, platform::MKLDNNGetDataType<T>(), filter->format());
+        {weights_tz}, platform::MKLDNNGetDataType<T>(), weights_format);
     auto user_diff_dst_md = platform::MKLDNNMemDesc(
         {dst_tz}, platform::MKLDNNGetDataType<T>(), output_grad->format());
 
@@ -639,14 +435,20 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     auto chosen_memory_format =
         platform::data_format_to_memory_format(data_format);
 
+    if (is_conv3d) {
+      chosen_memory_format =
+          platform::MKLDNNFormatForSize(src_tz.size(), chosen_memory_format);
+    }
+    weights_format = GetWeightsFormat(chosen_memory_format, g, is_conv3d);
+
     auto src_md = platform::MKLDNNMemDesc(
         src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
     auto diff_src_md = platform::MKLDNNMemDesc(
         src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
     auto weights_md = platform::MKLDNNMemDesc(
-        weights_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
+        weights_tz, platform::MKLDNNGetDataType<T>(), weights_format);
     auto diff_weights_md = platform::MKLDNNMemDesc(
-        weights_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
+        weights_tz, platform::MKLDNNGetDataType<T>(), weights_format);
     auto diff_dst_md = platform::MKLDNNMemDesc(
         dst_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
 
@@ -673,8 +475,9 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
         std::make_shared<mkldnn::convolution_backward_data::primitive_desc>(
             conv_bwd_data_desc, mkldnn_engine, *conv_pd);
 
-    ConvMKLDNNHandler handler(conv_pd, conv_bwd_data_pd, conv_bwd_weights_pd,
-                              dev_ctx, mkldnn_engine, key);
+    platform::ConvMKLDNNHandler handler(conv_pd, conv_bwd_data_pd,
+                                        conv_bwd_weights_pd, dev_ctx,
+                                        mkldnn_engine, key);
 
     // create mkldnn memory from input tensors (data/weights)
     auto user_src_memory_p =
@@ -743,8 +546,22 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
 
 namespace ops = paddle::operators;
 
-REGISTER_OP_KERNEL(conv2d, MKLDNN, ::paddle::platform::CPUPlace,
-                   ops::ConvMKLDNNOpKernel<float>);
-
-REGISTER_OP_KERNEL(conv2d_grad, MKLDNN, ::paddle::platform::CPUPlace,
-                   ops::ConvMKLDNNGradOpKernel<float>);
+REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN,
+                                    ::paddle::platform::CPUPlace, FP32,
+                                    ops::kConvMKLDNNFP32,
+                                    ops::ConvMKLDNNOpKernel<float>);
+
+REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d_grad, MKLDNN,
+                                    ::paddle::platform::CPUPlace, FP32,
+                                    ops::kConvMKLDNNFP32,
+                                    ops::ConvMKLDNNGradOpKernel<float>);
+
+REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv3d, MKLDNN,
+                                    ::paddle::platform::CPUPlace, FP32,
+                                    ops::kConvMKLDNNFP32,
+                                    ops::ConvMKLDNNOpKernel<float>);
+
+REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv3d_grad, MKLDNN,
+                                    ::paddle::platform::CPUPlace, FP32,
+                                    ops::kConvMKLDNNFP32,
+                                    ops::ConvMKLDNNGradOpKernel<float>);
diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc
index 342525be49e28f1785e25d4daad38c3c81b4774f..d7b876628855b8b76b340cd1e6115896ead4aa6c 100644
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -74,6 +74,8 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
 
 framework::OpKernelType ConvOp::GetExpectedKernelType(
     const framework::ExecutionContext& ctx) const {
+  int customized_type_value =
+      framework::OpKernelType::kDefaultCustomizedTypeValue;
   framework::LibraryType library{framework::LibraryType::kPlain};
   // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
   std::string data_format = ctx.Attr<std::string>("data_format");
@@ -89,6 +91,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
       platform::CanMKLDNNBeUsed(ctx)) {
     library = framework::LibraryType::kMKLDNN;
     layout = framework::DataLayout::kMKLDNN;
+    customized_type_value = kConvMKLDNNFP32;
   }
 #endif
 
@@ -105,7 +108,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
   }
 
   return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
-                                 library);
+                                 library, customized_type_value);
 }
 
 void Conv2DOpMaker::Make() {
@@ -131,14 +134,14 @@ void Conv2DOpMaker::Make() {
            "The format of output tensor is X (one-dimensional) of size equal"
            "to the number of output channels. Only used with MKL-DNN.")
       .AsDispensable();
-  AddOutput("Output",
-            "(Tensor) The output tensor of convolution operator. "
-            "The format of output tensor is also NCHW.");
   AddInput("ResidualData",
            "(Tensor) Tensor with residual data "
            "to which convolution output will be added."
            "Used with fuse_residual_connection fusion.")
       .AsDispensable();
+  AddOutput("Output",
+            "(Tensor) The output tensor of convolution operator. "
+            "The format of output tensor is also NCHW.");
   AddAttr<std::vector<int>>("strides",
                             "(vector<int> default:{1, 1}), the "
                             "strides(h_stride, w_stride) of "
@@ -229,6 +232,10 @@ $$
 }
 
 void Conv3DOpMaker::Make() {
+  AddAttr<bool>("is_test",
+                "(bool, default false) Set to true for inference only, false "
+                "for training. Some layers may run faster when this is true.")
+      .SetDefault(false);
   AddInput(
       "Input",
       "(Tensor) The input tensor of convolution operator. "
@@ -244,6 +251,11 @@ void Conv3DOpMaker::Make() {
            "is the width of the filter."
            "If the groups attribute is greater than 1, C equals the number of "
            "input image channels divided by the groups.");
+  AddInput("ResidualData",
+           "(Tensor) Tensor with residual data "
+           "to which convolution output will be added."
+           "Used with fuse_residual_connection fusion.")
+      .AsDispensable();
   AddOutput("Output",
             "(Tensor) The output tensor of convolution operator."
             "The format of output tensor is also NCDHW.");
@@ -277,6 +289,13 @@ void Conv3DOpMaker::Make() {
   AddAttr<bool>("use_mkldnn",
                 "(bool, default false) Only used in mkldnn kernel")
       .SetDefault(false);
+  AddAttr<bool>("fuse_relu", "(bool, default false) Only used in mkldnn kernel")
+      .SetDefault(false);
+  AddAttr<bool>("fuse_residual_connection",
+                "(bool, default false) Only used in mkldnn kernel. Used "
+                "whenever convolution output is as an input to residual "
+                "connection.")
+      .SetDefault(false);
   AddAttr<std::string>(
       "data_format",
       "(string, default NCHW) Only used in "
@@ -342,6 +361,8 @@ void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const {
 
 framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
     const framework::ExecutionContext& ctx) const {
+  int customized_type_value =
+      framework::OpKernelType::kDefaultCustomizedTypeValue;
   framework::LibraryType library_{framework::LibraryType::kPlain};
   // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
   std::string data_format = ctx.Attr<std::string>("data_format");
@@ -357,12 +378,13 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
       platform::CanMKLDNNBeUsed(ctx)) {
     library_ = framework::LibraryType::kMKLDNN;
     layout_ = framework::DataLayout::kMKLDNN;
+    customized_type_value = kConvMKLDNNFP32;
   }
 #endif
 
   return framework::OpKernelType(
       framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
-      layout_, library_);
+      layout_, library_, customized_type_value);
 }
 
 }  // namespace operators
diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h
index e69814001e4da5d10e51ee57c1dbe291338b8b49..249f308c13ff5636fbaa6747b28cab7886b7e736 100644
--- a/paddle/fluid/operators/conv_op.h
+++ b/paddle/fluid/operators/conv_op.h
@@ -27,6 +27,8 @@ namespace paddle {
 namespace operators {
 
 using Tensor = framework::Tensor;
+constexpr int kConvMKLDNNFP32 = 1;
+constexpr int kConvMKLDNNINT8 = 2;
 
 // Base convolution operator definations for other conv
 // like operators to reuse the implementation.
diff --git a/paddle/fluid/operators/conv_transpose_mkldnn_op.cc b/paddle/fluid/operators/conv_transpose_mkldnn_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..317d4cebe26b81ff03c212e6328233d5152ed1b4
--- /dev/null
+++ b/paddle/fluid/operators/conv_transpose_mkldnn_op.cc
@@ -0,0 +1,299 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/fluid/framework/data_layout_transform.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/memory/malloc.h"
+#include "paddle/fluid/platform/mkldnn_reuse.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using framework::DataLayout;
+
+template <typename T>
+class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
+                   "It must use CPUPlace.");
+
+    const bool is_test = ctx.Attr<bool>("is_test");
+    PADDLE_ENFORCE(
+        is_test == true,
+        "ConvTransposeMKLDNN works only for inference!. Set is_test = True");
+
+    auto& dev_ctx =
+        ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
+    const auto& mkldnn_engine = dev_ctx.GetEngine();
+
+    auto* input = ctx.Input<Tensor>("Input");
+    auto* filter = ctx.Input<Tensor>("Filter");
+    auto* bias = ctx.HasInput("Bias") ? ctx.Input<Tensor>("Bias") : nullptr;
+    auto* output = ctx.Output<Tensor>("Output");
+
+    PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN &&
+                       input->format() != mkldnn::memory::format::format_undef,
+                   "Wrong layout/format set for Input tensor");
+    PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN &&
+                       filter->format() != mkldnn::memory::format::format_undef,
+                   "Wrong layout/format set for Filter tensor");
+    PADDLE_ENFORCE(input->dims().size() == 4,
+                   "Input must be with 4 dimensions, i.e. NCHW");
+    PADDLE_ENFORCE(filter->dims().size() == 4,
+                   "Filter must be with 4 dimensions, i.e. OIHW");
+
+    if (bias) {
+      PADDLE_ENFORCE(bias->layout() == DataLayout::kMKLDNN &&
+                         bias->format() != mkldnn::memory::format::format_undef,
+                     "Wrong layout/format set for Bias tensor");
+      PADDLE_ENFORCE(bias->dims().size() == 1,
+                     "Bias must only have 1 dimension, i.e. X");
+    }
+
+    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
+    std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
+    int groups = ctx.Attr<int>("groups");
+
+    // TODO(tpatejko): add support for dilation
+    PADDLE_ENFORCE(
+        dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1,
+        "dilation in convolution is not implemented yet");
+
+    const T* input_data = input->data<T>();
+    const T* filter_data = filter->data<T>();
+
+    std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
+    std::vector<int> iohw_weights_tz =
+        paddle::framework::vectorize2int(filter->dims());
+    std::vector<int> weights_tz = iohw_weights_tz;
+    // IOHW -> OIHW
+    weights_tz[0] = iohw_weights_tz[1];
+    weights_tz[1] = iohw_weights_tz[0];
+
+    // Custom Reorder from IOHW to OIHW
+    auto iohw2oihw_reorder =
+        [&iohw_weights_tz](const T* filter_data) -> std::shared_ptr<T> {
+      int o = iohw_weights_tz[1];
+      int c = iohw_weights_tz[0];
+      int h = iohw_weights_tz[2];
+      int w = iohw_weights_tz[3];
+      std::shared_ptr<T> reordered_filter_data(new T[o * c * h * w](),
+                                               std::default_delete<T[]>());
+      for (int i = 0; i < c; ++i) {
+        for (int j = 0; j < o; ++j) {
+          int in_offset = j * h * w + i * o * h * w;
+          int out_offset = j * c * h * w + i * h * w;
+          std::memcpy(&(reordered_filter_data.get())[out_offset],
+                      &filter_data[in_offset], h * w * sizeof(T));
+        }
+      }
+
+      return reordered_filter_data;
+    };
+
+    int g = std::max(groups, 1);
+    if (g > 1) {
+      int o = weights_tz[0];
+      int i = weights_tz[1];
+      int h = weights_tz[2];
+      int w = weights_tz[3];
+      weights_tz.resize(5);
+      weights_tz[0] = g;
+      weights_tz[1] = o / g;
+      weights_tz[2] = i;
+      weights_tz[3] = h;
+      weights_tz[4] = w;
+    }
+    std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
+
+    // Get unique name for storing MKLDNN primitives
+    const std::string key = platform::ConvTransposeMKLDNNHandler::GetHash(
+        src_tz, weights_tz, strides, paddings, dilations, groups,
+        ctx.op().Output("Output"));
+    const std::string key_conv_transpose_pd = key + "@conv_transpose_pd";
+
+    std::vector<mkldnn::primitive> pipeline;
+
+    auto user_src_md = platform::MKLDNNMemDesc(
+        {src_tz}, platform::MKLDNNGetDataType<T>(), input->format());
+    auto user_weights_md =
+        platform::MKLDNNMemDesc({weights_tz}, platform::MKLDNNGetDataType<T>(),
+                                (g == 1) ? mkldnn::memory::format::oihw
+                                         : mkldnn::memory::format::goihw);
+
+    /* create memory descriptor for convolution without specified format
+     * ('any') which lets a primitive (convolution in this case) choose
+     * the memory format preferred for best performance
+     */
+    std::string data_format = ctx.Attr<std::string>("data_format");
+    auto chosen_memory_format =
+        platform::data_format_to_memory_format(data_format);
+    bool fuse_relu = ctx.Attr<bool>("fuse_relu");
+
+    auto src_md = platform::MKLDNNMemDesc(
+        src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
+    auto weights_md = platform::MKLDNNMemDesc(
+        weights_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
+    std::vector<int> bias_tz;  // TODO(mgallus): avoid empty vector creation.
+                               // Currently used whenever bias is != nullptr.
+    auto dst_md = platform::MKLDNNMemDesc(
+        dst_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
+
+    // create a deconv(conv transpose) primitive descriptor and save it for
+    // usage in backward
+    std::shared_ptr<mkldnn::deconvolution_forward::primitive_desc>
+        conv_transpose_pd;
+    auto fwd_prop_kind = is_test ? mkldnn::prop_kind::forward_inference
+                                 : mkldnn::prop_kind::forward_training;
+    if (bias) {
+      bias_tz = paddle::framework::vectorize2int(bias->dims());
+      auto bias_md = platform::MKLDNNMemDesc(
+          bias_tz, platform::MKLDNNGetDataType<T>(), mkldnn::memory::format::x);
+      conv_transpose_pd = ConvTransposeFwdPrimitiveDesc(
+          src_md, weights_md, bias_md, dst_md, strides, paddings, mkldnn_engine,
+          fuse_relu, fwd_prop_kind);
+    } else {
+      conv_transpose_pd = ConvTransposeFwdPrimitiveDesc(
+          src_md, weights_md, dst_md, strides, paddings, mkldnn_engine,
+          fuse_relu, fwd_prop_kind);
+    }
+    // Save conv_pd/src_memory/weights_memory for backward pass
+    if (!is_test) dev_ctx.SetBlob(key_conv_transpose_pd, conv_transpose_pd);
+
+    platform::ConvTransposeMKLDNNHandler handler(conv_transpose_pd, dev_ctx,
+                                                 mkldnn_engine, key);
+
+    // create mkldnn memory from input tensors (data/weights)
+    auto user_src_memory_p = handler.AcquireSrcMemory(
+        user_src_md, platform::to_void_cast<T>(input_data));
+    auto user_weights_memory_p = handler.AcquireWeightsMemory(
+        user_weights_md, platform::to_void_cast<T>(filter_data),
+        is_test ? iohw2oihw_reorder : platform::user_function());
+
+    // create reorder primitive if the input format is not the preferred one
+    auto src_memory_p =
+        handler.AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline);
+    auto weights_memory_p = handler.AcquireWeightsMemoryFromPrimitive(
+        user_weights_memory_p, pipeline, is_test);
+
+    std::shared_ptr<mkldnn::memory> dst_memory_p;
+
+    auto output_data = output->mutable_data<T>(
+        ctx.GetPlace(), paddle::memory::Allocator::kDefault,
+        handler.GetDstMemorySize());
+    dst_memory_p = handler.AcquireDstMemoryFromPrimitive(
+        platform::to_void_cast<T>(output_data));
+
+    // create convolution op primitive
+    std::shared_ptr<mkldnn::deconvolution_forward> conv_p;
+    if (bias) {
+      const T* bias_data = bias->data<T>();
+      auto user_bias_md =
+          platform::MKLDNNMemDesc({bias_tz}, platform::MKLDNNGetDataType<T>(),
+                                  mkldnn::memory::format::x);
+      auto user_bias_memory_p = handler.AcquireBiasMemory(
+          user_bias_md, platform::to_void_cast<T>(bias_data));
+
+      auto bias_memory_p =
+          handler.AcquireBiasMemoryFromPrimitive(user_bias_memory_p, pipeline);
+      conv_p = handler.AcquireConvolution(src_memory_p, weights_memory_p,
+                                          bias_memory_p, dst_memory_p);
+    } else {
+      conv_p = handler.AcquireConvolution(src_memory_p, weights_memory_p,
+                                          dst_memory_p);
+    }
+
+    // push primitive to stream and wait until it's executed
+    pipeline.push_back(*conv_p);
+    mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
+
+    output->set_layout(DataLayout::kMKLDNN);
+    output->set_format(platform::GetMKLDNNFormat(*dst_memory_p));
+  }
+
+ private:
+  mkldnn::primitive_attr CreatePostOps(bool fuse_relu) const {
+    mkldnn::primitive_attr conv_attr;
+    mkldnn::post_ops post_operations;
+    // Fusion with ReLU layer is executed through the PostOps feature. Create a
+    // PostOps object and configure it to execute an eltwise relu operation.
+    if (fuse_relu) {
+      constexpr float scale = 1.0f;
+      constexpr float negative_slope = 0.0f;
+      constexpr float placeholder = 0.0f;
+      post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu,
+                                     negative_slope, placeholder);
+    }
+    conv_attr.set_post_ops(post_operations);
+    return conv_attr;
+  }
+
+  std::unique_ptr<mkldnn::deconvolution_forward::primitive_desc>
+  ConvTransposeFwdPrimitiveDesc(
+      const mkldnn::memory::desc& src, const mkldnn::memory::desc& weights,
+      const mkldnn::memory::desc& dst, const std::vector<int>& strides,
+      const std::vector<int>& paddings, const mkldnn::engine& engine,
+      const bool fuse_relu, mkldnn::prop_kind fwd_prop_kind) const {
+    mkldnn::memory::dims stride_dims = {strides[0], strides[1]};
+    mkldnn::memory::dims padding_dims = {paddings[0], paddings[1]};
+
+    auto deconv_desc = mkldnn::deconvolution_forward::desc(
+        fwd_prop_kind, mkldnn::deconvolution_direct, src, weights, dst,
+        stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero);
+
+    mkldnn::primitive_attr deconv_attr = CreatePostOps(fuse_relu);
+
+    auto p_conv_transpose_pd =
+        new mkldnn::deconvolution_forward::primitive_desc(deconv_desc,
+                                                          deconv_attr, engine);
+
+    return std::unique_ptr<mkldnn::deconvolution_forward::primitive_desc>(
+        p_conv_transpose_pd);
+  }
+
+  std::unique_ptr<mkldnn::deconvolution_forward::primitive_desc>
+  ConvTransposeFwdPrimitiveDesc(
+      const mkldnn::memory::desc& src, const mkldnn::memory::desc& weights,
+      const mkldnn::memory::desc& bias, const mkldnn::memory::desc& dst,
+      const std::vector<int>& strides, const std::vector<int>& paddings,
+      const mkldnn::engine& engine, const bool fuse_relu,
+      mkldnn::prop_kind fwd_prop_kind) const {
+    mkldnn::memory::dims stride_dims = {strides[0], strides[1]};
+    mkldnn::memory::dims padding_dims = {paddings[0], paddings[1]};
+
+    auto deconv_desc = mkldnn::deconvolution_forward::desc(
+        fwd_prop_kind, mkldnn::deconvolution_direct, src, weights, bias, dst,
+        stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero);
+
+    mkldnn::primitive_attr deconv_attr = CreatePostOps(fuse_relu);
+
+    auto p_conv_transpose_pd =
+        new mkldnn::deconvolution_forward::primitive_desc(deconv_desc,
+                                                          deconv_attr, engine);
+
+    return std::unique_ptr<mkldnn::deconvolution_forward::primitive_desc>(
+        p_conv_transpose_pd);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_KERNEL(conv2d_transpose, MKLDNN, ::paddle::platform::CPUPlace,
+                   ops::ConvTransposeMKLDNNOpKernel<float>);
diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc
index a916dd3496ffaffa138529a8a2f7e20ef26fcc96..2fdfc40d194224f0328161f5689da6246b1aae7f 100644
--- a/paddle/fluid/operators/conv_transpose_op.cc
+++ b/paddle/fluid/operators/conv_transpose_op.cc
@@ -16,6 +16,10 @@ limitations under the License. */
 #include <string>
 #include <vector>
 
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
+
 namespace paddle {
 namespace operators {
 
@@ -78,29 +82,38 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
 
 framework::OpKernelType ConvTransposeOp::GetExpectedKernelType(
     const framework::ExecutionContext& ctx) const {
+  framework::LibraryType library_{framework::LibraryType::kPlain};
+  std::string data_format = ctx.Attr<std::string>("data_format");
+  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
   bool use_cudnn = ctx.Attr<bool>("use_cudnn");
   use_cudnn &= platform::is_gpu_place(ctx.GetPlace());
 #ifdef PADDLE_WITH_CUDA
   if (platform::is_gpu_place(ctx.GetPlace())) {
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
+    if (use_cudnn) {
+      library_ = framework::LibraryType::kCUDNN;
+    }
   }
 #endif
-  framework::LibraryType library_;
-  if (use_cudnn) {
-    library_ = framework::LibraryType::kCUDNN;
-  } else {
-    library_ = framework::LibraryType::kPlain;
+#ifdef PADDLE_WITH_MKLDNN
+  if (library_ == framework::LibraryType::kPlain &&
+      platform::CanMKLDNNBeUsed(ctx)) {
+    library_ = framework::LibraryType::kMKLDNN;
+    layout_ = framework::DataLayout::kMKLDNN;
   }
+#endif
 
-  std::string data_format = ctx.Attr<std::string>("data_format");
-  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
   return framework::OpKernelType(
       framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
       layout_, library_);
 }
 
 void Conv2DTransposeOpMaker::Make() {
+  AddAttr<bool>("is_test",
+                "(bool, default false) Set to true for inference only, false "
+                "for training. Some layers may run faster when this is true.")
+      .SetDefault(false);
   AddInput(
       "Input",
       "(Tensor) The input tensor of convolution transpose operator. "
@@ -145,6 +158,11 @@ void Conv2DTransposeOpMaker::Make() {
       "use_cudnn",
       "(bool, default false) Only used in cudnn kernel, need install cudnn")
       .SetDefault(false);
+  AddAttr<bool>("use_mkldnn",
+                "(bool, default false) Only used in mkldnn kernel")
+      .SetDefault(false);
+  AddAttr<bool>("fuse_relu", "(bool, default false) Only used in mkldnn kernel")
+      .SetDefault(false);
   AddAttr<std::string>(
       "data_format",
       "(string, default NCHW) Only used in "
@@ -238,6 +256,9 @@ void Conv3DTransposeOpMaker::Make() {
       "use_cudnn",
       "(bool, default false) Only used in cudnn kernel, need install cudnn")
       .SetDefault(false);
+  AddAttr<bool>("use_mkldnn",
+                "(bool, default false) Only used in mkldnn kernel")
+      .SetDefault(false);
   AddAttr<std::string>(
       "data_format",
       "(string, default NCHW) Only used in "
diff --git a/paddle/fluid/operators/cos_sim_op.cu b/paddle/fluid/operators/cos_sim_op.cu
index 82205e9c75402e368a2d1e161d471e35ff7356ea..3d144ca29d9989ad2cbb438a950860eaac873d07 100644
--- a/paddle/fluid/operators/cos_sim_op.cu
+++ b/paddle/fluid/operators/cos_sim_op.cu
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/cos_sim_op.h"
 
 namespace ops = paddle::operators;
diff --git a/paddle/fluid/operators/crop_op.cu b/paddle/fluid/operators/crop_op.cu
index b75678217e36aa2297c68a7f8e2a9dfafadaca72..66cb5c452de4b2107693127ce414daf9fb7cd7d8 100644
--- a/paddle/fluid/operators/crop_op.cu
+++ b/paddle/fluid/operators/crop_op.cu
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/crop_op.h"
 
 namespace ops = paddle::operators;
diff --git a/paddle/fluid/operators/cudnn_lstm_op.cc b/paddle/fluid/operators/cudnn_lstm_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e63d57be57a66e8e02f7ef88acd01246302bc53c
--- /dev/null
+++ b/paddle/fluid/operators/cudnn_lstm_op.cc
@@ -0,0 +1,218 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <string>
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+class CudnnLSTMOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"),
+                   "Input(Input) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("W"),
+                   "Input(Weight) of LSTM should not be null.");
+
+    PADDLE_ENFORCE(ctx->HasInput("InitH"),
+                   "Input(init_h) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("InitC"),
+                   "Input(init_c) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Cache"),
+                   "Input(Cache) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("last_h"),
+                   "Output(last_h) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("last_c"),
+                   "Output(last_c) of LSTM should not be null.");
+
+    auto in_dims = ctx->GetInputDim("Input");
+    PADDLE_ENFORCE_EQ(in_dims.size(), 3, "Input(X)'s rank must be 3.");
+
+    ctx->SetOutputDim("Out", ctx->GetInputDim("Input"));
+    ctx->SetOutputDim("last_h", ctx->GetInputDim("InitH"));
+    ctx->SetOutputDim("last_c", ctx->GetInputDim("InitC"));
+  }
+};
+
+class CudnnLSTMOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput(
+        "Input",
+        "(Tensor) RNN input tensor, which support variable-time length input "
+        "sequence."
+        "The shape of the Tensor MUST be ( seq_len * batch_size * input_size)"
+        "seq_len is the total time step in this mini-batch (CAN be change in "
+        "different batch)"
+        "batch_size is the instance number of this batch"
+        "input_size is the hidden size of the input."
+        "input_hidden_size and the hidden_size in the next may not be same");
+    AddInput("InitH",
+             "(Tensor) the initial hidden state of the LSTM"
+             "input. This is a tensor with shape (num_layers x batch_size x "
+             "hidden_size)"
+             "and When is_bidirec is True, the shape will be (num_layers*2 x "
+             "batch_size x hidden_size)");
+    AddInput("InitC",
+             "(Tensor) the initial cell state of the LSTm "
+             "input. This is a tensor with shape (num_layers x batch_size x "
+             "hidden_size)"
+             "and When is_bidirec is True, the shape will be (num_layers*2 x "
+             "batch_size x hidden_size)");
+    AddInput("W",
+             "(Tensor) the learnable hidden-hidden weights."
+             " The shape is (N), where N is total weight size of the LSTM. "
+             " cudnn concatenate all the weight to one Tensor");
+    AddInput("Cache",
+             "The cache of dropout op, a RAW type variable including random "
+             "number generator states and some descriptors, which is used in "
+             "cudnn kernel.")
+        .AsDispensable();
+    AddOutput("Out",
+              "(Tensor) the hidden state of LSTM operator. "
+              "The shape is ( seq_len x batch_size x hidden_size) if "
+              "is_bidirec is False"
+              "and When is_bidirec is True, the shape will be ( seq_len x "
+              "batch_size x hidden_size * 2) ");
+    AddOutput("last_h",
+              "(Tensor) the hidden state of the last step. "
+              "The shape is ( num_layers x batch_size x hidden_size) if "
+              "is_bidirec is False"
+              "and When is_bidirec is True, the shape will be (num_layers*2 x "
+              "batch_size x hidden_size)");
+    AddOutput("last_c",
+              "(Tensor) the cell state of the last step"
+              "The shape is ( num_layers x batch_size x hidden_size) if "
+              "is_bidirec is False"
+              "and When is_bidirect is True, the shape will be (num_layers*2 x "
+              "batch_size x hidden_size*2)");
+    AddAttr<int>("max_len",
+                 "max length of the LSTM op"
+                 "the first dim of the Input can NOT be greater than max_len")
+        .SetDefault(20);
+    AddAttr<float>(
+        "dropout_prob",
+        "dropout prob of the dropout op"
+        "the dropout ONLY work between lstm layers, not between time steps"
+        "There is no dropout work on the Out tensor")
+        .SetDefault(0.0);
+    AddAttr<bool>("is_bidirec",
+                  "is_bidirec"
+                  "if it is bidirection rnn"
+                  "The will affect the shape of the Out, last_h, and last_c")
+        .SetDefault(false);
+    AddAttr<int>("input_size", "input size ot the Input Tensor").SetDefault(10);
+    AddAttr<int>("hidden_size", "hidden size of the LSTM").SetDefault(100);
+    AddAttr<int>("num_layers", "the total layer number of the LSTM")
+        .SetDefault(1);
+    AddAttr<bool>("is_test", "True if in test phase.").SetDefault(false);
+    AddAttr<int>("seed", "seed to used if fix_seed is True").SetDefault(-1);
+    AddComment(R"DOC(
+CUDNN LSTM implementation
+
+A four-gate Long Short-Term Memory network with no peephole connections.
+In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1, 
+the cell input ct-1 and the previous layer input xt given matrices W, R and biases bW, bR from the following equations:
+
+$$ i_t = sigmoid(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i) $$
+
+$$ f_t = sigmoid(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f) $$
+
+$$ o_t = sigmoid(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o) $$
+
+$$ \\tilde{c_t} = tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c) $$
+
+$$ c_t = f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t} $$
+
+$$ h_t = o_t \\odot tanh(c_t) $$
+
+- W terms denote weight matrices (e.g. $W_{ix}$ is the matrix
+  of weights from the input gate to the input)
+- The b terms denote bias vectors ($bx_i$ and $bh_i$ are the input gate bias vector).
+- sigmoid is the logistic sigmoid function.
+- $i, f, o$ and $c$ are the input gate, forget gate, output gate,
+  and cell activation vectors, respectively, all of which have the same size as
+  the cell output activation vector $h$.
+- The $\odot$ is the element-wise product of the vectors.
+- `tanh` is the activation functions.
+- $\tilde{c_t}$ is also called candidate hidden state,
+  which is computed based on the current input and the previous hidden state.
+
+Where sigmoid is the sigmoid operator: sigmoid(x) = 1 / (1 + e^-x), * represents a point-wise multiplication, 
+X represensts a matrix multiplication
+
+
+)DOC");
+  }
+};
+
+class CudnnLSTMGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"),
+                   "Input(Input) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("W"), "Input(W) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("last_h"),
+                   "Input(last_h) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("last_c"),
+                   "Input(last_c) of LSTM should not be null.");
+
+    PADDLE_ENFORCE(ctx->HasInput("Cache"),
+                   "Input(last_c) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("InitH"),
+                   "Input(init_h) of LSTM should not be null.");
+
+    PADDLE_ENFORCE(ctx->HasInput("InitC"),
+                   "Input(init_c) of LSTM should not be null.");
+
+    auto SetOutGradDim = [&ctx](const std::string& name) {
+      auto g_name = framework::GradVarName(name);
+      if (ctx->HasOutput(g_name)) {
+        ctx->SetOutputDim(g_name, ctx->GetInputDim(name));
+      }
+    };
+
+    SetOutGradDim("Input");
+    SetOutGradDim("W");
+    SetOutGradDim("InitH");
+    SetOutGradDim("InitC");
+  }
+};
+
+template <typename T>
+class NotImpleKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_THROW(
+        "CPU is not support for this kernel now. Will be add in the future");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(cudnn_lstm, ops::CudnnLSTMOp, ops::CudnnLSTMOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(cudnn_lstm_grad, ops::CudnnLSTMGradOp);
+
+REGISTER_OP_CPU_KERNEL(cudnn_lstm, ops::NotImpleKernel<float>);
+REGISTER_OP_CPU_KERNEL(cudnn_lstm_grad, ops::NotImpleKernel<float>);
diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..dd64cc327fc383937bc9a9d6e7daa0cec488e4cc
--- /dev/null
+++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc
@@ -0,0 +1,493 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/platform/cudnn_helper.h"
+
+namespace paddle {
+namespace operators {
+
+using LoDTensor = framework::LoDTensor;
+using Tensor = framework::Tensor;
+
+struct CudnnRNNCache {
+  CudnnRNNCache() {
+    x_desc_ = NULL;
+    y_desc_ = NULL;
+    dx_desc_ = NULL;
+    dy_desc_ = NULL;
+  }
+  ~CudnnRNNCache() { release(); }
+
+  cudnnRNNDescriptor_t rnn_desc_;
+  cudnnTensorDescriptor_t *x_desc_;
+  cudnnTensorDescriptor_t *y_desc_;
+  cudnnTensorDescriptor_t *dx_desc_;
+  cudnnTensorDescriptor_t *dy_desc_;
+
+  cudnnTensorDescriptor_t hx_desc_;
+  cudnnTensorDescriptor_t cx_desc_;
+  cudnnTensorDescriptor_t hy_desc_;
+  cudnnTensorDescriptor_t cy_desc_;
+
+  cudnnTensorDescriptor_t dhx_desc_;
+  cudnnTensorDescriptor_t dcx_desc_;
+  cudnnTensorDescriptor_t dhy_desc_;
+  cudnnTensorDescriptor_t dcy_desc_;
+
+  cudnnTensorDescriptor_t output_x_desc_;
+  cudnnTensorDescriptor_t output_y_desc_;
+
+  cudnnDropoutDescriptor_t dropout_desc_;
+
+  size_t weights_size_;
+  cudnnFilterDescriptor_t w_desc_;
+  cudnnFilterDescriptor_t dw_desc_;
+
+  size_t workspace_size_;
+  size_t reserve_size_;
+  Tensor reserve_data_;
+  Tensor workspace_data_;
+
+  Tensor dropout_state_;
+
+  size_t max_length_;
+
+  float dropout_prob_;
+  bool is_bidirec_;
+
+  int batch_size_;
+  int input_size_;
+  int hidden_size_;
+  int num_layers_;
+  int seed_;
+
+  void init(cudnnHandle_t handle, const framework::ExecutionContext &ctx,
+            size_t max_len, int batch_size, int input_size, int hidden_size,
+            int num_layers, float dropout_prob, bool is_bidirec, int seed,
+            int weight_numel) {
+    max_length_ = max_len;
+    batch_size_ = batch_size;
+    input_size_ = input_size;
+    hidden_size_ = hidden_size;
+    num_layers_ = num_layers;
+    dropout_prob_ = dropout_prob;
+    is_bidirec_ = is_bidirec;
+    seed_ = seed;
+
+    x_desc_ = new cudnnTensorDescriptor_t[max_length_];
+    y_desc_ = new cudnnTensorDescriptor_t[max_length_];
+    dx_desc_ = new cudnnTensorDescriptor_t[max_length_];
+    dy_desc_ = new cudnnTensorDescriptor_t[max_length_];
+    int dim_a[3];
+    int stride_a[3];
+
+    for (size_t i = 0; i < max_length_; ++i) {
+      CUDNN_ENFORCE(
+          platform::dynload::cudnnCreateTensorDescriptor(&x_desc_[i]));
+      CUDNN_ENFORCE(
+          platform::dynload::cudnnCreateTensorDescriptor(&y_desc_[i]));
+      CUDNN_ENFORCE(
+          platform::dynload::cudnnCreateTensorDescriptor(&dx_desc_[i]));
+      CUDNN_ENFORCE(
+          platform::dynload::cudnnCreateTensorDescriptor(&dy_desc_[i]));
+      dim_a[0] = batch_size_;
+      dim_a[1] = input_size_;
+      dim_a[2] = 1;
+
+      stride_a[0] = dim_a[2] * dim_a[1];
+      stride_a[1] = dim_a[2];
+      stride_a[2] = 1;
+      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+          x_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
+      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+          dx_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
+
+      dim_a[0] = batch_size_;
+      dim_a[1] = is_bidirec_ ? hidden_size_ * 2 : hidden_size_;
+      dim_a[2] = 1;
+
+      stride_a[0] = dim_a[2] * dim_a[1];
+      stride_a[1] = dim_a[2];
+      stride_a[2] = 1;
+
+      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+          y_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
+      CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+          dy_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
+    }
+
+    dim_a[0] = num_layers_ * (is_bidirec_ ? 2 : 1);
+    dim_a[1] = batch_size_;
+    dim_a[2] = hidden_size_;
+
+    stride_a[0] = dim_a[2] * dim_a[1];
+    stride_a[1] = dim_a[2];
+    stride_a[2] = 1;
+
+    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hx_desc_));
+    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cx_desc_));
+    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hy_desc_));
+    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cy_desc_));
+    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhx_desc_));
+    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcx_desc_));
+    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhy_desc_));
+    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcy_desc_));
+
+    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+        hx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
+    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+        cx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
+    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+        hy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
+    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+        cy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
+    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+        dhx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
+    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+        dcx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
+    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+        dhy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
+    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+        dcy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
+
+    CUDNN_ENFORCE(
+        platform::dynload::cudnnCreateDropoutDescriptor(&dropout_desc_));
+
+    size_t state_size;
+    CUDNN_ENFORCE(
+        platform::dynload::cudnnDropoutGetStatesSize(handle, &state_size);
+        dropout_state_.Resize({static_cast<int64_t>(state_size)}));
+    auto *dropout_state_data =
+        dropout_state_.mutable_data<uint8_t>(ctx.GetPlace());
+    CUDNN_ENFORCE(platform::dynload::cudnnSetDropoutDescriptor(
+        dropout_desc_, handle, dropout_prob_, dropout_state_data, state_size,
+        seed_));
+
+    CUDNN_ENFORCE(platform::dynload::cudnnCreateRNNDescriptor(&rnn_desc_));
+
+#if CUDNN_VERSION >= 6000
+    CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor_v6(
+        handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_,
+        CUDNN_LINEAR_INPUT,
+        is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM,
+        CUDNN_RNN_ALGO_STANDARD, CUDNN_DATA_FLOAT));
+#else
+    CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor(
+        rnn_desc_, hidden_size_, num_layers_, dropout_desc_, CUDNN_LINEAR_INPUT,
+        is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM,
+        CUDNN_DATA_FLOAT));
+#endif
+
+    CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&w_desc_));
+    CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&dw_desc_));
+
+    CUDNN_ENFORCE(platform::dynload::cudnnGetRNNParamsSize(
+        handle, rnn_desc_, x_desc_[0], &weights_size_, CUDNN_DATA_FLOAT));
+
+    PADDLE_ENFORCE_EQ(weights_size_, sizeof(float) * weight_numel,
+                      "cudnn lstm weight size should be SAME");
+    int dim_w[3];
+    dim_w[0] = weights_size_ / sizeof(float);
+    dim_w[1] = 1;
+    dim_w[2] = 1;
+    CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor(
+        w_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w));
+    CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor(
+        dw_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w));
+
+    CUDNN_ENFORCE(platform::dynload::cudnnGetRNNWorkspaceSize(
+        handle, rnn_desc_, max_length_, x_desc_, &workspace_size_));
+    CUDNN_ENFORCE(platform::dynload::cudnnGetRNNTrainingReserveSize(
+        handle, rnn_desc_, max_length_, x_desc_, &reserve_size_));
+
+    reserve_data_.Resize({static_cast<int64_t>(reserve_size_)});
+    reserve_data_.mutable_data<uint8_t>(ctx.GetPlace());
+
+    workspace_data_.Resize({static_cast<int64_t>(workspace_size_)});
+    workspace_data_.mutable_data<uint8_t>(ctx.GetPlace());
+  }
+
+  void release() {
+    for (size_t i = 0; i < max_length_; ++i) {
+      CUDNN_ENFORCE(
+          platform::dynload::cudnnDestroyTensorDescriptor(x_desc_[i]));
+      CUDNN_ENFORCE(
+          platform::dynload::cudnnDestroyTensorDescriptor(y_desc_[i]));
+      CUDNN_ENFORCE(
+          platform::dynload::cudnnDestroyTensorDescriptor(dx_desc_[i]));
+      CUDNN_ENFORCE(
+          platform::dynload::cudnnDestroyTensorDescriptor(dy_desc_[i]));
+    }
+
+    delete[] x_desc_;
+    delete[] y_desc_;
+    delete[] dx_desc_;
+    delete[] dy_desc_;
+
+    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hx_desc_));
+    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cx_desc_));
+    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hy_desc_));
+    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cy_desc_));
+    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhx_desc_));
+    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcx_desc_));
+    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhy_desc_));
+    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcy_desc_));
+
+    CUDNN_ENFORCE(
+        platform::dynload::cudnnDestroyDropoutDescriptor(dropout_desc_));
+    CUDNN_ENFORCE(platform::dynload::cudnnDestroyRNNDescriptor(rnn_desc_));
+
+    CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(w_desc_));
+    CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(dw_desc_));
+  }
+};
+
+template <typename T>
+class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    const Tensor *x = ctx.Input<Tensor>("Input");
+    const Tensor *init_h = ctx.Input<Tensor>("InitH");
+    const Tensor *init_c = ctx.Input<Tensor>("InitC");
+
+    auto w = ctx.Input<Tensor>("W");
+
+    Tensor *out = ctx.Output<Tensor>("Out");
+    Tensor *last_h = ctx.Output<Tensor>("last_h");
+    Tensor *last_c = ctx.Output<Tensor>("last_c");
+
+    const T *x_data = x->data<T>();
+    const T *init_h_data = init_h->data<T>();
+    const T *init_c_data = init_c->data<T>();
+
+    const T *w_data = w->data<T>();
+
+    T *out_data = out->mutable_data<T>(ctx.GetPlace());
+    T *last_h_data = last_h->mutable_data<T>(ctx.GetPlace());
+    T *last_c_data = last_c->mutable_data<T>(ctx.GetPlace());
+
+    size_t max_len = ctx.Attr<int>("max_len");
+    float dropout_prob = ctx.Attr<float>("dropout_prob");
+    bool is_bidirec = ctx.Attr<bool>("is_bidirec");
+    int input_size = ctx.Attr<int>("input_size");
+    int hidden_size = ctx.Attr<int>("hidden_size");
+    int num_layers = ctx.Attr<int>("num_layers");
+    bool is_test = ctx.Attr<bool>("is_test");
+
+    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    auto handle = dev_ctx.cudnn_handle();
+    auto *cache_var = ctx.InputVar("Cache");
+    if (!cache_var) {
+      // The RAW type cache variable wouldn't be created and broadcasted on
+      // multi-devices before the first running.
+      // use parent scope to make cache persistable
+      auto *scope = const_cast<framework::Scope *>(ctx.scope().parent());
+      auto cache_var_name = ctx.Inputs("Cache")[0];
+      cache_var = scope->Var(cache_var_name);
+    }
+    CudnnRNNCache *cudnn_rnn_cache = nullptr;
+    if (cache_var->IsInitialized()) {
+      cudnn_rnn_cache = const_cast<framework::Variable *>(cache_var)
+                            ->GetMutable<CudnnRNNCache>();
+    } else {
+      cudnn_rnn_cache = const_cast<framework::Variable *>(cache_var)
+                            ->GetMutable<CudnnRNNCache>();
+      std::random_device rnd;
+      int seed = ctx.Attr<int>("seed");
+      if (seed == -1) {
+        seed = rnd();
+      }
+
+      auto input_w_numel = w->numel();
+      auto batch_size = x->dims()[1];
+      cudnn_rnn_cache->init(handle, ctx, max_len, batch_size, input_size,
+                            hidden_size, num_layers, dropout_prob, is_bidirec,
+                            seed, input_w_numel);
+    }
+
+    auto run_seq_len = x->dims()[0];
+
+    if (is_test) {
+      // for inference
+      CUDNN_ENFORCE(platform::dynload::cudnnRNNForwardInference(
+          handle, cudnn_rnn_cache->rnn_desc_, run_seq_len,
+          cudnn_rnn_cache->x_desc_, x_data, cudnn_rnn_cache->hx_desc_,
+          init_h_data, cudnn_rnn_cache->cx_desc_, init_c_data,
+          cudnn_rnn_cache->w_desc_, w_data, cudnn_rnn_cache->y_desc_, out_data,
+          cudnn_rnn_cache->hy_desc_, last_h_data, cudnn_rnn_cache->cy_desc_,
+          last_c_data, cudnn_rnn_cache->workspace_data_.data<uint8_t>(),
+          cudnn_rnn_cache->workspace_size_));
+    } else {
+      // for train
+      CUDNN_ENFORCE(platform::dynload::cudnnRNNForwardTraining(
+          handle, cudnn_rnn_cache->rnn_desc_, run_seq_len,
+          cudnn_rnn_cache->x_desc_, x_data, cudnn_rnn_cache->hx_desc_,
+          init_h_data, cudnn_rnn_cache->cx_desc_, init_c_data,
+          cudnn_rnn_cache->w_desc_, w_data, cudnn_rnn_cache->y_desc_, out_data,
+          cudnn_rnn_cache->hy_desc_, last_h_data, cudnn_rnn_cache->cy_desc_,
+          last_c_data, cudnn_rnn_cache->workspace_data_.data<uint8_t>(),
+          cudnn_rnn_cache->workspace_size_,
+          cudnn_rnn_cache->reserve_data_.data<uint8_t>(),
+          cudnn_rnn_cache->reserve_size_));
+    }
+  }
+};
+
+template <typename T>
+class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto *input = ctx.Input<Tensor>("Input");
+    auto *weight = ctx.Input<Tensor>("W");
+    auto *init_h = ctx.Input<Tensor>("InitH");
+    auto *init_c = ctx.Input<Tensor>("InitC");
+    // auto * last_h = ctx.Input<Tensor>("last_h");
+    // auto * last_c = ctx.Input<Tensor>("last_c");
+    auto *out = ctx.Input<Tensor>("Out");
+    auto *out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto *last_h_grad = ctx.Input<Tensor>(framework::GradVarName("last_h"));
+    auto *last_c_grad = ctx.Input<Tensor>(framework::GradVarName("last_c"));
+
+    // auto* init_h = ctx.Input<Tensor>("init_h");
+    // auto* init_c = ctx.Input<Tensor>("init_c");
+
+    auto *in_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
+    auto *weight_grad = ctx.Output<Tensor>(framework::GradVarName("W"));
+    auto *init_h_grad = ctx.Output<Tensor>(framework::GradVarName("InitH"));
+    auto *init_c_grad = ctx.Output<Tensor>(framework::GradVarName("InitC"));
+
+    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    auto handle = dev_ctx.cudnn_handle();
+    auto *cache_var = ctx.InputVar("Cache");
+    PADDLE_ENFORCE(cache_var->IsInitialized());
+    CudnnRNNCache *cudnn_rnn_cache =
+        const_cast<framework::Variable *>(cache_var)
+            ->GetMutable<CudnnRNNCache>();
+
+    auto input_dims = input->dims();
+    auto weight_dims = weight->dims();
+    auto init_h_dims = init_h->dims();
+    auto init_c_dims = init_c->dims();
+    in_grad->mutable_data<T>(ctx.GetPlace());
+    weight_grad->mutable_data<T>(ctx.GetPlace());
+    math::SetConstant<paddle::platform::CUDADeviceContext, T> zero;
+    zero(dev_ctx, in_grad, static_cast<T>(0.0));
+    zero(dev_ctx, weight_grad, static_cast<T>(0.0));
+
+    T *init_h_grad_data = NULL;
+    if (init_h_grad == nullptr) {
+      Tensor init_h_grad_temp;
+      init_h_grad_temp.mutable_data<T>(init_h_dims, ctx.GetPlace());
+      zero(dev_ctx, &init_h_grad_temp, static_cast<T>(0.0));
+
+      init_h_grad_data = init_h_grad_temp.data<T>();
+    } else {
+      init_h_grad->mutable_data<T>(init_h_dims, ctx.GetPlace());
+      zero(dev_ctx, init_h_grad, static_cast<T>(0.0));
+      init_h_grad_data = init_h_grad->data<T>();
+    }
+
+    T *init_c_grad_data = NULL;
+    if (init_c_grad == nullptr) {
+      Tensor init_c_grad_temp;
+      init_c_grad_temp.mutable_data<T>(init_c_dims, ctx.GetPlace());
+      zero(dev_ctx, &init_c_grad_temp, static_cast<T>(0.0));
+
+      init_c_grad_data = init_c_grad_temp.data<T>();
+    } else {
+      init_c_grad->mutable_data<T>(init_c_dims, ctx.GetPlace());
+      zero(dev_ctx, init_c_grad, static_cast<T>(0.0));
+      init_c_grad_data = init_c_grad->data<T>();
+    }
+
+    const T *last_h_grad_data = NULL;
+    if (last_h_grad == nullptr) {
+      Tensor last_h_grad_temp;
+      last_h_grad_temp.mutable_data<T>(init_h_dims, ctx.GetPlace());
+      zero(dev_ctx, &last_h_grad_temp, static_cast<T>(0.0));
+
+      last_h_grad_data = (const T *)last_h_grad_temp.data<T>();
+    } else {
+      last_h_grad_data = last_h_grad->data<T>();
+    }
+
+    const T *last_c_grad_data = NULL;
+    if (last_c_grad == nullptr) {
+      Tensor last_c_grad_temp;
+      last_c_grad_temp.mutable_data<T>(init_c_dims, ctx.GetPlace());
+      zero(dev_ctx, &last_c_grad_temp, static_cast<T>(0.0));
+
+      last_c_grad_data = (const T *)last_c_grad_temp.data<T>();
+    } else {
+      last_c_grad_data = last_c_grad->data<T>();
+    }
+
+    const T *out_grad_data = NULL;
+    if (out_grad == nullptr) {
+      Tensor out_grad_temp;
+      out_grad_temp.mutable_data<T>(out->dims(), ctx.GetPlace());
+      zero(dev_ctx, &out_grad_temp, static_cast<T>(0.0));
+
+      out_grad_data = (const T *)out_grad_temp.data<T>();
+    } else {
+      out_grad_data = out_grad->data<T>();
+    }
+
+    // zero( dev_ctx, last_h_grad, static_cast<T>(0.0));
+    // zero( dev_ctx, last_c_grad, static_cast<T>(0.0));
+
+    auto out_data = out->data<T>();
+    // auto out_grad_data = out_grad->data<T>();
+    auto weight_data = weight->data<T>();
+    auto init_h_data = init_h->data<T>();
+    auto init_c_data = init_c->data<T>();
+    auto in_grad_data = in_grad->data<T>();
+
+    auto work_data = cudnn_rnn_cache->workspace_data_.data<uint8_t>();
+    auto reserve_data = cudnn_rnn_cache->reserve_data_.data<uint8_t>();
+
+    auto run_seq_len = input_dims[0];
+    PADDLE_ENFORCE_LE((size_t)run_seq_len, cudnn_rnn_cache->max_length_,
+                      "cudnn running seq_len CAN not greater max_lengh");
+    CUDNN_ENFORCE(platform::dynload::cudnnRNNBackwardData(
+        handle, cudnn_rnn_cache->rnn_desc_, run_seq_len,
+        cudnn_rnn_cache->y_desc_, out_data, cudnn_rnn_cache->dy_desc_,
+        out_grad_data, cudnn_rnn_cache->dhy_desc_, last_h_grad_data,
+        cudnn_rnn_cache->dcy_desc_, last_c_grad_data, cudnn_rnn_cache->w_desc_,
+        weight_data, cudnn_rnn_cache->hx_desc_, init_h_data,
+        cudnn_rnn_cache->cx_desc_, init_c_data, cudnn_rnn_cache->dx_desc_,
+        in_grad_data, cudnn_rnn_cache->dhx_desc_, init_h_grad_data,
+        cudnn_rnn_cache->dcx_desc_, init_c_grad_data, work_data,
+        cudnn_rnn_cache->workspace_size_, reserve_data,
+        cudnn_rnn_cache->reserve_size_));
+
+    CUDNN_ENFORCE(platform::dynload::cudnnRNNBackwardWeights(
+        handle, cudnn_rnn_cache->rnn_desc_, run_seq_len,
+        cudnn_rnn_cache->x_desc_, input->data<T>(), cudnn_rnn_cache->hx_desc_,
+        init_h->data<T>(), cudnn_rnn_cache->y_desc_, out->data<T>(),
+        cudnn_rnn_cache->workspace_data_.data<uint8_t>(),
+        cudnn_rnn_cache->workspace_size_, cudnn_rnn_cache->dw_desc_,
+        weight_grad->data<T>(), cudnn_rnn_cache->reserve_data_.data<uint8_t>(),
+        cudnn_rnn_cache->reserve_size_));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(cudnn_lstm, ops::CudnnLSTMGPUKernel<float>);
+REGISTER_OP_CUDA_KERNEL(cudnn_lstm_grad, ops::CudnnLSTMGPUGradKernel<float>);
diff --git a/paddle/fluid/operators/detection/box_coder_op.h b/paddle/fluid/operators/detection/box_coder_op.h
index 5ed8520acddfa8fe2105a7c1615bcb3243cb130f..b2a2bcdce932032a761a1fc064fe622f7629f9bf 100644
--- a/paddle/fluid/operators/detection/box_coder_op.h
+++ b/paddle/fluid/operators/detection/box_coder_op.h
@@ -43,6 +43,9 @@ class BoxCoderKernel : public framework::OpKernel<T> {
     const T* prior_box_var_data = nullptr;
     if (prior_box_var) prior_box_var_data = prior_box_var->data<T>();
 
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for collapse(2)
+#endif
     for (int64_t i = 0; i < row; ++i) {
       for (int64_t j = 0; j < col; ++j) {
         T prior_box_width = prior_box_data[j * len + 2] -
@@ -96,6 +99,9 @@ class BoxCoderKernel : public framework::OpKernel<T> {
     const T* prior_box_var_data = nullptr;
     if (prior_box_var) prior_box_var_data = prior_box_var->data<T>();
 
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for collapse(2)
+#endif
     for (int64_t i = 0; i < row; ++i) {
       for (int64_t j = 0; j < col; ++j) {
         size_t offset = i * col * len + j * len;
diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt
index 21db93958a4a586c74a1e060f1f04b5af1dcd889..101dbe9c89616b7025337261469e2b1aa3e8bc76 100644
--- a/paddle/fluid/operators/distributed/CMakeLists.txt
+++ b/paddle/fluid/operators/distributed/CMakeLists.txt
@@ -9,36 +9,47 @@ else()
 endif()
 configure_file(send_recv.proto.in ${CMAKE_CURRENT_SOURCE_DIR}/send_recv.proto @ONLY)
 
+set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+
 if(WITH_GRPC)
   grpc_library(sendrecvop_grpc SRCS grpc_bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc
-        request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc grpc_variable_response.cc grpc_serde.cc
+        request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc grpc_variable_response.cc grpc_serde.cc collective_client.cc collective_server.cc
       PROTO send_recv.proto 
-      DEPS lod_tensor selected_rows memory)
-  set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+      DEPS lod_tensor selected_rows_functor memory)
+
   set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+
   cc_test(grpc_serde_test SRCS grpc_serde_test.cc 
     DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL)
+
   cc_test(rpc_server_test SRCS rpc_server_test.cc
     DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor  proto_desc lookup_sparse_table_op SERIAL)
+
   cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler)
-  return()
-endif()
 
+  if(WITH_GPU)
+  cc_test(collective_server_test SRCS collective_server_test.cc 
+      DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor
+      selected_rows_functor  scope math_function SERIAL)
+  endif()
 
-set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+  cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_grpc memory)
+else()
+  set_source_files_properties(brpc_server.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc
+      brpc_variable_response.cc brpc_sendrecvop_utils.cc brpc_rdma_pool.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 
-set_source_files_properties(brpc_server.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc
-    brpc_variable_response.cc brpc_sendrecvop_utils.cc brpc_rdma_pool.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+  brpc_library(sendrecvop_brpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc brpc_sendrecvop_utils.cc
+      brpc_variable_response.cc variable_response.cc sendrecvop_utils.cc brpc_rdma_pool.cc
+    PROTO send_recv.proto
+    DEPS lod_tensor selected_rows memory)
 
-brpc_library(sendrecvop_brpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc brpc_sendrecvop_utils.cc 
-    brpc_variable_response.cc variable_response.cc sendrecvop_utils.cc brpc_rdma_pool.cc
-  PROTO send_recv.proto
-  DEPS lod_tensor selected_rows memory)
+  cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_brpc memory)
 
-set(brpc_test_depends sendrecvop_brpc brpc ssl crypto protobuf leveldb gflags glog executor proto_desc lookup_table_op snappystream snappy)
+  set(brpc_test_depends sendrecvop_brpc brpc ssl crypto protobuf leveldb gflags glog executor proto_desc lookup_table_op snappystream snappy)
 
-cc_test(brpc_server_test SRCS rpc_server_test.cc 
-    DEPS ${brpc_test_depends} SERIAL)
+  cc_test(brpc_server_test SRCS rpc_server_test.cc
+      DEPS ${brpc_test_depends} SERIAL)
 
-cc_test(brpc_serde_test SRCS brpc_serde_test.cc 
-    DEPS ${brpc_test_depends} SERIAL)
+  cc_test(brpc_serde_test SRCS brpc_serde_test.cc
+      DEPS ${brpc_test_depends} SERIAL)
+endif()
diff --git a/paddle/fluid/operators/distributed/brpc_client.cc b/paddle/fluid/operators/distributed/brpc_client.cc
index b394c678fb6503eb73a1e11e6feb814251e9e940..350969f74be258ffbfef687b56083a9c6508bc81 100644
--- a/paddle/fluid/operators/distributed/brpc_client.cc
+++ b/paddle/fluid/operators/distributed/brpc_client.cc
@@ -158,7 +158,7 @@ ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) {
   for (int i = 0; i < FLAGS_brpc_channel_num; ++i) {
     std::shared_ptr<ChannelContext> c(new ChannelContext());
     if (c->channel.Init(ep.c_str(), &options) != 0) {
-      LOG(ERROR) << "Fail to initialize channel";
+      LOG(FATAL) << "Fail to initialize channel";
       return nullptr;
     }
 
diff --git a/paddle/fluid/operators/distributed/collective_client.cc b/paddle/fluid/operators/distributed/collective_client.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6d3f53431113621fc859eda8e7448383772d20a3
--- /dev/null
+++ b/paddle/fluid/operators/distributed/collective_client.cc
@@ -0,0 +1,59 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <condition_variable>  // NOLINT
+#include <string>
+#include "gflags/gflags.h"
+
+#include "paddle/fluid/operators/distributed/collective_client.h"
+
+DECLARE_int32(rpc_deadline);
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+std::once_flag CollectiveClient::init_flag_;
+std::unique_ptr<CollectiveClient> CollectiveClient::client_(nullptr);
+
+bool CollectiveClient::Gather(const std::vector<RemoteVar>& remote_vars,
+                              std::vector<const framework::SelectedRows*>* dst,
+                              const platform::DeviceContext& ctx,
+                              framework::Scope* scope, int64_t time_out) {
+  for (auto r : remote_vars) {
+    VLOG(50) << "begin gather from ep:" << r.String();
+    scope->Var(r.var_name_)->GetMutable<framework::SelectedRows>();
+    VarHandlePtr ptr = rpc_client_->AsyncGetMonomerVariable(
+        r.ep_, ctx, *scope, r.var_name_, time_out);
+  }
+
+  rpc_client_->Wait();
+
+  for (auto r : remote_vars) {
+    auto select_rows =
+        scope->FindVar(r.var_name_)->GetMutable<framework::SelectedRows>();
+    dst->push_back(select_rows);
+
+    VLOG(4) << "gather from ep:" << r.String()
+            << ", select_rows:" << GetSelectedRowsInfo(*select_rows);
+
+    rpc_client_->AsyncGetMonomerBarrier(r.ep_, r.var_name_);
+  }
+
+  rpc_client_->Wait();
+  return true;
+}
+
+}  // namespace distributed
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/collective_client.h b/paddle/fluid/operators/distributed/collective_client.h
new file mode 100644
index 0000000000000000000000000000000000000000..53b03c531a2b8859e6d7c904e9ab4d1b7a5c8b9b
--- /dev/null
+++ b/paddle/fluid/operators/distributed/collective_client.h
@@ -0,0 +1,93 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <condition_variable>  // NOLINT
+#include <string>
+#include <vector>
+#include "gflags/gflags.h"
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/operators/detail/macros.h"
+#include "paddle/fluid/operators/distributed/request_handler.h"
+
+DECLARE_int32(rpc_deadline);
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+inline std::string GetSelectedRowsInfo(const framework::SelectedRows& slr) {
+  std::stringstream ss;
+  ss << ", height:" << slr.height() << ", rows:[";
+  for (unsigned int i = 0; i < slr.rows().size(); i++) {
+    if (i != slr.rows().size() - 1) {
+      ss << slr.rows()[i] << ",";
+    } else {
+      ss << slr.rows()[i];
+    }
+  }
+  ss << "], dims:" << slr.value().dims();
+  return ss.str();
+}
+
+struct RemoteVar {
+  std::string ep_;
+  std::string var_name_;
+  int trainer_id_{0};
+
+  std::string String() {
+    std::stringstream ss;
+    ss << "ep:" << ep_ << ", var_name:" << var_name_
+       << ", trainer_id:" << trainer_id_;
+
+    return ss.str();
+  }
+};
+
+class CollectiveClient {
+ public:
+  CollectiveClient() {
+    rpc_client_.reset(new RPCCLIENT_T());
+    rpc_client_->InitImpl();
+  }
+  virtual ~CollectiveClient() {}
+
+  // note this function will retain the rank order.
+  bool Gather(const std::vector<RemoteVar>& remote_vars,
+              std::vector<const framework::SelectedRows*>* dst,
+              const platform::DeviceContext& ctx, framework::Scope* scope,
+              int64_t time_out = FLAGS_rpc_deadline);
+
+  static CollectiveClient* GetInstance() {
+    std::call_once(init_flag_, [&]() {
+      if (client_.get() == nullptr) {
+        client_.reset(new CollectiveClient());
+      }
+    });
+    return client_.get();
+  }
+
+ private:
+  std::unique_ptr<RPCClient> rpc_client_;
+
+  static std::once_flag init_flag_;
+  static std::unique_ptr<CollectiveClient> client_;
+};
+}  // namespace distributed
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/collective_server.cc b/paddle/fluid/operators/distributed/collective_server.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c95652400c27acd406ca3f70a0dfa8d329e94358
--- /dev/null
+++ b/paddle/fluid/operators/distributed/collective_server.cc
@@ -0,0 +1,74 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <stdio.h>  // for removing the port file
+#include <csignal>
+#include <cstdlib>
+#include <fstream>
+#include <thread>  // NOLINT
+#include <vector>
+
+#include "paddle/fluid/operators/distributed/collective_server.h"
+
+DEFINE_int32(collective_get_thread_num, 5, "number of threads for rpc get");
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+std::once_flag CollectiveServer::init_flag_;
+std::shared_ptr<CollectiveServer> CollectiveServer::collective_server_(nullptr);
+
+CollectiveServer::CollectiveServer(const std::string& end_point, int fan_in) {
+  VLOG(1) << "Create colllective server:" << end_point << ", fan_in:" << fan_in;
+  rpc_server_.reset(new RPCSERVER_T(end_point, fan_in));
+}
+
+void CollectiveServer::Stop() {
+  rpc_server_->ShutDown();
+  server_thread_->join();
+  loop_thread_->join();
+}
+
+void CollectiveServer::StartServer() {
+  get_monomer_handler_.reset(new GetMonomerHandler());
+  get_monomer_handler_->SetRPCServer(rpc_server_.get());
+
+  get_barrier_handler_.reset(new GetMonomerBarrierHandler());
+  get_barrier_handler_->SetRPCServer(rpc_server_.get());
+
+  rpc_server_->RegisterRPC(distributed::kRequestGetMonomerVariable,
+                           get_monomer_handler_.get(),
+                           FLAGS_collective_get_thread_num);
+  rpc_server_->RegisterRPC(distributed::kRequestGetMonomerBarrier,
+                           get_barrier_handler_.get(), 1);
+
+  server_thread_.reset(new std::thread([&]() { rpc_server_->StartServer(); }));
+  rpc_server_->WaitServerReady();
+
+  loop_thread_.reset(new std::thread([&]() {
+    while (true) {
+      if (rpc_server_->IsExit()) {
+        LOG(WARNING) << "get exit!rpc_processor break!";
+        break;
+      }
+      sleep(1);
+    }
+    VLOG(1) << "CollectiveServer loop_thread end";
+  }));
+}
+
+};  // namespace distributed
+};  // namespace operators
+};  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/collective_server.h b/paddle/fluid/operators/distributed/collective_server.h
new file mode 100644
index 0000000000000000000000000000000000000000..a23dc18b4de86421a0995b9951e0ae6f4dc76150
--- /dev/null
+++ b/paddle/fluid/operators/distributed/collective_server.h
@@ -0,0 +1,110 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <map>
+#include <set>
+#include <string>
+#include <thread>  // NOLINT
+#include <utility>
+#include <vector>
+
+#include "gflags/gflags.h"
+
+#include "paddle/fluid/operators/detail/macros.h"
+#include "paddle/fluid/operators/distributed/request_handler.h"
+#include "paddle/fluid/operators/distributed/request_handler_impl.h"
+#include "paddle/fluid/operators/distributed/rpc_server.h"
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+class CollectiveServer;
+
+class GetMonomerHandler final : public RequestHandler {
+ public:
+  GetMonomerHandler() : RequestHandler(true) {}
+  virtual ~GetMonomerHandler() {}
+  bool Handle(const std::string& var_name, framework::Scope* scope,
+              framework::Variable* var, framework::Variable** outvar,
+              const int trainer_id, const std::string& out_var_name = "",
+              const std::string& table_name = "") override {
+    VLOG(50) << "GetMonomerHandler recv " << var_name;
+
+    *outvar = scope->FindVar(var_name);
+    PADDLE_ENFORCE(outvar != nullptr, "%s not found", var_name);
+
+    return true;
+  }
+};
+
+class GetMonomerBarrierHandler final : public RequestHandler {
+ public:
+  GetMonomerBarrierHandler() : RequestHandler(true) {}
+  virtual ~GetMonomerBarrierHandler() {}
+  bool Handle(const std::string& var_name, framework::Scope* scope,
+              framework::Variable* var, framework::Variable** outvar,
+              const int trainer_id, const std::string& out_var_name = "",
+              const std::string& table_name = "") override {
+    VLOG(50) << "GetMonomerHandler recv " << var_name;
+
+    rpc_server_->IncreaseVarBarrier(var_name);
+
+    return true;
+  }
+};
+
+class CollectiveServer final {
+ public:
+  explicit CollectiveServer(const std::string& end_point, int fan_in);
+
+  virtual ~CollectiveServer() {}
+
+  void StartServer();
+
+  static CollectiveServer* GetInstance(const std::string& end_point,
+                                       int fan_in) {
+    std::call_once(init_flag_, [&]() {
+      if (collective_server_.get() == nullptr) {
+        collective_server_.reset(new CollectiveServer(end_point, fan_in));
+        collective_server_->StartServer();
+      }
+    });
+
+    return collective_server_.get();
+  }
+
+  std::shared_ptr<RPCServer> GetRPCServer() { return rpc_server_; }
+
+  void Stop();
+
+ private:
+  std::unique_ptr<GetMonomerHandler> get_monomer_handler_;
+  std::unique_ptr<GetMonomerBarrierHandler> get_barrier_handler_;
+
+  std::shared_ptr<distributed::RPCServer> rpc_server_;
+  std::shared_ptr<std::thread> server_thread_;
+  std::shared_ptr<std::thread> loop_thread_;
+
+  bool ready_{false};
+
+  static std::once_flag init_flag_;
+  static std::shared_ptr<CollectiveServer> collective_server_;
+};
+
+};  // namespace distributed
+};  // namespace operators
+};  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/collective_server_test.cc b/paddle/fluid/operators/distributed/collective_server_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0a9c69e393257068371e88253b82a500f58ed837
--- /dev/null
+++ b/paddle/fluid/operators/distributed/collective_server_test.cc
@@ -0,0 +1,115 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <unistd.h>
+#include <string>
+#include <thread>  // NOLINT
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+
+#include "paddle/fluid/operators/detail/macros.h"
+#include "paddle/fluid/operators/distributed/collective_client.h"
+#include "paddle/fluid/operators/distributed/collective_server.h"
+#include "paddle/fluid/operators/distributed/request_handler_impl.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace framework = paddle::framework;
+namespace platform = paddle::platform;
+namespace distributed = paddle::operators::distributed;
+
+std::unique_ptr<distributed::CollectiveServer> StartServer(
+    const std::string& ep, int fan_in, framework::Scope* scope,
+    platform::DeviceContext* dev_ctx) {
+  distributed::CollectiveServer* server =
+      distributed::CollectiveServer::GetInstance(ep, fan_in);
+
+  auto rpc_server = server->GetRPCServer();
+  rpc_server->RegisterVar("var1", distributed::kRequestGetMonomerVariable,
+                          scope, dev_ctx);
+
+  std::cout << "StartServer return" << std::endl;
+  return std::unique_ptr<distributed::CollectiveServer>(server);
+}
+
+std::unique_ptr<framework::Scope> GenerateVars(platform::Place place) {
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  auto& ctx = *pool.Get(place);
+
+  framework::Scope* scope = new framework::Scope();
+  framework::Variable* var = scope->Var("var1");
+  auto* slr = var->GetMutable<framework::SelectedRows>();
+  slr->set_height(1000);
+
+  auto* tensor = slr->mutable_value();
+  auto* rows = slr->mutable_rows();
+
+  tensor->Resize(framework::make_ddim({3, 5}));
+  tensor->mutable_data<float>(place);
+
+  paddle::operators::math::set_constant(ctx, tensor, 32.7);
+  for (int i = 0; i < 3; ++i) rows->push_back(i);
+
+  std::cout << "src:" << distributed::GetSelectedRowsInfo(*slr);
+
+  return std::unique_ptr<framework::Scope>(scope);
+}
+
+void Gather(const std::vector<distributed::RemoteVar>& vars,
+            platform::DeviceContext* dev_ctx) {
+  distributed::CollectiveClient* client =
+      distributed::CollectiveClient::GetInstance();
+
+  framework::Scope* scope = new framework::Scope();
+  framework::Variable* var = scope->Var("var1");
+  var->GetMutable<framework::SelectedRows>();
+
+  std::vector<const framework::SelectedRows*> dst;
+  client->Gather(vars, &dst, *dev_ctx, scope);
+  std::cout << "dst:" << distributed::GetSelectedRowsInfo(*dst[0]);
+}
+
+TEST(PREFETCH, GPU) {
+  platform::CUDAPlace place;
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  auto& ctx = *pool.Get(place);
+
+  std::string ep = "127.0.0.1:7164";
+  auto scope = GenerateVars(place);
+
+  auto* v1 = scope->FindVar("var1");
+  std::cout << "var1:" << v1 << std::endl;
+
+  auto server = StartServer(ep, 2, scope.get(), &ctx);
+  auto rpc_server = server->GetRPCServer();
+
+  distributed::RemoteVar var;
+  var.ep_ = ep;
+  var.var_name_ = "var1";
+  var.trainer_id_ = 0;
+
+  std::vector<distributed::RemoteVar> vars{var};
+  Gather(vars, &ctx);
+  Gather(vars, &ctx);
+
+  std::cout << "begin WaitVarBarrier" << std::endl;
+  rpc_server->WaitVarBarrier("var1");
+  rpc_server->ClearRegisteredVars();
+  server->Stop();
+
+  scope.release();
+  server.release();
+}
diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc
index 62a2c4d94dea51f87c23503390713776d6b2adce..f14dfcdb238a9580affde96e4d5a0093743eb6c8 100644
--- a/paddle/fluid/operators/distributed/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc_client.cc
@@ -28,11 +28,11 @@ namespace paddle {
 namespace operators {
 namespace distributed {
 
-void GRPCClient::InitImpl() { InitEventLoop(); }
-
-void GRPCClient::InitEventLoop() {
+void GRPCClient::InitImpl() {
   // start the client process thread
   // TODO(wuyi): can make this in a threadpool
+  PADDLE_ENFORCE(client_thread_ == nullptr,
+                 "please not re init proceed thread");
   client_thread_.reset(new std::thread(std::bind(&GRPCClient::Proceed, this)));
 }
 
@@ -106,6 +106,7 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep,
 
 void ProcGetResponse(const VarHandle& var_h,
                      const ::grpc::ByteBuffer& ret_msg) {
+  VLOG(100) << "ProcGetResponse";
   framework::Variable* outvar = nullptr;
   // get response's trainer_id is not used
   int trainer_id;
@@ -126,6 +127,24 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep,
                                      const framework::Scope& scope,
                                      const std::string& var_name,
                                      int64_t time_out) {
+  return _AsyncGetVar(ep, ctx, scope, var_name,
+                      "/sendrecv.SendRecvService/GetVariable", time_out);
+}
+
+VarHandlePtr GRPCClient::AsyncGetMonomerVariable(
+    const std::string& ep, const platform::DeviceContext& ctx,
+    const framework::Scope& scope, const std::string& var_name,
+    int64_t time_out) {
+  return _AsyncGetVar(ep, ctx, scope, var_name,
+                      "/sendrecv.SendRecvService/GetMonomerVariable", time_out);
+}
+
+VarHandlePtr GRPCClient::_AsyncGetVar(const std::string& ep,
+                                      const platform::DeviceContext& ctx,
+                                      const framework::Scope& scope,
+                                      const std::string& var_name,
+                                      const std::string& rpc_path,
+                                      int64_t time_out) {
   const platform::DeviceContext* p_ctx = &ctx;
   const std::string ep_val = ep;
   const std::string var_name_val = var_name;
@@ -136,7 +155,7 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep,
   VarHandlePtr h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope));
   s->Prepare(h, time_out);
 
-  framework::AsyncIO([var_name_val, s, method, p_ctx, h, this] {
+  framework::AsyncIO([var_name_val, s, method, p_ctx, h, rpc_path, this] {
     // prepare input
     sendrecv::VariableMessage req;
     req.set_varname(var_name_val);
@@ -151,8 +170,8 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep,
 
     platform::RecordRPCEvent record_event(method, p_ctx);
 
-    auto call = s->stub_g_.PrepareUnaryCall(
-        s->context_.get(), "/sendrecv.SendRecvService/GetVariable", buf, &cq_);
+    auto call =
+        s->stub_g_.PrepareUnaryCall(s->context_.get(), rpc_path, buf, &cq_);
     call->StartCall();
     call->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
 
@@ -171,11 +190,13 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep,
                                           const framework::Scope& scope,
                                           const std::string& in_var_name,
                                           const std::string& out_var_name,
+                                          const std::string& table_name,
                                           int64_t time_out) {
   const platform::DeviceContext* p_ctx = &ctx;
   const std::string ep_val = ep;
   const std::string in_var_name_val = in_var_name;
   const std::string out_var_name_val = out_var_name;
+  const std::string table_name_val = table_name;
   const framework::Scope* p_scope = &scope;
   const auto ch = GetChannel(ep_val);
   GetProcessor* s = new GetProcessor(ch);
@@ -186,11 +207,12 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep,
   s->Prepare(h, time_out);
 
   framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx,
-                      s, method, h, this] {
+                      s, method, h, table_name_val, this] {
     auto* var = p_scope->FindVar(in_var_name_val);
 
     ::grpc::ByteBuffer req;
-    SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req, out_var_name_val);
+    SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req, out_var_name_val,
+                          0, table_name_val);
 
     VLOG(3) << s->GetVarHandlePtr()->String() << " begin";
 
@@ -265,6 +287,34 @@ VarHandlePtr GRPCClient::AsyncSendFetchBarrier(const std::string& ep,
   return h;
 }
 
+VarHandlePtr GRPCClient::AsyncGetMonomerBarrier(const std::string& ep,
+                                                const std::string& var_name,
+                                                int64_t time_out) {
+  const auto ch = GetChannel(ep);
+  BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
+  const std::string method = "SendMonomerFetchBarrierRPC";
+  VarHandlePtr h(
+      new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr));
+  s->Prepare(h, time_out);
+
+  VLOG(30) << s->GetVarHandlePtr()->String() << " begin";
+
+  sendrecv::VariableMessage req;
+  req.set_varname(var_name);
+
+  platform::RecordRPCEvent record_event(method, nullptr);
+
+  auto rpc = s->stub_->AsyncGetMonomerBarrier(s->context_.get(), req, &cq_);
+  rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
+  req_count_++;
+
+  if (UNLIKELY(platform::IsProfileEnabled())) {
+    h->Wait();
+  }
+
+  return h;
+}
+
 VarHandlePtr GRPCClient::AsyncSendComplete(const std::string& ep,
                                            int64_t time_out) {
   const auto ch = GetChannel(ep);
@@ -340,8 +390,7 @@ void GRPCClient::Proceed() {
       VLOG(3) << c->GetVarHandlePtr()->String() << " process";
       c->Process();
     } else if (c->status_.error_code() == grpc::StatusCode::DEADLINE_EXCEEDED) {
-      // FIXME(gongwb): parse error_details?
-      LOG(ERROR) << c->GetVarHandlePtr()->String()
+      LOG(FATAL) << c->GetVarHandlePtr()->String()
                  << " meets grpc error, error_code:" << c->status_.error_code()
                  << " error_message:" << c->status_.error_message()
                  << " error_details:" << c->status_.error_details();
diff --git a/paddle/fluid/operators/distributed/grpc_client.h b/paddle/fluid/operators/distributed/grpc_client.h
index d8e9cee85bd734c2ed4b1cae03ecee04e304b651..01bf46cc313b4707c7af7a9605926a8b298d679d 100644
--- a/paddle/fluid/operators/distributed/grpc_client.h
+++ b/paddle/fluid/operators/distributed/grpc_client.h
@@ -189,18 +189,28 @@ class GRPCClient : public RPCClient {
                            const std::string& var_name,
                            int64_t time_out = FLAGS_rpc_deadline) override;
 
+  VarHandlePtr AsyncGetMonomerVariable(
+      const std::string& ep, const platform::DeviceContext& ctx,
+      const framework::Scope& scope, const std::string& var_name,
+      int64_t time_out = FLAGS_rpc_deadline) override;
+
   VarHandlePtr AsyncPrefetchVar(const std::string& ep,
                                 const platform::DeviceContext& ctx,
                                 const framework::Scope& scope,
                                 const std::string& in_var_name,
                                 const std::string& out_var_name,
+                                const std::string& table_name = "",
                                 int64_t time_out = FLAGS_rpc_deadline) override;
 
   VarHandlePtr AsyncSendBatchBarrier(
       const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override;
 
-  VarHandlePtr AsyncSendFetchBarrier(
-      const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override;
+  VarHandlePtr AsyncSendFetchBarrier(const std::string& ep,
+                                     int64_t time_out) override;
+
+  VarHandlePtr AsyncGetMonomerBarrier(
+      const std::string& ep, const std::string& var_name,
+      int64_t time_out = FLAGS_rpc_deadline) override;
 
   VarHandlePtr AsyncCheckpointNotify(
       const std::string& ep, const std::string& dir,
@@ -213,21 +223,22 @@ class GRPCClient : public RPCClient {
 
   void SendComplete() override;
 
- protected:
   void InitImpl() override;
 
  private:
-  // InitEventLoop should only be called by Init()
-  void InitEventLoop();
-
   void Proceed();
 
   std::shared_ptr<grpc::Channel> GetChannel(const std::string& ep);
+  VarHandlePtr _AsyncGetVar(const std::string& ep,
+                            const platform::DeviceContext& ctx,
+                            const framework::Scope& scope,
+                            const std::string& var_name, const std::string& rpc,
+                            int64_t time_out);
 
  private:
   grpc::CompletionQueue cq_;
   std::unordered_map<std::string, std::shared_ptr<grpc::Channel>> channels_;
-  std::unique_ptr<std::thread> client_thread_;
+  std::unique_ptr<std::thread> client_thread_{nullptr};
 
   // mutex for Wait client sync
   std::mutex sync_mutex_;
diff --git a/paddle/fluid/operators/distributed/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc_serde.cc
index e6856676d49e867214801810949076151e34356a..31fac2133cf159719474207407c52bb96e80e131 100644
--- a/paddle/fluid/operators/distributed/grpc_serde.cc
+++ b/paddle/fluid/operators/distributed/grpc_serde.cc
@@ -42,7 +42,8 @@ static void SerializeDestroyCallback(void* payload) {
 void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
                            const platform::DeviceContext& ctx,
                            ::grpc::ByteBuffer* msg, const std::string& out_name,
-                           const int trainer_id) {
+                           const int trainer_id,
+                           const std::string& table_name) {
   platform::RecordRPCEvent record_event("serial", &ctx);
   VarMsg request;
   TensorPayload* payload = nullptr;
@@ -63,6 +64,9 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
   if (!out_name.empty()) {
     request.set_out_varname(out_name);
   }
+  if (!table_name.empty()) {
+    request.set_table_name(table_name);
+  }
   if (var->IsType<framework::LoDTensor>()) {
     request.set_type(::sendrecv::LOD_TENSOR);
     payload = new TensorPayload(GetTensorPayload(var, ctx, &request));
diff --git a/paddle/fluid/operators/distributed/grpc_serde.h b/paddle/fluid/operators/distributed/grpc_serde.h
index 17290d3fb4478191c59623913a82d4142d3c49f9..16f5293b0eb413dc43a28193cfd224090aeed659 100644
--- a/paddle/fluid/operators/distributed/grpc_serde.h
+++ b/paddle/fluid/operators/distributed/grpc_serde.h
@@ -40,7 +40,8 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
                            const platform::DeviceContext& ctx,
                            ::grpc::ByteBuffer* msg,
                            const std::string& out_varname = std::string(),
-                           const int trainer_id = 0);
+                           const int trainer_id = 0,
+                           const std::string& table_name = std::string());
 
 void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
                                const platform::DeviceContext& ctx,
diff --git a/paddle/fluid/operators/distributed/grpc_serde_test.cc b/paddle/fluid/operators/distributed/grpc_serde_test.cc
index 96ea05e74ed76768248a27ab435dc801b7d1b995..1936c2c623a779c2599aa560247fa5e24f28cd62 100644
--- a/paddle/fluid/operators/distributed/grpc_serde_test.cc
+++ b/paddle/fluid/operators/distributed/grpc_serde_test.cc
@@ -130,7 +130,8 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) {
   math::set_constant(ctx, tensor, 31.9);
 
   ::grpc::ByteBuffer msg;
-  operators::distributed::SerializeToByteBuffer("myvar", &var, ctx, &msg);
+  operators::distributed::SerializeToByteBuffer("myvar", &var, ctx, &msg,
+                                                "outvar", 0, "table_name");
   EXPECT_GT(msg.Length(), static_cast<size_t>(0));
 
   // deserialize
diff --git a/paddle/fluid/operators/distributed/grpc_server.cc b/paddle/fluid/operators/distributed/grpc_server.cc
index 28a8f1eda043880a2b99a1259c7c5071f3aef61c..c3974138f4d4665c46bdfccaef09c0bd84b9d028 100644
--- a/paddle/fluid/operators/distributed/grpc_server.cc
+++ b/paddle/fluid/operators/distributed/grpc_server.cc
@@ -158,6 +158,98 @@ class RequestGet final : public RequestBase {
   ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
 };
 
+class RequestGetMonomerVariable final : public RequestBase {
+ public:
+  explicit RequestGetMonomerVariable(GrpcService::AsyncService* service,
+                                     ::grpc::ServerCompletionQueue* cq,
+                                     RequestHandler* request_handler,
+                                     int req_id, RPCServer* rpc_server)
+      : RequestBase(service, cq, request_handler, req_id),
+        responder_(&ctx_),
+        rpc_server_(rpc_server) {
+    auto method_id =
+        static_cast<int>(distributed::GrpcMethod::kGetMonomerVariable);
+    service_->RequestAsyncUnary(
+        method_id, &ctx_, &request_, &responder_, cq_, cq_,
+        reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
+  }
+
+  virtual ~RequestGetMonomerVariable() {}
+
+  std::string GetReqName() override { return request_.varname(); }
+
+  void Process() override {
+    // proc request.
+    std::string varname = request_.varname();
+
+    rpc_server_->WaitVarCond(varname);
+    MonomerHandle h = rpc_server_->GetMonomer(varname);
+
+    auto scope = h.scope_;
+    auto invar = scope->FindVar(varname);
+    framework::Variable* outvar = nullptr;
+
+    request_handler_->Handle(varname, scope, invar, &outvar,
+                             request_.trainer_id());
+
+    if (outvar) {
+      SerializeToByteBuffer(varname, outvar, *h.dev_ctx_, &reply_);
+    }
+    Finish(reply_, &responder_);
+  }
+
+ protected:
+  sendrecv::VariableMessage request_;
+  ::grpc::ByteBuffer reply_;
+  ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
+  RPCServer* rpc_server_{nullptr};
+};
+
+class RequestGetMonomerBarrier final : public RequestBase {
+ public:
+  explicit RequestGetMonomerBarrier(GrpcService::AsyncService* service,
+                                    ::grpc::ServerCompletionQueue* cq,
+                                    RequestHandler* request_handler, int req_id,
+                                    RPCServer* rpc_server)
+      : RequestBase(service, cq, request_handler, req_id),
+        responder_(&ctx_),
+        rpc_server_(rpc_server) {
+    auto method_id =
+        static_cast<int>(distributed::GrpcMethod::kGetMonomerBarrier);
+    service_->RequestAsyncUnary(
+        method_id, &ctx_, &request_, &responder_, cq_, cq_,
+        reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
+  }
+
+  virtual ~RequestGetMonomerBarrier() {}
+
+  std::string GetReqName() override { return request_.varname(); }
+
+  void Process() override {
+    // proc request.
+    std::string varname = request_.varname();
+    VLOG(4) << "RequestGetMonomerBarrier " << varname;
+
+    rpc_server_->WaitVarCond(varname);
+    MonomerHandle h = rpc_server_->GetMonomer(varname);
+
+    framework::Scope* scope = nullptr;
+    framework::Variable* invar = nullptr;
+    framework::Variable* outvar = nullptr;
+
+    request_handler_->Handle(varname, scope, invar, &outvar,
+                             request_.trainer_id());
+
+    Finish(reply_, &responder_);
+  }
+
+ protected:
+  sendrecv::VariableMessage request_;
+  sendrecv::VoidMessage reply_;
+  ServerAsyncResponseWriter<sendrecv::VoidMessage> responder_;
+  RPCServer* rpc_server_{nullptr};
+};
+
 class RequestPrefetch final : public RequestBase {
  public:
   explicit RequestPrefetch(GrpcService::AsyncService* service,
@@ -183,6 +275,7 @@ class RequestPrefetch final : public RequestBase {
     // prefetch process...
     std::string in_var_name = request_->Varname();
     std::string out_var_name = request_->OutVarname();
+    std::string table_name = request_->TableName();
     int trainer_id = request_->GetTrainerId();
     VLOG(4) << "RequestPrefetch, in_var_name: " << in_var_name
             << " out_var_name: " << out_var_name;
@@ -193,7 +286,7 @@ class RequestPrefetch final : public RequestBase {
     framework::Variable* outvar = scope->Var(out_var_name);
 
     request_handler_->Handle(in_var_name, scope, invar, &outvar, trainer_id,
-                             out_var_name);
+                             out_var_name, table_name);
 
     SerializeToByteBuffer(out_var_name, outvar, *request_handler_->dev_ctx(),
                           &reply_);
@@ -248,7 +341,7 @@ class RequestCheckpointNotify final : public RequestBase {
 };
 
 void AsyncGRPCServer::WaitServerReady() {
-  VLOG(4) << "AsyncGRPCServer is wait server ready";
+  VLOG(4) << "AsyncGRPCServer is waiting server ready";
   std::unique_lock<std::mutex> lock(this->mutex_ready_);
   condition_ready_.wait(lock, [=] { return this->ready_ == 1; });
   VLOG(4) << "AsyncGRPCServer WaitSeverReady";
@@ -367,6 +460,12 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name,
     b = new RequestSend(&service_, cq.get(), handler, req_id);
   } else if (rpc_name == kRequestGet) {
     b = new RequestGet(&service_, cq.get(), handler, req_id);
+  } else if (rpc_name == kRequestGetMonomerVariable) {
+    b = new RequestGetMonomerVariable(&service_, cq.get(), handler, req_id,
+                                      this);
+  } else if (rpc_name == kRequestGetMonomerBarrier) {
+    b = new RequestGetMonomerBarrier(&service_, cq.get(), handler, req_id,
+                                     this);
   } else if (rpc_name == kRequestPrefetch) {
     b = new RequestPrefetch(&service_, cq.get(), handler, req_id);
   } else if (rpc_name == kRequestCheckpoint) {
@@ -377,7 +476,7 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name,
 
   reqs[req_id] = b;
 
-  VLOG(4) << "Create RequestSend status:" << b->Status();
+  VLOG(4) << "TryToRegisterNewOne status:" << b->Status();
 }
 
 void AsyncGRPCServer::HandleRequest(
diff --git a/paddle/fluid/operators/distributed/grpc_service.h b/paddle/fluid/operators/distributed/grpc_service.h
index 9ae9a31a003cbb1f808fd1127a5dd78511aa3e99..537429b5fe989269d437b6dfe558c0a7dcfc2dcc 100644
--- a/paddle/fluid/operators/distributed/grpc_service.h
+++ b/paddle/fluid/operators/distributed/grpc_service.h
@@ -81,10 +81,12 @@ enum class GrpcMethod {
   kGetVariable,
   kPrefetchVariable,
   kCheckpointNotify,
+  kGetMonomerVariable,
+  kGetMonomerBarrier,
 };
 
 static const int kGrpcNumMethods =
-    static_cast<int>(GrpcMethod::kCheckpointNotify) + 1;
+    static_cast<int>(GrpcMethod::kGetMonomerBarrier) + 1;
 
 inline const char* GrpcMethodName(GrpcMethod id) {
   switch (id) {
@@ -92,6 +94,10 @@ inline const char* GrpcMethodName(GrpcMethod id) {
       return "/sendrecv.SendRecvService/SendVariable";
     case GrpcMethod::kGetVariable:
       return "/sendrecv.SendRecvService/GetVariable";
+    case GrpcMethod::kGetMonomerVariable:
+      return "/sendrecv.SendRecvService/GetMonomerVariable";
+    case GrpcMethod::kGetMonomerBarrier:
+      return "/sendrecv.SendRecvService/GetMonomerBarrier";
     case GrpcMethod::kPrefetchVariable:
       return "/sendrecv.SendRecvService/PrefetchVariable";
     case GrpcMethod::kCheckpointNotify:
diff --git a/paddle/fluid/operators/distributed/grpc_variable_response.cc b/paddle/fluid/operators/distributed/grpc_variable_response.cc
index d6d219d4369ba785e5c369538d4a18dc682952c1..76ad02b0300a58cd19ff2541ad53d067197f4177 100644
--- a/paddle/fluid/operators/distributed/grpc_variable_response.cc
+++ b/paddle/fluid/operators/distributed/grpc_variable_response.cc
@@ -301,6 +301,20 @@ int GRPCVariableResponse::Parse(Source* source) {
         meta_.set_trainer_id(trainer_id);
         break;
       }
+      case sendrecv::VariableMessage::kTableNameFieldNumber: {
+        uint32_t length;
+        if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) {
+          return tag;
+        }
+
+        std::string temp;
+        if (!input.ReadString(&temp, length)) {
+          return tag;
+        }
+
+        meta_.set_table_name(temp);
+        break;
+      }
       default: {
         // Unknown tag, return unknown error.
         return -1;
diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cf14538b1c284d297242197088a66cc156b1762c
--- /dev/null
+++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc
@@ -0,0 +1,255 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <set>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/operators/distributed/parameter_prefetch.h"
+
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/tensor.h"
+
+#include "paddle/fluid/operators/detail/macros.h"
+#include "paddle/fluid/operators/distributed/rpc_client.h"
+#include "paddle/fluid/operators/distributed/variable_response.h"
+#include "paddle/fluid/operators/distributed_ops/send_recv_util.h"
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+using SelectedRows = framework::SelectedRows;
+using DDim = framework::DDim;
+
+static size_t GetSectionIndex(int64_t id,
+                              const std::vector<int64_t>& abs_sections) {
+  for (size_t i = 1; i < abs_sections.size(); ++i) {
+    if (id < abs_sections[i]) {
+      return i - 1;
+    }
+  }
+  return abs_sections.size() - 1;
+}
+
+static std::vector<int64_t> ToAbsoluteSection(
+    const std::vector<int>& height_sections) {
+  std::vector<int64_t> abs_sections;
+  abs_sections.resize(height_sections.size());
+  abs_sections[0] = 0;
+  for (size_t i = 1; i < height_sections.size(); ++i) {
+    abs_sections[i] = height_sections[i - 1] + abs_sections[i - 1];
+  }
+  return abs_sections;
+}
+
+static std::vector<std::vector<int64_t>> SplitIds(
+    const std::vector<int64_t>& ids_vector,
+    const std::vector<int>& height_section, framework::Scope* scope) {
+  std::set<int64_t> all_ids;
+  for (auto id : ids_vector) {
+    all_ids.insert(id);
+  }
+
+  auto abs_sections = ToAbsoluteSection(height_section);
+  std::vector<std::vector<int64_t>> splited_ids;
+  splited_ids.resize(height_section.size() + 1);
+  for (auto& id : all_ids) {
+    auto section_index = GetSectionIndex(id, abs_sections);
+    splited_ids[section_index].push_back(id - abs_sections[section_index]);
+  }
+  return splited_ids;
+}
+
+static void SplitIdsIntoMultipleVarsBySection(
+    const std::vector<std::string>& in_var_names,
+    const std::vector<int>& height_section,
+    const std::vector<std::vector<int64_t>>& splited_ids,
+    framework::Scope* scope) {
+  PADDLE_ENFORCE_EQ(in_var_names.size(), height_section.size(), "");
+
+  auto place = platform::CPUPlace();
+
+  for (size_t i = 0; i < in_var_names.size(); ++i) {
+    auto* id_tensor =
+        scope->Var(in_var_names[i])->GetMutable<framework::LoDTensor>();
+    auto& ids = splited_ids[i];
+    if (!ids.empty()) {
+      auto* id_tensor_data = id_tensor->mutable_data<int64_t>(
+          framework::make_ddim({static_cast<int64_t>(ids.size()), 1}), place);
+      memcpy(id_tensor_data, ids.data(), sizeof(int64_t) * ids.size());
+    }
+  }
+}
+
+static void MergeMultipleVarsIntoOneBySection(
+    const std::string& id_name, const std::vector<int64_t>& ids_vector,
+    const std::string& out_name, const std::vector<std::string>& out_var_names,
+    const std::vector<int>& height_section,
+    const std::vector<std::vector<int64_t>>& splited_ids,
+    const framework::ExecutionContext& context, framework::Scope* scope,
+    platform::DeviceContext* actual_ctx) {
+  PADDLE_ENFORCE_EQ(out_var_names.size(), height_section.size(), "");
+
+  auto cpu_place = platform::CPUPlace();
+
+  auto abs_sections = ToAbsoluteSection(height_section);
+  std::unordered_map<int64_t, std::vector<size_t>> id_to_offset;
+  for (size_t i = 0; i < ids_vector.size(); ++i) {
+    id_to_offset[ids_vector[i]].push_back(i);
+  }
+
+  auto& id_tensor = scope->FindVar(id_name)->Get<framework::LoDTensor>();
+  auto* out_tensor =
+      scope->FindVar(out_name)->GetMutable<framework::LoDTensor>();
+  auto* out_tensor_data = out_tensor->mutable_data<float>(id_tensor.place());
+
+  bool is_on_cpu_place = true;
+  if (!platform::is_cpu_place(id_tensor.place())) {
+    is_on_cpu_place = false;
+  }
+
+  for (size_t section_idx = 0; section_idx < out_var_names.size();
+       ++section_idx) {
+    auto& ids_in_this_section = splited_ids[section_idx];
+    if (!ids_in_this_section.empty()) {
+      auto& prefetch_out_var =
+          scope->Var(out_var_names[section_idx])->Get<framework::LoDTensor>();
+      const auto* out_var_data = prefetch_out_var.data<float>();
+      auto& dims = prefetch_out_var.dims();
+
+      PADDLE_ENFORCE_EQ(dims.size(), 2, "");
+      PADDLE_ENFORCE_EQ(ids_in_this_section.size(), dims[0]);
+
+      auto row_numel = dims[1];
+
+      for (size_t i = 0; i < dims[0]; ++i) {
+        auto id = ids_in_this_section[i];
+        auto origin_id = id + abs_sections[section_idx];
+        auto& offsets = id_to_offset[origin_id];
+        for (auto& offset : offsets) {
+          // should support GPU tensor
+          if (is_on_cpu_place) {
+            memory::Copy(cpu_place, out_tensor_data + offset * row_numel,
+                         cpu_place, out_var_data + i * row_numel,
+                         sizeof(float) * row_numel);
+          } else {
+#ifndef PADDLE_WITH_CUDA
+            PADDLE_THROW("paddle is not compiled with CUDA!");
+#else
+            auto stream =
+                static_cast<platform::CUDADeviceContext*>(actual_ctx)->stream();
+            memory::Copy(boost::get<platform::CUDAPlace>(id_tensor.place()),
+                         out_tensor_data + offset * row_numel, cpu_place,
+                         out_var_data + i * row_numel,
+                         sizeof(float) * row_numel, stream);
+#endif
+          }
+        }
+      }
+    } else {
+      VLOG(3) << "ids in this section is empty";
+    }
+  }
+}
+
+void prefetch(const std::string& id_name, const std::string& out_name,
+              const std::vector<std::string>& table_names,
+              const std::vector<std::string>& epmap,
+              const std::vector<int>& height_sections,
+              const framework::ExecutionContext& context) {
+  auto& local_scope = context.scope().NewScope();
+
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  auto& cpu_ctx = *pool.Get(platform::CPUPlace());
+  auto& actual_ctx = *pool.Get(context.GetPlace());
+
+  distributed::RPCClient* rpc_client =
+      distributed::RPCClient::GetInstance<RPCCLIENT_T>(
+          context.Attr<int>("trainer_id"));
+
+  std::vector<std::string> in_var_names;
+  std::vector<std::string> out_var_names;
+  for (size_t i = 0; i < epmap.size(); ++i) {
+    in_var_names.push_back(id_name + "@" + epmap[i]);
+    out_var_names.push_back(out_name + "@" + epmap[i]);
+  }
+
+  auto& id_tensor = local_scope.FindVar(id_name)->Get<framework::LoDTensor>();
+  std::vector<int64_t> ids_vector;
+  if (platform::is_cpu_place(id_tensor.place())) {
+    auto* id_data = id_tensor.data<int64_t>();
+    for (size_t i = 0; i < id_tensor.numel(); ++i) {
+      ids_vector.push_back(id_data[i]);
+    }
+  } else {
+#ifndef PADDLE_WITH_CUDA
+    PADDLE_THROW("paddle is not compiled with CUDA!");
+#else
+    auto cpu_place = platform::CPUPlace();
+    framework::Tensor cpu_tensor;
+    auto* cpu_tensor_data =
+        cpu_tensor.mutable_data<int64_t>(id_tensor.dims(), cpu_place);
+    auto stream =
+        static_cast<platform::CUDADeviceContext*>(&actual_ctx)->stream();
+    memory::Copy(cpu_place, cpu_tensor_data,
+                 boost::get<platform::CUDAPlace>(id_tensor.place()),
+                 id_tensor.data<int64_t>(), sizeof(int64_t) * id_tensor.numel(),
+                 stream);
+    for (size_t i = 0; i < cpu_tensor.numel(); ++i) {
+      ids_vector.push_back(cpu_tensor_data[i]);
+    }
+#endif
+  }
+
+  auto splited_ids = SplitIds(ids_vector, height_sections, &local_scope);
+  SplitIdsIntoMultipleVarsBySection(in_var_names, height_sections, splited_ids,
+                                    &local_scope);
+
+  // create output var in local scope
+  for (auto& name : out_var_names) {
+    local_scope.Var(name)->GetMutable<framework::LoDTensor>();
+  }
+
+  std::vector<distributed::VarHandlePtr> rets;
+  for (size_t i = 0; i < in_var_names.size(); i++) {
+    if (NeedSend(local_scope, in_var_names[i])) {
+      VLOG(3) << "sending " << in_var_names[i] << " to " << epmap[i]
+              << " to get " << out_var_names[i] << " back";
+      rets.push_back(rpc_client->AsyncPrefetchVar(
+          epmap[i], cpu_ctx, local_scope, in_var_names[i], out_var_names[i],
+          table_names[i]));
+    } else {
+      VLOG(3) << "don't send no-initialied variable: " << out_var_names[i];
+    }
+  }
+
+  for (size_t i = 0; i < rets.size(); i++) {
+    PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");
+  }
+
+  MergeMultipleVarsIntoOneBySection(id_name, ids_vector, out_name,
+                                    out_var_names, height_sections, splited_ids,
+                                    context, &local_scope, &actual_ctx);
+
+  context.scope().DeleteScope(&local_scope);
+}
+
+};  // namespace distributed
+};  // namespace operators
+};  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.h b/paddle/fluid/operators/distributed/parameter_prefetch.h
new file mode 100644
index 0000000000000000000000000000000000000000..53b0fbfb51f60fa86351cca34fd1665c7802591b
--- /dev/null
+++ b/paddle/fluid/operators/distributed/parameter_prefetch.h
@@ -0,0 +1,34 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/operator.h"
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+void prefetch(const std::string& id_name, const std::string& out_name,
+              const std::vector<std::string>& table_names,
+              const std::vector<std::string>& epmap,
+              const std::vector<int>& height_sections,
+              const framework::ExecutionContext& context);
+
+};  // namespace distributed
+};  // namespace operators
+};  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h
index 3c1db147098055e9974c9dc607266cdaf2e43dae..62b24f150b41efead24c8bdbe08c9b44e160445a 100644
--- a/paddle/fluid/operators/distributed/request_handler.h
+++ b/paddle/fluid/operators/distributed/request_handler.h
@@ -37,6 +37,8 @@ namespace distributed {
 
 constexpr char kRequestSend[] = "RequestSend";
 constexpr char kRequestGet[] = "RequestGet";
+constexpr char kRequestGetMonomerVariable[] = "RequestGetMonomerVariable";
+constexpr char kRequestGetMonomerBarrier[] = "RequestGetMonomerBarrier";
 constexpr char kRequestPrefetch[] = "RequestPrefetch";
 constexpr char kRequestCheckpoint[] = "RequestCheckpoint";
 constexpr char kRequestPassBarrier[] = "RequestPassBarrier";
@@ -191,7 +193,8 @@ class RequestHandler {
   virtual bool Handle(const std::string& varname, framework::Scope* scope,
                       framework::Variable* var, framework::Variable** outvar,
                       const int trainer_id,
-                      const std::string& out_var_name = "") = 0;
+                      const std::string& out_var_name = "",
+                      const std::string& table_name = "") = 0;
 
  protected:
   const bool sync_mode_;
diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc
index 025528fe70b8f4d353ab92f29b1bd71c77cf7850..9722f8c96e91d2dfbe929dcc11645a40c44afb4e 100644
--- a/paddle/fluid/operators/distributed/request_handler_impl.cc
+++ b/paddle/fluid/operators/distributed/request_handler_impl.cc
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/operators/distributed/request_handler_impl.h"
 #include <iostream>
 #include <string>
 #include <vector>
@@ -20,7 +21,7 @@
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/operators/distributed/request_handler_impl.h"
+#include "paddle/fluid/framework/variable_helper.h"
 #include "paddle/fluid/operators/distributed/rpc_server.h"
 #include "paddle/fluid/string/printf.h"
 
@@ -37,7 +38,8 @@ bool RequestSendHandler::Handle(const std::string& varname,
                                 framework::Variable* invar,
                                 framework::Variable** outvar,
                                 const int trainer_id,
-                                const std::string& out_var_name) {
+                                const std::string& out_var_name,
+                                const std::string& table_name) {
   VLOG(4) << "RequestSendHandler:" << varname;
 
   // Sync
@@ -77,8 +79,10 @@ bool RequestGetHandler::Handle(const std::string& varname,
                                framework::Variable* invar,
                                framework::Variable** outvar,
                                const int trainer_id,
-                               const std::string& out_var_name) {
+                               const std::string& out_var_name,
+                               const std::string& table_name) {
   VLOG(4) << "RequestGetHandler:" << varname;
+
   if (sync_mode_) {
     if (varname == FETCH_BARRIER_MESSAGE) {
       VLOG(3) << "sync: recv fetch barrier message";
@@ -113,14 +117,22 @@ bool RequestPrefetchHandler::Handle(const std::string& varname,
                                     framework::Variable* invar,
                                     framework::Variable** outvar,
                                     const int trainer_id,
-                                    const std::string& out_var_name) {
+                                    const std::string& out_var_name,
+                                    const std::string& table_name) {
   VLOG(4) << "RequestPrefetchHandler " << varname;
 
-  auto var_desc = program_->Block(0).FindVar(out_var_name);
-  InitializeVariable(*outvar, var_desc->GetType());
-  executor_->RunPreparedContext(
-      (*prefetch_var_name_to_prepared_ctx_)[varname].get(), scope);
-
+  if (table_name.empty()) {
+    auto var_desc = program_->Block(0).FindVar(out_var_name);
+    InitializeVariable(*outvar, var_desc->GetType());
+    executor_->RunPreparedContext(
+        (*prefetch_var_name_to_prepared_ctx_)[varname].get(), scope);
+  } else {
+    (*outvar)->GetMutable<framework::LoDTensor>();
+    auto lookup_table_op =
+        BuildLookupTableOp(table_name, varname, out_var_name);
+    paddle::platform::CPUPlace cpu_place;
+    lookup_table_op->Run(*scope, cpu_place);
+  }
   return true;
 }
 
@@ -129,7 +141,8 @@ bool RequestCheckpointHandler::Handle(const std::string& varname,
                                       framework::Variable* invar,
                                       framework::Variable** outvar,
                                       const int trainer_id,
-                                      const std::string& out_var_name) {
+                                      const std::string& out_var_name,
+                                      const std::string& table_name) {
   PADDLE_ENFORCE(
       checkpoint_notify_id != -1,
       "when checkpoint_notify_id = -1, there should be no RPC invoke.");
diff --git a/paddle/fluid/operators/distributed/request_handler_impl.h b/paddle/fluid/operators/distributed/request_handler_impl.h
index c1afda9dd2445e492d8b93659c9ff13e6e1030b8..5e0b25c5c2ce161dee0948a07baab32dfff9be6f 100644
--- a/paddle/fluid/operators/distributed/request_handler_impl.h
+++ b/paddle/fluid/operators/distributed/request_handler_impl.h
@@ -24,6 +24,7 @@
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
@@ -43,8 +44,8 @@ class RequestSendHandler final : public RequestHandler {
   virtual ~RequestSendHandler() {}
   bool Handle(const std::string& varname, framework::Scope* scope,
               framework::Variable* var, framework::Variable** outvar,
-              const int trainer_id,
-              const std::string& out_var_name = "") override;
+              const int trainer_id, const std::string& out_var_name = "",
+              const std::string& table_name = "") override;
 
  private:
   bool enable_dc_asgd_;
@@ -59,21 +60,44 @@ class RequestGetHandler final : public RequestHandler {
   virtual ~RequestGetHandler() {}
   bool Handle(const std::string& varname, framework::Scope* scope,
               framework::Variable* var, framework::Variable** outvar,
-              const int trainer_id,
-              const std::string& out_var_name = "") override;
+              const int trainer_id, const std::string& out_var_name = "",
+              const std::string& table_name = "") override;
 
  private:
   bool enable_dc_asgd_;
 };
 
+static inline void BuildVar(const std::string& param_name,
+                            std::initializer_list<const char*> arguments,
+                            paddle::framework::proto::OpDesc::Var* var) {
+  var->set_parameter(param_name);
+  for (auto& arg_name : arguments) {
+    *var->mutable_arguments()->Add() = arg_name;
+  }
+}
+
 class RequestPrefetchHandler final : public RequestHandler {
  public:
   explicit RequestPrefetchHandler(bool sync_mode) : RequestHandler(sync_mode) {}
   virtual ~RequestPrefetchHandler() {}
   bool Handle(const std::string& varname, framework::Scope* scope,
               framework::Variable* var, framework::Variable** outvar,
-              const int trainer_id,
-              const std::string& out_var_name = "") override;
+              const int trainer_id, const std::string& out_var_name = "",
+              const std::string& table_name = "") override;
+
+ private:
+  std::unique_ptr<paddle::framework::OperatorBase> BuildLookupTableOp(
+      const std::string& table_name, const std::string& id_name,
+      const std::string& out_name) {
+    paddle::framework::proto::OpDesc op_desc;
+    op_desc.set_type("lookup_table");
+    BuildVar("W", {table_name.data()}, op_desc.add_inputs());
+    BuildVar("Ids", {id_name.data()}, op_desc.add_inputs());
+    BuildVar("Out", {out_name.data()}, op_desc.add_outputs());
+
+    auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
+    return op;
+  }
 };
 
 class RequestCheckpointHandler final : public RequestHandler {
@@ -85,8 +109,8 @@ class RequestCheckpointHandler final : public RequestHandler {
   virtual ~RequestCheckpointHandler() {}
   bool Handle(const std::string& varname, framework::Scope* scope,
               framework::Variable* var, framework::Variable** outvar,
-              const int trainer_id,
-              const std::string& out_var_name = "") override;
+              const int trainer_id, const std::string& out_var_name = "",
+              const std::string& table_name = "") override;
 
  private:
   int checkpoint_notify_id;
diff --git a/paddle/fluid/operators/distributed/rpc_client.h b/paddle/fluid/operators/distributed/rpc_client.h
index 1983802e49506c79041112ac87d429e4c084ddfd..b668d869787a47ebd36f570061421ddbeae5a09a 100644
--- a/paddle/fluid/operators/distributed/rpc_client.h
+++ b/paddle/fluid/operators/distributed/rpc_client.h
@@ -45,10 +45,15 @@ class RPCClient {
                                    const std::string& var_name,
                                    int64_t time_out = FLAGS_rpc_deadline) = 0;
 
+  virtual VarHandlePtr AsyncGetMonomerVariable(
+      const std::string& ep, const platform::DeviceContext& ctx,
+      const framework::Scope& scope, const std::string& var_name,
+      int64_t time_out = FLAGS_rpc_deadline) = 0;
+
   virtual VarHandlePtr AsyncPrefetchVar(
       const std::string& ep, const platform::DeviceContext& ctx,
       const framework::Scope& scope, const std::string& in_var_name,
-      const std::string& out_var_name,
+      const std::string& out_var_name, const std::string& table_name = "",
       int64_t time_out = FLAGS_rpc_deadline) = 0;
 
   virtual VarHandlePtr AsyncSendBatchBarrier(
@@ -57,6 +62,10 @@ class RPCClient {
   virtual VarHandlePtr AsyncSendFetchBarrier(
       const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0;
 
+  virtual VarHandlePtr AsyncGetMonomerBarrier(
+      const std::string& ep, const std::string& var_name,
+      int64_t time_out = FLAGS_rpc_deadline) = 0;
+
   virtual VarHandlePtr AsyncCheckpointNotify(
       const std::string& ep, const std::string& dir,
       int64_t time_out = FLAGS_rpc_deadline) = 0;
@@ -87,8 +96,9 @@ class RPCClient {
     }
   }
 
- protected:
   virtual void InitImpl() {}
+
+ protected:
   // each trainer have exact one trainer id, it should be static
   static int trainer_id_;
 
diff --git a/paddle/fluid/operators/distributed/rpc_server.cc b/paddle/fluid/operators/distributed/rpc_server.cc
index 3e30ed4ac86bd2cb3f7c4301163e54a947c3d5b4..122619d41b25da488742b4a7192b6a18b8bf9283 100644
--- a/paddle/fluid/operators/distributed/rpc_server.cc
+++ b/paddle/fluid/operators/distributed/rpc_server.cc
@@ -132,6 +132,96 @@ void RPCServer::WaitCond(const std::string& rpc_name) {
       lock, [=] { return (cur_cond_.load() == cond || exit_flag_.load()); });
 }
 
+void RPCServer::RegisterVar(const std::string& var_name,
+                            const std::string& rpc_name,
+                            framework::Scope* scope,
+                            platform::DeviceContext* dev_ctx) {
+  MonomerHandle h;
+  h.var_name_ = var_name;
+  h.rpc_name_ = rpc_name;
+  h.scope_ = scope;
+  h.dev_ctx_ = dev_ctx;
+
+  {
+    std::unique_lock<std::mutex> lock(mutex_);
+    if (var_map_.find(var_name) != var_map_.end()) {
+      PADDLE_ENFORCE(false, "%s alreay in var_map", var_name);
+    }
+    var_map_[var_name] = h;
+  }
+
+  rpc_cond_.notify_all();
+  VLOG(4) << "RegisterVar context:" << h.String();
+}
+
+void RPCServer::IncreaseVarBarrier(const std::string& var_name) {
+  int b = 0;
+  MonomerHandle h;
+  {
+    std::unique_lock<std::mutex> lock(mutex_);
+    b = ++var_map_[var_name].barrier_;
+    h = var_map_[var_name];
+  }
+
+  if (b >= client_num_) {
+    barrier_cond_.notify_all();
+  }
+
+  VLOG(4) << "IncreaseVarBarrier context:" << h.String();
+}
+
+void RPCServer::WaitVarBarrier(const std::string& var_name) {
+  VLOG(4) << "WaitBarrier var_name:" << var_name;
+
+  std::unique_lock<std::mutex> lock(mutex_);
+  barrier_cond_.wait(lock, [&]() {
+    return ((var_map_[var_name].barrier_ >= client_num_ && client_num_ != 0) ||
+            exit_flag_.load());
+  });
+
+  VLOG(4) << "WaitBarrier context: " << var_map_[var_name].String();
+}
+
+void RPCServer::SetVarCond(const std::string& var_name) {
+  VLOG(4) << "SetVarCond var_name:" << var_name;
+  {
+    std::unique_lock<std::mutex> lock(mutex_);
+    if (var_map_.find(var_name) != var_map_.end()) {
+      rpc_cond_.notify_all();
+    }
+  }
+}
+
+void RPCServer::WaitVarCond(const std::string& var_name) {
+  VLOG(4) << "WaitVarCond var_name:" << var_name;
+
+  std::unique_lock<std::mutex> lock(mutex_);
+  rpc_cond_.wait(lock, [=] {
+    return (var_map_.find(var_name) != var_map_.end() || exit_flag_.load());
+  });
+
+  VLOG(4) << "WaitVarCond var_name:" << var_name << " end";
+}
+
+MonomerHandle RPCServer::GetMonomer(const std::string& var_name) {
+  MonomerHandle h;
+  {
+    std::unique_lock<std::mutex> lock(mutex_);
+    h = var_map_[var_name];
+  }
+
+  return h;
+}
+
+void RPCServer::ClearRegisteredVars() {
+  std::unique_lock<std::mutex> lock(mutex_);
+  var_map_.clear();
+}
+
+void RPCServer::ClearVar(const std::string& var_name) {
+  std::unique_lock<std::mutex> lock(mutex_);
+  var_map_.erase(var_name);
+}
 }  // namespace distributed
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/rpc_server.h b/paddle/fluid/operators/distributed/rpc_server.h
index c78c5007a7f262f15305b6c284e8c4fbddef42a0..45d1d3479ce731894c26bbff40f456bbfdc13d44 100644
--- a/paddle/fluid/operators/distributed/rpc_server.h
+++ b/paddle/fluid/operators/distributed/rpc_server.h
@@ -21,12 +21,30 @@
 #include <utility>
 #include <vector>
 
+#include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/operators/distributed/request_handler.h"
+#include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
 namespace distributed {
 
+struct MonomerHandle {
+  std::string var_name_;
+  std::string rpc_name_;
+  framework::Scope* scope_{nullptr};
+  platform::DeviceContext* dev_ctx_{nullptr};
+  int64_t barrier_{0};
+
+  std::string String() {
+    std::stringstream ss;
+    ss << "var_name:" << var_name_ << ", rpc_name:" << rpc_name_
+       << ", scope:" << scope_ << ", dev_ctx:" << dev_ctx_
+       << ", barrier_:" << barrier_;
+    return ss.str();
+  }
+};
+
 class RPCServer {
  public:
   explicit RPCServer(const std::string& address, int client_num)
@@ -67,6 +85,16 @@ class RPCServer {
   void WaitCond(const std::string& rpc_name);
   void IncreaseBatchBarrier(const std::string rpc_name);
 
+  void RegisterVar(const std::string& var_name, const std::string& rpc_name,
+                   framework::Scope* scope, platform::DeviceContext* dev_ctx);
+  void IncreaseVarBarrier(const std::string& var_name);
+  void WaitVarBarrier(const std::string& var_name);
+  void SetVarCond(const std::string& var_name);
+  void WaitVarCond(const std::string& var_name);
+  void ClearRegisteredVars();
+  void ClearVar(const std::string& var_name);
+  MonomerHandle GetMonomer(const std::string& var_name);
+
   void Complete();
 
   void ResetBarrierCounter();
@@ -95,6 +123,9 @@ class RPCServer {
   std::unordered_map<std::string, RequestHandler*> rpc_call_map_;
   std::unordered_map<std::string, int> rpc_thread_num_;
   friend class RequestHandler;
+
+  // TODO(gongwb): use more cond to notify or wait;
+  std::unordered_map<std::string, MonomerHandle> var_map_;
 };
 
 };  // namespace distributed
diff --git a/paddle/fluid/operators/distributed/send_recv.proto.in b/paddle/fluid/operators/distributed/send_recv.proto.in
index 55820c980e8139625c1b589f9d2d68dfee74a212..2637619f304d246fa535bbfc7be3474209b63b0f 100644
--- a/paddle/fluid/operators/distributed/send_recv.proto.in
+++ b/paddle/fluid/operators/distributed/send_recv.proto.in
@@ -28,6 +28,9 @@ service SendRecvService {
   rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {}
 
   rpc CheckpointNotify(VariableMessage) returns (VoidMessage) {}
+
+  rpc GetMonomerVariable(VariableMessage) returns (VariableMessage) {}
+  rpc GetMonomerBarrier(VariableMessage) returns (VoidMessage) {}
 }
 
 // VariableMessage is serialized paddle variable message.
@@ -80,6 +83,7 @@ message VariableMessage {
   // when profile switches from 1 to 2.
   int64 profile = 11;
   int64 trainer_id = 12;
+  string table_name = 13;
 }
 
 message VoidMessage {}
diff --git a/paddle/fluid/operators/distributed/variable_response.h b/paddle/fluid/operators/distributed/variable_response.h
index 4c7fcbbdfb305ce6b4fc9d1edd9738899b200ec6..a4324f67bb99bfdaa19c1a6dba8e907f17635d14 100644
--- a/paddle/fluid/operators/distributed/variable_response.h
+++ b/paddle/fluid/operators/distributed/variable_response.h
@@ -85,6 +85,7 @@ class VariableResponse {
   inline framework::Scope* GetMutableLocalScope() const { return local_scope_; }
   inline std::string Varname() const { return meta_.varname(); }
   inline std::string OutVarname() const { return meta_.out_varname(); }
+  inline std::string TableName() const { return meta_.table_name(); }
 
   // should call parse first.
   framework::Variable* GetVar() {
diff --git a/paddle/fluid/operators/dropout_op.cu b/paddle/fluid/operators/dropout_op.cu
index e011f47e086183a4ef3a3373c17acd6c21b6cf7e..d65491267de1ce3495d8b8250cf0cff570dfcc6a 100644
--- a/paddle/fluid/operators/dropout_op.cu
+++ b/paddle/fluid/operators/dropout_op.cu
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
-#define EIGEN_USE_GPU
 #include <thrust/device_ptr.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/random.h>
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cu b/paddle/fluid/operators/elementwise/elementwise_add_op.cu
index 2fb7eeb4b9e3119a6eea3e69a2a6002a80f6c0f3..fed12785f47e1b8eea3f053712830901bee3bdc9 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cu
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
 #include "paddle/fluid/platform/float16.h"
 
diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cu b/paddle/fluid/operators/elementwise/elementwise_div_op.cu
index c5a1a7e08d89f3ef205af4c37246f8fa288189f3..1a149298fd33f132a90ff5de3b35dd5894a4ae68 100644
--- a/paddle/fluid/operators/elementwise/elementwise_div_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cu
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/elementwise/elementwise_div_op.h"
 
 namespace ops = paddle::operators;
diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.cu b/paddle/fluid/operators/elementwise/elementwise_max_op.cu
index a90dcd3ecf0da114110db5946e111a8b3a925e42..5d086a1b29febd8e57507eced7683f414ca34e07 100644
--- a/paddle/fluid/operators/elementwise/elementwise_max_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_max_op.cu
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/elementwise/elementwise_max_op.h"
 
 namespace ops = paddle::operators;
diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.cu b/paddle/fluid/operators/elementwise/elementwise_min_op.cu
index ab77709c28c15a925bd3deac07c43e12b12cb781..cf93e5a97a3f3110aae907c593f58dbab0f9d090 100644
--- a/paddle/fluid/operators/elementwise/elementwise_min_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_min_op.cu
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/elementwise/elementwise_min_op.h"
 
 namespace ops = paddle::operators;
diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu
index 4d16bc38e1d8e4cbbe3afbe08f233e14329e0f2e..833c4072826c58277bc23e03b787fafbbaa73d03 100644
--- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/elementwise/elementwise_mul_op.h"
 
 namespace ops = paddle::operators;
diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h
index dc25bc57103286ce183a4649964fd96c62169b7f..a8b8a67a114b956f2d6b1b072ef343a179114b34 100644
--- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h
@@ -60,15 +60,37 @@ template <typename DeviceContext, typename T>
 class ElementwiseMulKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<framework::LoDTensor>("X");
+    auto x_var = ctx.InputVar("X");
+    PADDLE_ENFORCE(x_var != nullptr,
+                   "Cannot get input Variable X, variable name = %s",
+                   ctx.op().Input("X"));
     auto* y = ctx.Input<framework::LoDTensor>("Y");
-    auto* z = ctx.Output<framework::LoDTensor>("Out");
+
+    framework::Tensor x, *z;
+    if (x_var->IsType<framework::SelectedRows>()) {
+      PADDLE_ENFORCE(y->dims().size() == 1 && y->dims()[0] == 1,
+                     "For elementwise_op, if X is Sparse, Y must be scalar.");
+      auto& x_sele = x_var->Get<framework::SelectedRows>();
+      auto out_sele = ctx.Output<framework::SelectedRows>("Out");
+      x = x_sele.value();
+      out_sele->set_rows(x_sele.rows());
+      out_sele->set_height(x_sele.height());
+      out_sele->mutable_value()->Resize(x_sele.value().dims());
+      out_sele->mutable_value()->mutable_data(ctx.GetPlace(), x.type());
+      z = ctx.Output<framework::SelectedRows>("Out")->mutable_value();
+    } else if (x_var->IsType<framework::LoDTensor>()) {
+      x = x_var->Get<framework::LoDTensor>();
+      z = ctx.Output<framework::LoDTensor>("Out");
+    } else {
+      PADDLE_THROW("X's type[%s] is not supported by elementwise_op.",
+                   x_var->Type().name());
+    }
 
     z->mutable_data<T>(ctx.GetPlace());
-    if (x->numel() == y->numel()) {
-      elementwise_mul<DeviceContext, T>(ctx, x, y, z);
+    if (x.numel() == y->numel()) {
+      elementwise_mul<DeviceContext, T>(ctx, &x, y, z);
     } else {
-      default_elementwise_mul<DeviceContext, T>(ctx, x, y, z);
+      default_elementwise_mul<DeviceContext, T>(ctx, &x, y, z);
     }
   }
 };
diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h
index 85a7817be9b3a82d40853b417d78a7fdf67f6c1f..87bf7c6b156f32b8f6a1abc30b0676e1d4711d64 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op.h
@@ -40,21 +40,28 @@ class ElementwiseOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of elementwise op should not be null.");
 
-    PADDLE_ENFORCE(
-        ctx->GetInputsVarType("X").front() ==
-            framework::proto::VarType::LOD_TENSOR,
-        "The input var's type should be LoDTensor, but the received is %s",
-        ctx->Inputs("X").front(), ctx->GetInputsVarType("X").front());
     PADDLE_ENFORCE(
         ctx->GetInputsVarType("Y").front() ==
             framework::proto::VarType::LOD_TENSOR,
-        "The input var's type should be LoDTensor, but the received is %s",
-        ctx->Inputs("Y").front(), ctx->GetInputsVarType("Y").front());
-
-    auto x_dim = ctx->GetInputDim("X");
-    auto y_dim = ctx->GetInputDim("Y");
-    PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(),
-                      "Rank of first input must >= rank of second input.");
+        "The input var's type should be LoDTensor, but the received is %s [%s]",
+        ctx->GetInputsVarType("Y").front(), ctx->Inputs("Y").front());
+
+    if (ctx->GetInputsVarType("X").front() ==
+        framework::proto::VarType::LOD_TENSOR) {
+      auto x_dim = ctx->GetInputDim("X");
+      auto y_dim = ctx->GetInputDim("Y");
+      PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(),
+                        "Rank of first input must >= rank of second input.");
+    } else if (ctx->GetInputsVarType("X").front() ==
+               framework::proto::VarType::SELECTED_ROWS) {
+      PADDLE_ENFORCE((ctx->GetInputDim("Y").size() == 1u) &&
+                         (ctx->GetInputDim("Y")[0] == 1),
+                     "For elementwise_op, if X is Sparse, "
+                     "Y must be scalar.");
+    } else {
+      PADDLE_THROW("X's type[%s] is not supported by elementwise_op.",
+                   ctx->GetInputsVarType("X").front());
+    }
 
     ctx->ShareDim("X", /*->*/ "Out");
     ctx->ShareLoD("X", /*->*/ "Out");
diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op.cu b/paddle/fluid/operators/elementwise/elementwise_pow_op.cu
index 6ee0779f23bc2c734aa1d439abb12f366227e686..9263dbfebfd00451f3e67c3ca9d2081b5b4904bd 100644
--- a/paddle/fluid/operators/elementwise/elementwise_pow_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_pow_op.cu
@@ -8,8 +8,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/elementwise/elementwise_pow_op.h"
 
 namespace ops = paddle::operators;
diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu
index 8d9bf7c4d81d49d83b5d1cf0369be5c9957242b4..6f17d3292f307b009c640738109d5a4f4ca4caa9 100644
--- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/elementwise/elementwise_sub_op.h"
 
 namespace ops = paddle::operators;
diff --git a/paddle/fluid/operators/expand_op.cu b/paddle/fluid/operators/expand_op.cu
index 60363bfc86d7d1a79d7b018cee43a41c1247a994..d95c9b61802b5fe7059e1c95a50776db5aa7ad93 100644
--- a/paddle/fluid/operators/expand_op.cu
+++ b/paddle/fluid/operators/expand_op.cu
@@ -11,9 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
-#define EIGEN_USE_GPU
-
 #include "paddle/fluid/operators/expand_op.h"
 
 namespace ops = paddle::operators;
diff --git a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc
index 6d463538d232e1a38f845e7abc3786568ca3bb21..1eb6523a2dfb358490a07bf1b806d5638442a4d5 100644
--- a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc
+++ b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc
@@ -217,13 +217,13 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> {
   auto& act_gate_str = ctx.Attr<std::string>("gate_activation");               \
   auto& act_cell_str = ctx.Attr<std::string>("cell_activation");               \
   auto& act_cand_str = ctx.Attr<std::string>("candidate_activation");          \
-  if (platform::jit::MayIUse(platform::jit::avx)) {                            \
-    math::VecActivations<T, platform::jit::avx> act_functor;                   \
+  if (platform::MayIUse(platform::avx)) {                                      \
+    math::VecActivations<T, platform::avx> act_functor;                        \
     act_gate = act_functor(act_gate_str);                                      \
     act_cell = act_functor(act_cell_str);                                      \
     act_cand = act_functor(act_cand_str);                                      \
   } else {                                                                     \
-    math::VecActivations<T, platform::jit::isa_any> act_functor;               \
+    math::VecActivations<T, platform::isa_any> act_functor;                    \
     act_gate = act_functor(act_gate_str);                                      \
     act_cell = act_functor(act_cell_str);                                      \
     act_cand = act_functor(act_cand_str);                                      \
diff --git a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc
index 288b56fc2485138b20c5b53af3e950f1c1886ba5..17ed9771d074cf7ae8c6735e4cb859139503a0af 100644
--- a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc
+++ b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc
@@ -151,11 +151,11 @@ class FusionSeqExpandConcatFCOpKernel : public framework::OpKernel<T> {
 
     std::function<void(const int, const T*, T*)> fc_act;
     auto& fc_act_str = ctx.Attr<std::string>("fc_activation");
-    if (platform::jit::MayIUse(platform::jit::avx)) {
-      math::VecActivations<T, platform::jit::avx> act_functor;
+    if (platform::MayIUse(platform::avx)) {
+      math::VecActivations<T, platform::avx> act_functor;
       fc_act = act_functor(fc_act_str);
     } else {
-      math::VecActivations<T, platform::jit::isa_any> act_functor;
+      math::VecActivations<T, platform::isa_any> act_functor;
       fc_act = act_functor(fc_act_str);
     }
 
diff --git a/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc b/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a4ae19d9c1e3bb2af3eb95650fbb5aabb8944a36
--- /dev/null
+++ b/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc
@@ -0,0 +1,117 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/tensor_util.h"
+
+namespace paddle {
+namespace operators {
+
+class GetTensorFromSelectedRowsOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "GetTensorFromSelectedRowsOp must has input X.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "GetTensorFromSelectedRowsOp must has output Out.");
+    PADDLE_ENFORCE(
+        ctx->GetInputsVarType("X").front() ==
+            framework::proto::VarType::SELECTED_ROWS,
+        "The input X's type should be SelectedRows, but the received is %s",
+        ctx->Inputs("X").front(), ctx->GetInputsVarType("X").front());
+    PADDLE_ENFORCE(
+        ctx->GetOutputsVarType("Out").front() ==
+            framework::proto::VarType::LOD_TENSOR,
+        "The output Out's type should be LoDTensor, but the received is %s",
+        ctx->Outputs("Out").front(), ctx->GetOutputsVarType("Out").front());
+
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::GetDataTypeOfVar(ctx.InputVar("X")), ctx.device_context());
+  }
+};
+
+class GetTensorFromSelectedRowsKernel {
+ public:
+  void operator()(const framework::ExecutionContext &ctx) const {
+    auto *x = ctx.Input<framework::SelectedRows>("X");
+    auto *out = ctx.Output<framework::LoDTensor>("Out");
+
+    out->Resize(x->value().dims());
+    out->mutable_data(ctx.GetPlace(), x->value().type());
+    framework::TensorCopy(x->value(), ctx.GetPlace(), ctx.device_context(),
+                          out);
+  }
+};
+
+class GetTensorFromSelectedRowsOpProtoMaker
+    : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "The input type is SelectedRows.");
+    AddOutput("Out", "The output type is LoDTensor.");
+    AddComment(
+        R"DOC(
+GetTensorFromSelectedRows Operator
+
+GetTensorFromSelectedRows is used to get the tensor from SelectedRows.
+
+)DOC");
+  }
+};
+
+class GetTensorFromSelectedRowsOpVarTypeInference
+    : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc &op_desc,
+                  framework::BlockDesc *block) const final {
+    auto out_var_name = op_desc.Output("Out").front();
+    auto in_var_name = op_desc.Input("X").front();
+
+    auto out_var = block->FindRecursiveOrCreateVar(out_var_name);
+    auto in_var = block->FindRecursiveOrCreateVar(in_var_name);
+    out_var.SetType(framework::proto::VarType::LOD_TENSOR);
+    out_var.SetDataType(in_var.GetDataType());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(get_tensor_from_selected_rows,
+                  ops::GetTensorFromSelectedRowsOp,
+                  ops::GetTensorFromSelectedRowsOpProtoMaker,
+                  ops::GetTensorFromSelectedRowsOpVarTypeInference);
+
+REGISTER_OP_CPU_KERNEL_FUNCTOR(get_tensor_from_selected_rows, float,
+                               ops::GetTensorFromSelectedRowsKernel, double,
+                               ops::GetTensorFromSelectedRowsKernel, int,
+                               ops::GetTensorFromSelectedRowsKernel, int64_t,
+                               ops::GetTensorFromSelectedRowsKernel);
+
+#ifdef PADDLE_WITH_CUDA
+REGISTER_OP_CUDA_KERNEL_FUNCTOR(get_tensor_from_selected_rows, float,
+                                ops::GetTensorFromSelectedRowsKernel, double,
+                                ops::GetTensorFromSelectedRowsKernel, int,
+                                ops::GetTensorFromSelectedRowsKernel, int64_t,
+                                ops::GetTensorFromSelectedRowsKernel);
+#endif
diff --git a/paddle/fluid/operators/gru_unit_op.cu b/paddle/fluid/operators/gru_unit_op.cu
index fc92b3d4a7a5a933f31b21d18238de386b3afb4d..37689901ecbeeda44f52a2fc7a686f4edf6682bb 100644
--- a/paddle/fluid/operators/gru_unit_op.cu
+++ b/paddle/fluid/operators/gru_unit_op.cu
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/gru_unit_op.h"
 
 namespace ops = paddle::operators;
diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.cc b/paddle/fluid/operators/hierarchical_sigmoid_op.cc
index 972dcf5494e9acd47e7ff615db45f056a43724a6..0dbcc442dfa1a395cdb0ffbd69eb78ad66cfaa17 100644
--- a/paddle/fluid/operators/hierarchical_sigmoid_op.cc
+++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc
@@ -150,14 +150,14 @@ class HierarchicalSigmoidGradOp : public framework::OperatorWithKernel {
                    "Output(W@Grad should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
                    "Output(X@Grad should not be null.");
-    if (!ctx->Attrs().Get<bool>("is_sparse")) {
-      if (ctx->HasOutput(framework::GradVarName("Bias"))) {
-        ctx->SetOutputDim(framework::GradVarName("Bias"),
-                          ctx->GetInputDim("Bias"));
-      }
-      ctx->SetOutputDim(framework::GradVarName("W"), ctx->GetInputDim("W"));
+
+    if (ctx->HasOutput(framework::GradVarName("Bias"))) {
+      ctx->SetOutputDim(framework::GradVarName("Bias"),
+                        ctx->GetInputDim("Bias"));
     }
+    ctx->SetOutputDim(framework::GradVarName("W"), ctx->GetInputDim("W"));
     ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+    ctx->ShareLoD("X", /*->*/ framework::GradVarName("X"));
   }
 
  protected:
diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h
index 07ff8f947e59d2954783e2ba537bfce3cb320f22..b73a32af89e882ac02623dd1d312f400a78fc47a 100644
--- a/paddle/fluid/operators/hierarchical_sigmoid_op.h
+++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h
@@ -185,7 +185,6 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
           ctx.Output<framework::SelectedRows>(framework::GradVarName("W"));
       w_grad->set_rows(real_rows);
       // Build a map of id -> row_index to speed up finding the index of one id
-      w_grad->SyncIndex();
       w_grad->set_height(w.dims()[0]);
       auto* w_grad_value = w_grad->mutable_value();
       framework::DDim temp_dim(w.dims());
diff --git a/paddle/fluid/operators/hinge_loss_op.cu b/paddle/fluid/operators/hinge_loss_op.cu
index 9c0a85bee6e28865225c1848ea5a378f48932ceb..b5ea0a702e0e540c1831ca241a5def19f86c239c 100644
--- a/paddle/fluid/operators/hinge_loss_op.cu
+++ b/paddle/fluid/operators/hinge_loss_op.cu
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/hinge_loss_op.h"
 
 namespace ops = paddle::operators;
diff --git a/paddle/fluid/operators/huber_loss_op.cu b/paddle/fluid/operators/huber_loss_op.cu
index 659464df9dc0e7c8cd276bd0bbf7072361aa3abf..09c743c4275169ba8c53ccbd428100b2fc4483d6 100644
--- a/paddle/fluid/operators/huber_loss_op.cu
+++ b/paddle/fluid/operators/huber_loss_op.cu
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/huber_loss_op.h"
 
 namespace ops = paddle::operators;
diff --git a/paddle/fluid/operators/im2sequence_op.cu b/paddle/fluid/operators/im2sequence_op.cu
index e0a5a90c1c3c47ea45b3f83ae969c1861783ff60..1c34640618d58d3b5fe627fa6596260a7b687d05 100644
--- a/paddle/fluid/operators/im2sequence_op.cu
+++ b/paddle/fluid/operators/im2sequence_op.cu
@@ -11,8 +11,6 @@
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License. */
-
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/im2sequence_op.h"
 
 namespace ops = paddle::operators;
diff --git a/paddle/fluid/operators/isfinite_op.cu b/paddle/fluid/operators/isfinite_op.cu
index 8d1268b18c6fec03063051f545075209a6fcde27..995969cd42f08c7fa948262e42793106e745b3a7 100644
--- a/paddle/fluid/operators/isfinite_op.cu
+++ b/paddle/fluid/operators/isfinite_op.cu
@@ -11,8 +11,6 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/isfinite_op.h"
 #include "paddle/fluid/platform/float16.h"
 
diff --git a/paddle/fluid/operators/l1_norm_op.cu b/paddle/fluid/operators/l1_norm_op.cu
index 1b48571dd7378c1c2a6628662024bc7bcc08d2a6..a5c29bbf5debdd11f6e5b28b3a8b48c2c484517a 100644
--- a/paddle/fluid/operators/l1_norm_op.cu
+++ b/paddle/fluid/operators/l1_norm_op.cu
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/l1_norm_op.h"
 
 namespace ops = paddle::operators;
diff --git a/paddle/fluid/operators/load_combine_op.cc b/paddle/fluid/operators/load_combine_op.cc
index 0522a94195786c767194ec727d982a60451e7c62..9d1423915afc25889b9fa96963d6f9514bea2870 100644
--- a/paddle/fluid/operators/load_combine_op.cc
+++ b/paddle/fluid/operators/load_combine_op.cc
@@ -32,16 +32,26 @@ class LoadCombineOp : public framework::OperatorBase {
                const platform::Place &place) const override {
     auto filename = Attr<std::string>("file_path");
     auto load_as_fp16 = Attr<bool>("load_as_fp16");
-
-    std::ifstream fin(filename);
-    PADDLE_ENFORCE(static_cast<bool>(fin),
-                   "Cannot open file %s for load_combine op", filename);
-
+    auto model_from_memory = Attr<bool>("model_from_memory");
     auto out_var_names = Outputs("Out");
     PADDLE_ENFORCE_GT(
         static_cast<int>(out_var_names.size()), 0,
         "The number of output variables should be greater than 0.");
-
+    if (!model_from_memory) {
+      std::ifstream fin(filename);
+      PADDLE_ENFORCE(static_cast<bool>(fin),
+                     "Cannot open file %s for load_combine op", filename);
+      LoadParamsFromBuffer(scope, place, &fin, load_as_fp16, out_var_names);
+    } else {
+      PADDLE_ENFORCE(!filename.empty(), "Cannot load file from memory");
+      std::stringstream fin(filename);
+      LoadParamsFromBuffer(scope, place, &fin, load_as_fp16, out_var_names);
+    }
+  }
+  void LoadParamsFromBuffer(
+      const framework::Scope &scope, const platform::Place &place,
+      std::istream *buffer, bool load_as_fp16,
+      const std::vector<std::string> &out_var_names) const {
     platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
     auto &dev_ctx = *pool.Get(place);
 
@@ -54,11 +64,10 @@ class LoadCombineOp : public framework::OperatorBase {
       auto *tensor = out_var->GetMutable<framework::LoDTensor>();
 
       // Error checking
-      PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot read more from file %s",
-                     filename);
+      PADDLE_ENFORCE(static_cast<bool>(buffer), "Cannot read more");
 
       // Get data from fin to tensor
-      DeserializeFromStream(fin, tensor, dev_ctx);
+      DeserializeFromStream(*buffer, tensor, dev_ctx);
 
       auto in_dtype = framework::ToDataType(tensor->type());
       auto out_dtype =
@@ -103,11 +112,17 @@ class LoadCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker {
                          "LoDTensors will be loaded from \"file_path\".")
         .AddCustomChecker(
             [](const std::string &path) { return !path.empty(); });
+    AddAttr<bool>("model_from_memory",
+                  "(boolean, default false)"
+                  "If true, file_path is in memory, and LoDTensors will be "
+                  "loaded directly from memory")
+        .SetDefault(false);
     AddComment(R"DOC(
 LoadCombine Operator.
 
-LoadCombine operator loads LoDTensor variables from a file. The file should 
-contain one or more LoDTensors serialized using the SaveCombine operator. The 
+LoadCombine operator loads LoDTensor variables from a file, which could be 
+loaded in memory already. The file should contain one or more LoDTensors 
+serialized using the SaveCombine operator. The
 LoadCombine operator applies a deserialization strategy to appropriately load 
 the LodTensors, and this strategy complements the serialization strategy used 
 in the SaveCombine operator. Hence, the LoadCombine operator is tightly coupled
diff --git a/paddle/fluid/operators/log_loss_op.cu b/paddle/fluid/operators/log_loss_op.cu
index e8bf7d8159bf8b16bf4397e7765918c060124db3..280913c43a2749ddd5fbd3ae1905f1b823dd525d 100644
--- a/paddle/fluid/operators/log_loss_op.cu
+++ b/paddle/fluid/operators/log_loss_op.cu
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/log_loss_op.h"
 
 namespace ops = paddle::operators;
diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc
index 3226a727b1f5f6de9e97ce2068381be7c9b69ff3..0029932bc068c7f61ddb41cf3f87c9e1a5cd7749 100644
--- a/paddle/fluid/operators/lookup_table_op.cc
+++ b/paddle/fluid/operators/lookup_table_op.cc
@@ -87,6 +87,25 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
                   "(boolean, default false) "
                   "If the grad op reuse the input's variable.")
         .SetDefault(false);
+
+    // for parameter prefetch
+    AddAttr<bool>("remote_prefetch", "").SetDefault(false);
+    AddAttr<int>("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0);
+    AddAttr<std::vector<int>>("height_sections",
+                              "Height for each output SelectedRows.")
+        .SetDefault(std::vector<int>({}));
+    AddAttr<std::vector<std::string>>(
+        "epmap",
+        "(string vector, default 127.0.0.1:6164)"
+        "Server endpoints in the order of input variables for mapping")
+        .SetDefault({});
+    AddAttr<std::vector<std::string>>(
+        "table_names",
+        "(string vector, the splited table names that will be fetched from "
+        "parameter server)"
+        "in the order of input variables for mapping")
+        .SetDefault({});
+
     AddComment(R"DOC(
 Lookup Table Operator.
 
diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu
index abd5dce8f7e7146a1671a387328c177e5e6e0a85..6a0d6bad512fe7cc15e60ed25028bc3cbbbca2ab 100644
--- a/paddle/fluid/operators/lookup_table_op.cu
+++ b/paddle/fluid/operators/lookup_table_op.cu
@@ -31,8 +31,8 @@ __global__ void LookupTable(T *output, const T *table, const int64_t *ids,
 
   while (idy < K) {
     int64_t id = ids[idy];
-    PADDLE_ASSERT(id >= 0);
-    PADDLE_ASSERT(id < N);
+    PADDLE_ASSERT_MSG_CODE(id >= 0, "received id:", id);
+    PADDLE_ASSERT_MSG_CODE(id < N, "received id:", id);
     T *out = output + idy * D;
     const T *tab = table + id * D;
     for (int i = idx; i < D; i += BlockDimX) {
@@ -57,9 +57,9 @@ __global__ void LookupTableGrad(T *table, const T *output, const int64_t *ids,
   int idy = blockIdx.x + threadIdx.y * GridDimX;
 
   while (idy < K) {
-    int id = ids[idy];
-    PADDLE_ASSERT(id >= 0);
-    PADDLE_ASSERT(id < N);
+    int64_t id = ids[idy];
+    PADDLE_ASSERT_MSG_CODE(id >= 0, "received id:", id);
+    PADDLE_ASSERT_MSG_CODE(id < N, "received id:", id);
     const T *out = output + idy * D;
     T *tab = table + id * D;
     for (int i = idx; i < D; i += BlockDimX) {
@@ -78,27 +78,47 @@ class LookupTableCUDAKernel : public framework::OpKernel<T> {
     auto *output_t = context.Output<LoDTensor>("Out");
     int64_t padding_idx = context.Attr<int64_t>("padding_idx");
 
-    size_t N = table_t->dims()[0];
-    size_t D = table_t->dims()[1];
-    size_t K = ids_t->numel();
-
-    auto *ids = ids_t->data<int64_t>();
-    auto *table = table_t->data<T>();
-    auto *output = output_t->mutable_data<T>(context.GetPlace());
-
-    dim3 threads(128, 8);
-    dim3 grids(8, 1);
-
-    if (padding_idx == -1)
-      LookupTable<
-          T, 128, 8, 8,
-          false><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
-          output, table, ids, N, K, D, padding_idx);
-    else
-      LookupTable<
-          T, 128, 8, 8,
-          true><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
-          output, table, ids, N, K, D, padding_idx);
+    auto id_name = context.Inputs("Ids").front();
+    auto out_name = context.Outputs("Out").front();
+
+    // for remote prefetch
+    auto epmap = context.Attr<std::vector<std::string>>("epmap");
+    auto height_sections = context.Attr<std::vector<int>>("height_sections");
+    auto table_names = context.Attr<std::vector<std::string>>("table_names");
+
+    if (!epmap.empty()) {
+// if epmap is not empty, then the parameter will be fetched from remote
+// parameter
+// server
+#ifdef PADDLE_WITH_DISTRIBUTE
+      operators::distributed::prefetch(id_name, out_name, table_names, epmap,
+                                       height_sections, context);
+#else
+      PADDLE_THROW(
+          "paddle is not compiled with distribute support, can not do "
+          "parameter prefetch!");
+#endif
+    } else {
+      size_t N = table_t->dims()[0];
+      size_t D = table_t->dims()[1];
+      size_t K = ids_t->numel();
+
+      auto *ids = ids_t->data<int64_t>();
+      auto *table = table_t->data<T>();
+      auto *output = output_t->mutable_data<T>(context.GetPlace());
+
+      dim3 threads(128, 8);
+      dim3 grids(8, 1);
+
+      if (padding_idx == -1)
+        LookupTable<T, 128, 8, 8, false><<<
+            grids, threads, 0, context.cuda_device_context().stream()>>>(
+            output, table, ids, N, K, D, padding_idx);
+      else
+        LookupTable<T, 128, 8, 8, true><<<
+            grids, threads, 0, context.cuda_device_context().stream()>>>(
+            output, table, ids, N, K, D, padding_idx);
+    }
   }
 };
 
@@ -109,6 +129,7 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
     auto &dev_ctx =
         context.template device_context<platform::CUDADeviceContext>();
     bool is_sparse = context.Attr<bool>("is_sparse");
+
     // Since paddings are not trainable and fixed in forward, the gradient of
     // paddings makes no sense and we don't deal with it in backward.
     if (is_sparse) {
diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h
index e504c4f0cd5c0feaef4a251fad57b389a10a2ce7..3a73a7637c6d7d3eff7443802a4a52be9149e0ef 100644
--- a/paddle/fluid/operators/lookup_table_op.h
+++ b/paddle/fluid/operators/lookup_table_op.h
@@ -23,6 +23,10 @@ limitations under the License. */
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/operators/math/blas.h"
 
+#ifdef PADDLE_WITH_DISTRIBUTE
+#include "paddle/fluid/operators/distributed/parameter_prefetch.h"
+#endif
+
 namespace paddle {
 namespace operators {
 
@@ -41,44 +45,66 @@ class LookupTableKernel : public framework::OpKernel<T> {
     auto *output_t = context.Output<LoDTensor>("Out");  // float tensor
     auto *table_var = context.InputVar("W");
 
-    int64_t padding_idx = context.Attr<int64_t>("padding_idx");
-    int64_t *ids = const_cast<int64_t *>(ids_t->data<int64_t>());
-    int64_t ids_numel = ids_t->numel();
-
-    if (table_var->IsType<LoDTensor>()) {
-      auto *table_t = context.Input<LoDTensor>("W");
-      int64_t row_number = table_t->dims()[0];
-      int64_t row_width = table_t->dims()[1];
-
-      auto *table = table_t->data<T>();
-      auto *output = output_t->mutable_data<T>(context.GetPlace());
-
-      for (int64_t i = 0; i < ids_numel; ++i) {
-        if (padding_idx != kNoPadding && ids[i] == padding_idx) {
-          memset(output + i * row_width, 0, row_width * sizeof(T));
-        } else {
-          PADDLE_ENFORCE_LT(ids[i], row_number);
-          PADDLE_ENFORCE_GE(ids[i], 0, "ids %d", i);
-          memcpy(output + i * row_width, table + ids[i] * row_width,
-                 row_width * sizeof(T));
+    auto id_name = context.Inputs("Ids").front();
+    auto out_name = context.Outputs("Out").front();
+
+    // for remote prefetch
+    auto epmap = context.Attr<std::vector<std::string>>("epmap");
+    auto height_sections = context.Attr<std::vector<int>>("height_sections");
+    auto table_names = context.Attr<std::vector<std::string>>("table_names");
+
+    if (!epmap.empty()) {
+// if epmap is not empty, then the parameter will be fetched from remote
+// parameter
+// server
+#ifdef PADDLE_WITH_DISTRIBUTE
+      operators::distributed::prefetch(id_name, out_name, table_names, epmap,
+                                       height_sections, context);
+#else
+      PADDLE_THROW(
+          "paddle is not compiled with distribute support, can not do "
+          "parameter prefetch!");
+#endif
+    } else {
+      int64_t padding_idx = context.Attr<int64_t>("padding_idx");
+      int64_t *ids = const_cast<int64_t *>(ids_t->data<int64_t>());
+      int64_t ids_numel = ids_t->numel();
+
+      if (table_var->IsType<LoDTensor>()) {
+        auto *table_t = context.Input<LoDTensor>("W");
+        int64_t row_number = table_t->dims()[0];
+        int64_t row_width = table_t->dims()[1];
+
+        auto *table = table_t->data<T>();
+        auto *output = output_t->mutable_data<T>(context.GetPlace());
+
+        for (int64_t i = 0; i < ids_numel; ++i) {
+          if (padding_idx != kNoPadding && ids[i] == padding_idx) {
+            memset(output + i * row_width, 0, row_width * sizeof(T));
+          } else {
+            PADDLE_ENFORCE_LT(ids[i], row_number);
+            PADDLE_ENFORCE_GE(ids[i], 0, "ids %d", i);
+            memcpy(output + i * row_width, table + ids[i] * row_width,
+                   row_width * sizeof(T));
+          }
         }
-      }
-    } else if (table_var->IsType<SelectedRows>()) {
-      const auto &table_t = table_var->Get<SelectedRows>();
-      int64_t row_width = table_t.value().dims()[1];
-      const auto *table = table_t.value().data<T>();
-      auto *output = output_t->mutable_data<T>(context.GetPlace());
-
-      auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
-      for (int64_t i = 0; i < ids_numel; ++i) {
-        if (padding_idx != kNoPadding && ids[i] == padding_idx) {
-          memset(output + i * row_width, 0, row_width * sizeof(T));
-        } else {
-          PADDLE_ENFORCE_GE(ids[i], 0);
-          auto id_index = table_t.Index(ids[i]);
-          PADDLE_ENFORCE_GE(id_index, 0, "the input key should be exists.");
-          blas.VCOPY(row_width, table + id_index * row_width,
-                     output + i * row_width);
+      } else if (table_var->IsType<SelectedRows>()) {
+        const auto &table_t = table_var->Get<SelectedRows>();
+        int64_t row_width = table_t.value().dims()[1];
+        const auto *table = table_t.value().data<T>();
+        auto *output = output_t->mutable_data<T>(context.GetPlace());
+
+        auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
+        for (int64_t i = 0; i < ids_numel; ++i) {
+          if (padding_idx != kNoPadding && ids[i] == padding_idx) {
+            memset(output + i * row_width, 0, row_width * sizeof(T));
+          } else {
+            PADDLE_ENFORCE_GE(ids[i], 0);
+            auto id_index = table_t.Index(ids[i]);
+            PADDLE_ENFORCE_GE(id_index, 0, "the input key should be exists.");
+            blas.VCOPY(row_width, table + id_index * row_width,
+                       output + i * row_width);
+          }
         }
       }
     }
diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt
index 63363086adbf12c38ac09949ac20483116ccf4ee..b3d2ea38eb1bfffadc1f68c5a34bc4d557bdea3b 100644
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -59,6 +59,7 @@ math_library(matrix_bit_code)
 
 math_library(unpooling)
 math_library(vol2col)
+math_library(prelu)
 
 cc_test(math_function_test SRCS math_function_test.cc DEPS math_function)
 cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor)
diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h
index 6734df1530893777fca3ccf66b1e8aab40e41cfc..9f3a81f22cc52bef719f472e43f91bc81dfe2af6 100644
--- a/paddle/fluid/operators/math/blas.h
+++ b/paddle/fluid/operators/math/blas.h
@@ -168,6 +168,9 @@ class Blas {
   template <typename T>
   void SCAL(int n, const T a, T* x) const;
 
+  template <typename T>
+  T ASUM(int n, T* x, int inc) const;
+
   template <typename T>
   void BatchedGEMM(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N,
                    int K, T alpha, const T* A, const T* B, T beta, T* C,
@@ -269,6 +272,11 @@ class BlasT : private Blas<DeviceContext> {
     Base()->template SCAL<T>(args...);
   }
 
+  template <typename... ARGS>
+  T ASUM(ARGS... args) const {
+    return Base()->template ASUM<T>(args...);
+  }
+
   template <typename... ARGS>
   void BatchedGEMM(ARGS... args) const {
     Base()->template BatchedGEMM<T>(args...);
diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h
index 93bf7c7c88db36807143b136ea800d6e5e49dd43..c84087bb1e4849b27d53e05f046c93f631150f6f 100644
--- a/paddle/fluid/operators/math/blas_impl.h
+++ b/paddle/fluid/operators/math/blas_impl.h
@@ -84,6 +84,11 @@ struct CBlas<float> {
     platform::dynload::cblas_sscal(args...);
   }
 
+  template <typename... ARGS>
+  static float ASUM(ARGS... args) {
+    return platform::dynload::cblas_sasum(args...);
+  }
+
   template <typename... ARGS>
   static void GEMM_BATCH(ARGS... args) {
     platform::dynload::cblas_sgemm_batch(args...);
@@ -174,6 +179,11 @@ struct CBlas<double> {
     platform::dynload::cblas_dscal(args...);
   }
 
+  template <typename... ARGS>
+  static double ASUM(ARGS... args) {
+    return platform::dynload::cblas_dasum(args...);
+  }
+
   template <typename... ARGS>
   static void GEMM_BATCH(ARGS... args) {
     platform::dynload::cblas_dgemm_batch(args...);
@@ -268,6 +278,7 @@ struct CBlas<platform::float16> {
   static void VPOW(...) { PADDLE_THROW("float16 VPOW not supported on CPU"); }
   static void DOT(...) { PADDLE_THROW("float16 DOT not supported on CPU"); };
   static void SCAL(...) { PADDLE_THROW("float16 SCAL not supported on CPU"); };
+  static void ASUM(...) { PADDLE_THROW("float16 ASUM not supported on CPU"); };
 #ifdef PADDLE_WITH_MKLML
   static void GEMM_BATCH(...) {
     PADDLE_THROW("float16 GEMM_BATCH not supported on CPU");
@@ -476,6 +487,21 @@ void Blas<platform::CPUDeviceContext>::SCAL(int n, const T a, T *x) const {
 #endif
 }
 
+template <>
+template <typename T>
+T Blas<platform::CPUDeviceContext>::ASUM(int n, T *x, int inc) const {
+  auto sum = static_cast<T>(0.0);
+#ifdef PADDLE_WITH_MKLML
+  sum = CBlas<T>::ASUM(n, x, inc);
+#else
+  // TODO(jczaja): check if openblas does provide cblas_sasum/cblas_dasum
+  for (int c = 0; c < n; ++c) {
+    sum += x[c];
+  }
+#endif
+  return sum;
+}
+
 template <>
 template <typename T>
 void Blas<platform::CPUDeviceContext>::GEMV(bool trans_a, int M, int N, T alpha,
diff --git a/paddle/fluid/operators/math/context_project.cu b/paddle/fluid/operators/math/context_project.cu
index 16205c0e145ef70666d4eca564488d80bde26d2e..f04b2d15349be329ee228fc8903c9b38a5349634 100644
--- a/paddle/fluid/operators/math/context_project.cu
+++ b/paddle/fluid/operators/math/context_project.cu
@@ -11,9 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
-#define EIGEN_USE_GPU
-
 #include "paddle/fluid/operators/math/context_project.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h
index 7d81aee596934308763002d440f52400f45b5f20..e1e4d168db3ca594b44396a6e30c5bfc03483eaf 100644
--- a/paddle/fluid/operators/math/cpu_vec.h
+++ b/paddle/fluid/operators/math/cpu_vec.h
@@ -77,7 +77,7 @@ inline void vec_scal<double>(const int n, const double a, double* x) {
 #endif
 
 // MKL scal only support inplace, choose this if src and dst are not equal
-template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
+template <typename T, platform::cpu_isa_t isa = platform::isa_any>
 inline void vec_scal(const int n, const T a, const T* x, T* y) {
   for (int i = 0; i < n; ++i) {
     y[i] = a * x[i];
@@ -85,12 +85,12 @@ inline void vec_scal(const int n, const T a, const T* x, T* y) {
 }
 
 template <>
-inline void vec_scal<float, platform::jit::avx>(const int n, const float a,
-                                                const float* x, float* y) {
+inline void vec_scal<float, platform::avx>(const int n, const float a,
+                                           const float* x, float* y) {
 #ifdef __AVX__
   constexpr int block = YMM_FLOAT_BLOCK;
   if (n < block) {
-    vec_scal<float, platform::jit::isa_any>(n, a, x, y);
+    vec_scal<float, platform::isa_any>(n, a, x, y);
     return;
   }
   const int rest = n % block;
@@ -114,24 +114,24 @@ inline void vec_scal<float, platform::jit::avx>(const int n, const float a,
     y[i] = a * x[i];
   }
 #else
-  vec_scal<float, platform::jit::isa_any>(n, a, x, y);
+  vec_scal<float, platform::isa_any>(n, a, x, y);
 #endif
 }
 
 template <>
-inline void vec_scal<float, platform::jit::avx2>(const int n, const float a,
-                                                 const float* x, float* y) {
-  vec_scal<float, platform::jit::avx>(n, a, x, y);
+inline void vec_scal<float, platform::avx2>(const int n, const float a,
+                                            const float* x, float* y) {
+  vec_scal<float, platform::avx>(n, a, x, y);
 }
 
 template <>
-inline void vec_scal<float, platform::jit::avx512f>(const int n, const float a,
-                                                    const float* x, float* y) {
+inline void vec_scal<float, platform::avx512f>(const int n, const float a,
+                                               const float* x, float* y) {
   // TODO(TJ): enable me
-  vec_scal<float, platform::jit::avx2>(n, a, x, y);
+  vec_scal<float, platform::avx2>(n, a, x, y);
 }
 
-template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
+template <typename T, platform::cpu_isa_t isa = platform::isa_any>
 inline void vec_bias_sub(const int n, const T a, const T* x, T* y) {
   for (int i = 0; i < n; ++i) {
     y[i] = a - x[i];
@@ -139,12 +139,12 @@ inline void vec_bias_sub(const int n, const T a, const T* x, T* y) {
 }
 
 template <>
-inline void vec_bias_sub<float, platform::jit::avx>(const int n, const float a,
-                                                    const float* x, float* y) {
+inline void vec_bias_sub<float, platform::avx>(const int n, const float a,
+                                               const float* x, float* y) {
 #ifdef __AVX__
   constexpr int block = YMM_FLOAT_BLOCK;
   if (n < block) {
-    vec_bias_sub<float, platform::jit::isa_any>(n, a, x, y);
+    vec_bias_sub<float, platform::isa_any>(n, a, x, y);
     return;
   }
   const int rest = n % block;
@@ -168,27 +168,25 @@ inline void vec_bias_sub<float, platform::jit::avx>(const int n, const float a,
     y[i] = a - x[i];
   }
 #else
-  vec_bias_sub<float, platform::jit::isa_any>(n, a, x, y);
+  vec_bias_sub<float, platform::isa_any>(n, a, x, y);
 #endif
 }
 
 template <>
-inline void vec_bias_sub<float, platform::jit::avx2>(const int n, const float a,
-                                                     const float* x, float* y) {
-  vec_bias_sub<float, platform::jit::avx>(n, a, x, y);
+inline void vec_bias_sub<float, platform::avx2>(const int n, const float a,
+                                                const float* x, float* y) {
+  vec_bias_sub<float, platform::avx>(n, a, x, y);
 }
 
 template <>
-inline void vec_bias_sub<float, platform::jit::avx512f>(const int n,
-                                                        const float a,
-                                                        const float* x,
-                                                        float* y) {
+inline void vec_bias_sub<float, platform::avx512f>(const int n, const float a,
+                                                   const float* x, float* y) {
   // TODO(TJ): enable me
-  vec_bias_sub<float, platform::jit::avx2>(n, a, x, y);
+  vec_bias_sub<float, platform::avx2>(n, a, x, y);
 }
 
 // out = x*y + (1-x)*z
-template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
+template <typename T, platform::cpu_isa_t isa = platform::isa_any>
 inline void vec_cross(const int n, const T* x, const T* y, const T* z, T* out) {
   for (int i = 0; i < n; ++i) {
     out[i] = x[i] * y[i] + (static_cast<T>(1) - x[i]) * z[i];
@@ -196,13 +194,13 @@ inline void vec_cross(const int n, const T* x, const T* y, const T* z, T* out) {
 }
 
 template <>
-inline void vec_cross<float, platform::jit::avx>(const int n, const float* x,
-                                                 const float* y, const float* z,
-                                                 float* out) {
+inline void vec_cross<float, platform::avx>(const int n, const float* x,
+                                            const float* y, const float* z,
+                                            float* out) {
 #ifdef __AVX__
   constexpr int block = YMM_FLOAT_BLOCK;
   if (n < block) {
-    vec_cross<float, platform::jit::isa_any>(n, x, y, z, out);
+    vec_cross<float, platform::isa_any>(n, x, y, z, out);
     return;
   }
   const int rest = n % block;
@@ -228,25 +226,26 @@ inline void vec_cross<float, platform::jit::avx>(const int n, const float* x,
     out[i] = x[i] * y[i] + (1.f - x[i]) * z[i];
   }
 #else
-  vec_cross<float, platform::jit::isa_any>(n, x, y, z, out);
+  vec_cross<float, platform::isa_any>(n, x, y, z, out);
 #endif
 }
 
 template <>
-inline void vec_cross<float, platform::jit::avx2>(const int n, const float* x,
-                                                  const float* y,
-                                                  const float* z, float* out) {
-  vec_cross<float, platform::jit::avx>(n, x, y, z, out);
+inline void vec_cross<float, platform::avx2>(const int n, const float* x,
+                                             const float* y, const float* z,
+                                             float* out) {
+  vec_cross<float, platform::avx>(n, x, y, z, out);
 }
 
 template <>
-inline void vec_cross<float, platform::jit::avx512f>(
-    const int n, const float* x, const float* y, const float* z, float* out) {
+inline void vec_cross<float, platform::avx512f>(const int n, const float* x,
+                                                const float* y, const float* z,
+                                                float* out) {
   // TODO(TJ): enable me
-  vec_cross<float, platform::jit::avx>(n, x, y, z, out);
+  vec_cross<float, platform::avx>(n, x, y, z, out);
 }
 
-template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
+template <typename T, platform::cpu_isa_t isa = platform::isa_any>
 inline void vec_add_bias(const int n, const T a, const T* x, T* y) {
   for (int i = 0; i < n; ++i) {
     y[i] = x[i] + a;
@@ -254,12 +253,12 @@ inline void vec_add_bias(const int n, const T a, const T* x, T* y) {
 }
 
 template <>
-inline void vec_add_bias<float, platform::jit::avx>(const int n, const float a,
-                                                    const float* x, float* y) {
+inline void vec_add_bias<float, platform::avx>(const int n, const float a,
+                                               const float* x, float* y) {
 #ifdef __AVX__
   constexpr int block = YMM_FLOAT_BLOCK;
   if (n < block) {
-    vec_add_bias<float, platform::jit::isa_any>(n, a, x, y);
+    vec_add_bias<float, platform::isa_any>(n, a, x, y);
     return;
   }
   const int rest = n % block;
@@ -283,32 +282,30 @@ inline void vec_add_bias<float, platform::jit::avx>(const int n, const float a,
     y[i] = x[i] + a;
   }
 #else
-  vec_add_bias<float, platform::jit::isa_any>(n, a, x, y);
+  vec_add_bias<float, platform::isa_any>(n, a, x, y);
 #endif
 }
 
 template <>
-inline void vec_add_bias<float, platform::jit::avx2>(const int n, const float a,
-                                                     const float* x, float* y) {
-  vec_add_bias<float, platform::jit::avx>(n, a, x, y);
+inline void vec_add_bias<float, platform::avx2>(const int n, const float a,
+                                                const float* x, float* y) {
+  vec_add_bias<float, platform::avx>(n, a, x, y);
 }
 
 template <>
-inline void vec_add_bias<float, platform::jit::avx512f>(const int n,
-                                                        const float a,
-                                                        const float* x,
-                                                        float* y) {
+inline void vec_add_bias<float, platform::avx512f>(const int n, const float a,
+                                                   const float* x, float* y) {
   // TODO(TJ): enable me
-  vec_add_bias<float, platform::jit::avx2>(n, a, x, y);
+  vec_add_bias<float, platform::avx2>(n, a, x, y);
 }
 
-template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
+template <typename T, platform::cpu_isa_t isa = platform::isa_any>
 inline void vec_identity(const int n, const T* x, T* y) {
   // do nothing
   return;
 }
 
-template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
+template <typename T, platform::cpu_isa_t isa = platform::isa_any>
 inline void vec_sigmoid(const int n, const T* x, T* y) {
   const T min = SIGMOID_THRESHOLD_MIN;
   const T max = SIGMOID_THRESHOLD_MAX;
@@ -323,12 +320,12 @@ inline void vec_sigmoid(const int n, const T* x, T* y) {
 }
 
 template <>
-inline void vec_sigmoid<float, platform::jit::avx>(const int n, const float* x,
-                                                   float* y) {
+inline void vec_sigmoid<float, platform::avx>(const int n, const float* x,
+                                              float* y) {
 #ifdef __AVX__
   constexpr int block = YMM_FLOAT_BLOCK;
   if (n < block) {
-    vec_sigmoid<float, platform::jit::isa_any>(n, x, y);
+    vec_sigmoid<float, platform::isa_any>(n, x, y);
     return;
   }
   const int rest = n % block;
@@ -377,25 +374,24 @@ inline void vec_sigmoid<float, platform::jit::avx>(const int n, const float* x,
     y[i] = 1.f / (1.f + y[i]);
   }
 #else
-  vec_sigmoid<float, platform::jit::isa_any>(n, x, y);
+  vec_sigmoid<float, platform::isa_any>(n, x, y);
 #endif
 }
 
 template <>
-inline void vec_sigmoid<float, platform::jit::avx2>(const int n, const float* x,
-                                                    float* y) {
-  vec_sigmoid<float, platform::jit::avx>(n, x, y);
+inline void vec_sigmoid<float, platform::avx2>(const int n, const float* x,
+                                               float* y) {
+  vec_sigmoid<float, platform::avx>(n, x, y);
 }
 
 template <>
-inline void vec_sigmoid<float, platform::jit::avx512f>(const int n,
-                                                       const float* x,
-                                                       float* y) {
+inline void vec_sigmoid<float, platform::avx512f>(const int n, const float* x,
+                                                  float* y) {
   // TODO(TJ): enable me
-  vec_sigmoid<float, platform::jit::avx2>(n, x, y);
+  vec_sigmoid<float, platform::avx2>(n, x, y);
 }
 
-template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
+template <typename T, platform::cpu_isa_t isa = platform::isa_any>
 inline void vec_tanh(const int n, const T* x, T* y) {
   vec_scal<T, isa>(n, static_cast<T>(2), x, y);
   vec_sigmoid<T, isa>(n, y, y);
@@ -404,7 +400,7 @@ inline void vec_tanh(const int n, const T* x, T* y) {
 }
 
 // TODO(TJ): make relu clip
-template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
+template <typename T, platform::cpu_isa_t isa = platform::isa_any>
 inline void vec_relu(const int n, const T* x, T* y) {
   for (int i = 0; i < n; ++i) {
     y[i] = x[i] > 0 ? x[i] : 0;
@@ -412,12 +408,12 @@ inline void vec_relu(const int n, const T* x, T* y) {
 }
 
 template <>
-inline void vec_relu<float, platform::jit::avx>(const int n, const float* x,
-                                                float* y) {
+inline void vec_relu<float, platform::avx>(const int n, const float* x,
+                                           float* y) {
 #ifdef __AVX__
   constexpr int block = YMM_FLOAT_BLOCK;
   if (n < block * 4) {
-    vec_relu<float, platform::jit::isa_any>(n, x, y);
+    vec_relu<float, platform::isa_any>(n, x, y);
     return;
   }
 
@@ -441,26 +437,26 @@ inline void vec_relu<float, platform::jit::avx>(const int n, const float* x,
 #undef MOVE_ONE_STEP
 
 #else
-  vec_relu<float, platform::jit::isa_any>(n, x, y);
+  vec_relu<float, platform::isa_any>(n, x, y);
 #endif
 }
 
 template <>
-inline void vec_relu<float, platform::jit::avx2>(const int n, const float* x,
-                                                 float* y) {
-  vec_relu<float, platform::jit::avx>(n, x, y);
+inline void vec_relu<float, platform::avx2>(const int n, const float* x,
+                                            float* y) {
+  vec_relu<float, platform::avx>(n, x, y);
 }
 
 template <>
-inline void vec_relu<float, platform::jit::avx512f>(const int n, const float* x,
-                                                    float* y) {
+inline void vec_relu<float, platform::avx512f>(const int n, const float* x,
+                                               float* y) {
   // TODO(TJ): enable me
-  vec_relu<float, platform::jit::avx2>(n, x, y);
+  vec_relu<float, platform::avx2>(n, x, y);
 }
 
 // TODO(TJ): optimize double of sigmoid, tanh and relu if necessary
 
-template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
+template <typename T, platform::cpu_isa_t isa = platform::isa_any>
 class VecActivations {
  public:
   std::function<void(const int, const T*, T*)> operator()(
diff --git a/paddle/fluid/operators/math/cpu_vec_test.cc b/paddle/fluid/operators/math/cpu_vec_test.cc
index c37fa291a259550a3cb6d4f3dd9d5a415c3a2130..28eb9cadc9d4258bf4f8f71a06e029531e448014 100644
--- a/paddle/fluid/operators/math/cpu_vec_test.cc
+++ b/paddle/fluid/operators/math/cpu_vec_test.cc
@@ -104,38 +104,42 @@ void TestAndBench(const int n, std::function<void(const int, const T*, T*)> tgt,
 }
 
 TEST(CpuVecTest, sigmoid) {
-  namespace jit = paddle::platform::jit;
+  namespace platform = paddle::platform;
   using namespace paddle::operators::math;  // NOLINT
   for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
     TestAndBench<float>(sz, vec_sigmoid<float>, ref_sigmoid<float>);
-    TestAndBench<float>(sz, vec_sigmoid<float, jit::avx>, ref_sigmoid<float>);
-    TestAndBench<float>(sz, vec_sigmoid<float, jit::avx2>, ref_sigmoid<float>);
-    TestAndBench<float>(sz, vec_sigmoid<float, jit::avx512f>,
+    TestAndBench<float>(sz, vec_sigmoid<float, platform::avx>,
+                        ref_sigmoid<float>);
+    TestAndBench<float>(sz, vec_sigmoid<float, platform::avx2>,
+                        ref_sigmoid<float>);
+    TestAndBench<float>(sz, vec_sigmoid<float, platform::avx512f>,
                         ref_sigmoid<float>);
   }
   TestAndBench<double>(30, vec_sigmoid<double>, ref_sigmoid<double>);
 }
 
 TEST(CpuVecTest, tanh) {
-  namespace jit = paddle::platform::jit;
+  namespace platform = paddle::platform;
   using namespace paddle::operators::math;  // NOLINT
   for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
     TestAndBench<float>(sz, vec_tanh<float>, ref_tanh<float>);
-    TestAndBench<float>(sz, vec_tanh<float, jit::avx>, ref_tanh<float>);
-    TestAndBench<float>(sz, vec_tanh<float, jit::avx2>, ref_tanh<float>);
-    TestAndBench<float>(sz, vec_tanh<float, jit::avx512f>, ref_tanh<float>);
+    TestAndBench<float>(sz, vec_tanh<float, platform::avx>, ref_tanh<float>);
+    TestAndBench<float>(sz, vec_tanh<float, platform::avx2>, ref_tanh<float>);
+    TestAndBench<float>(sz, vec_tanh<float, platform::avx512f>,
+                        ref_tanh<float>);
   }
   TestAndBench<double>(30, vec_tanh<double>, ref_tanh<double>);
 }
 
 TEST(CpuVecTest, relu) {
-  namespace jit = paddle::platform::jit;
+  namespace platform = paddle::platform;
   using namespace paddle::operators::math;  // NOLINT
   for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
     TestAndBench<float>(sz, vec_relu<float>, ref_relu<float>);
-    TestAndBench<float>(sz, vec_relu<float, jit::avx>, ref_relu<float>);
-    TestAndBench<float>(sz, vec_relu<float, jit::avx2>, ref_relu<float>);
-    TestAndBench<float>(sz, vec_relu<float, jit::avx512f>, ref_relu<float>);
+    TestAndBench<float>(sz, vec_relu<float, platform::avx>, ref_relu<float>);
+    TestAndBench<float>(sz, vec_relu<float, platform::avx2>, ref_relu<float>);
+    TestAndBench<float>(sz, vec_relu<float, platform::avx512f>,
+                        ref_relu<float>);
   }
   TestAndBench<double>(30, vec_relu<double>, ref_relu<double>);
 }
@@ -162,38 +166,40 @@ void TestInplace(const int n, std::function<void(const int, const T*, T*)> tgt,
 }
 
 TEST(CpuVecTest, inplace_sigmoid) {
-  namespace jit = paddle::platform::jit;
+  namespace platform = paddle::platform;
   using namespace paddle::operators::math;  // NOLINT
   for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
     TestInplace<float>(sz, vec_sigmoid<float>, ref_sigmoid<float>);
-    TestInplace<float>(sz, vec_sigmoid<float, jit::avx>, ref_sigmoid<float>);
-    TestInplace<float>(sz, vec_sigmoid<float, jit::avx2>, ref_sigmoid<float>);
-    TestInplace<float>(sz, vec_sigmoid<float, jit::avx512f>,
+    TestInplace<float>(sz, vec_sigmoid<float, platform::avx>,
+                       ref_sigmoid<float>);
+    TestInplace<float>(sz, vec_sigmoid<float, platform::avx2>,
+                       ref_sigmoid<float>);
+    TestInplace<float>(sz, vec_sigmoid<float, platform::avx512f>,
                        ref_sigmoid<float>);
   }
   TestInplace<double>(30, vec_sigmoid<double>, ref_sigmoid<double>);
 }
 
 TEST(CpuVecTest, inplace_tanh) {
-  namespace jit = paddle::platform::jit;
+  namespace platform = paddle::platform;
   using namespace paddle::operators::math;  // NOLINT
   for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
     TestInplace<float>(sz, vec_tanh<float>, ref_tanh<float>);
-    TestInplace<float>(sz, vec_tanh<float, jit::avx>, ref_tanh<float>);
-    TestInplace<float>(sz, vec_tanh<float, jit::avx2>, ref_tanh<float>);
-    TestInplace<float>(sz, vec_tanh<float, jit::avx512f>, ref_tanh<float>);
+    TestInplace<float>(sz, vec_tanh<float, platform::avx>, ref_tanh<float>);
+    TestInplace<float>(sz, vec_tanh<float, platform::avx2>, ref_tanh<float>);
+    TestInplace<float>(sz, vec_tanh<float, platform::avx512f>, ref_tanh<float>);
   }
   TestInplace<double>(30, vec_tanh<double>, ref_tanh<double>);
 }
 
 TEST(CpuVecTest, inplace_relu) {
-  namespace jit = paddle::platform::jit;
+  namespace platform = paddle::platform;
   using namespace paddle::operators::math;  // NOLINT
   for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
     TestInplace<float>(sz, vec_relu<float>, ref_relu<float>);
-    TestInplace<float>(sz, vec_relu<float, jit::avx>, ref_relu<float>);
-    TestInplace<float>(sz, vec_relu<float, jit::avx2>, ref_relu<float>);
-    TestInplace<float>(sz, vec_relu<float, jit::avx512f>, ref_relu<float>);
+    TestInplace<float>(sz, vec_relu<float, platform::avx>, ref_relu<float>);
+    TestInplace<float>(sz, vec_relu<float, platform::avx2>, ref_relu<float>);
+    TestInplace<float>(sz, vec_relu<float, platform::avx512f>, ref_relu<float>);
   }
   TestInplace<double>(30, vec_relu<double>, ref_relu<double>);
 }
diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc
index 52cbdf685dee651cbc1490dc6faacb8680004c89..78d0c3e8808f0daf6a18d2217664e965773b95ff 100644
--- a/paddle/fluid/operators/math/jit_code.cc
+++ b/paddle/fluid/operators/math/jit_code.cc
@@ -22,7 +22,7 @@ namespace math {
 namespace jitkernel {
 namespace gen {
 
-using namespace platform::jit;  // NOLINT
+using namespace platform;  // NOLINT
 
 bool VXXJitCode::init(int d, int scalar_index) {
   // It's not necessary to use avx512 since it would slow down the frequency
diff --git a/paddle/fluid/operators/math/jit_code.h b/paddle/fluid/operators/math/jit_code.h
index a9214621295a7740b804b26c02d216dd5118d8bb..e2b4761435594fdc952ff5dba5b5fa4f4aa98e6c 100644
--- a/paddle/fluid/operators/math/jit_code.h
+++ b/paddle/fluid/operators/math/jit_code.h
@@ -179,7 +179,7 @@ class VActJitCode : public JitCode {
   template <typename JMM>
   void exp_jmm(JMM& dst, JMM& src, int src_idx = 11, int fx_idx = 12,  // NOLINT
                int fy_idx = 13, int mask_idx = 14, int tmp_idx = 15) {
-    using namespace platform::jit;  // NOLINT
+    using namespace platform;  // NOLINT
     // check all idx can not equal
     JMM jmm_src = JMM(src_idx);
     JMM jmm_fx = JMM(fx_idx);
diff --git a/paddle/fluid/operators/math/jit_gen.cc b/paddle/fluid/operators/math/jit_gen.cc
index 6af39518ed926554c8c839bba701d3827923dba0..5c6672928e8c03ccb1920bd828f785084e422fc2 100644
--- a/paddle/fluid/operators/math/jit_gen.cc
+++ b/paddle/fluid/operators/math/jit_gen.cc
@@ -36,7 +36,7 @@ void JitCode::preCode() {
   for (int i = 0; i < num_g_abi_regs; ++i) {
     push(Xbyak::Reg64(g_abi_regs[i]));
   }
-  if (platform::jit::MayIUse(platform::jit::avx512f)) {
+  if (platform::MayIUse(platform::avx512f)) {
     mov(reg_EVEX_max_8b_offt, 2 * EVEX_max_8b_offt);
   }
 }
diff --git a/paddle/fluid/operators/math/jit_kernel.cc b/paddle/fluid/operators/math/jit_kernel.cc
index 68b708b345334bc63b5e2e88c308d20ca6378e6b..118696ba47986e2dbf97535333c9817b7c264a54 100644
--- a/paddle/fluid/operators/math/jit_kernel.cc
+++ b/paddle/fluid/operators/math/jit_kernel.cc
@@ -21,8 +21,6 @@ namespace operators {
 namespace math {
 namespace jitkernel {
 
-namespace jit = platform::jit;
-
 KernelPool& KernelPool::Instance() {
   static thread_local KernelPool g_jit_kernels;
   return g_jit_kernels;
diff --git a/paddle/fluid/operators/math/jit_kernel_blas.cc b/paddle/fluid/operators/math/jit_kernel_blas.cc
index a0f93fd8e7eb7d81211724a6991a681e2a0ed9ce..8cf588efba52314650bfd376b95b10e6d4336b2e 100644
--- a/paddle/fluid/operators/math/jit_kernel_blas.cc
+++ b/paddle/fluid/operators/math/jit_kernel_blas.cc
@@ -30,7 +30,6 @@ namespace paddle {
 namespace operators {
 namespace math {
 namespace jitkernel {
-namespace jit = platform::jit;
 
 #ifdef PADDLE_WITH_MKLML
 template <typename T>
@@ -125,7 +124,7 @@ bool VMulKernelImpl<float>::useJIT(int d) {
 #ifdef PADDLE_WITH_MKLML
 template <>
 bool VMulKernelImpl<float>::useMKL(int d) {
-  return jit::MayIUse(jit::avx512f) && d > 512;
+  return platform::MayIUse(platform::avx512f) && d > 512;
 }
 
 template <>
diff --git a/paddle/fluid/operators/math/jit_kernel_crf_decode.cc b/paddle/fluid/operators/math/jit_kernel_crf_decode.cc
index 4d26b81948238f18b097f535534fcfe9049b93c3..eeb305a88bee8f0e21b205684d24b19ca4631f65 100644
--- a/paddle/fluid/operators/math/jit_kernel_crf_decode.cc
+++ b/paddle/fluid/operators/math/jit_kernel_crf_decode.cc
@@ -25,10 +25,8 @@ namespace operators {
 namespace math {
 namespace jitkernel {
 
-namespace jit = platform::jit;
-
 /* CRF Decode JitKernel */
-template <typename T, platform::jit::cpu_isa_t isa, jit_block>
+template <typename T, platform::cpu_isa_t isa, jit_block>
 class CRFDecodeKernelImpl : public CRFDecodeKernel<T> {
  public:
   explicit CRFDecodeKernelImpl(int tag_num) : CRFDecodeKernel<T>() {
@@ -101,7 +99,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel<T> {
 
 #define INTRIAVX_FLOAT(block)                                                  \
   template <>                                                                  \
-  CRFDecodeKernelImpl<float, jit::avx, block>::CRFDecodeKernelImpl(            \
+  CRFDecodeKernelImpl<float, platform::avx, block>::CRFDecodeKernelImpl(       \
       int tag_num)                                                             \
       : CRFDecodeKernel<float>() {                                             \
     this->num_ = tag_num;                                                      \
@@ -109,7 +107,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel<T> {
     this->rest_ = this->num_ % YMM_FLOAT_BLOCK;                                \
   }                                                                            \
   template <>                                                                  \
-  void CRFDecodeKernelImpl<float, jit::avx, block>::Compute(                   \
+  void CRFDecodeKernelImpl<float, platform::avx, block>::Compute(              \
       const int seq_len, const float* x, const float* w, float* alpha,         \
       int* track) const {                                                      \
     INIT_ALPHA(YMM_FLOAT_BLOCK)                                                \
@@ -204,7 +202,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel<T> {
 
 #define INTRIAVX512_FLOAT(block)                                               \
   template <>                                                                  \
-  CRFDecodeKernelImpl<float, jit::avx512f, block>::CRFDecodeKernelImpl(        \
+  CRFDecodeKernelImpl<float, platform::avx512f, block>::CRFDecodeKernelImpl(   \
       int tag_num)                                                             \
       : CRFDecodeKernel<float>() {                                             \
     this->num_ = tag_num;                                                      \
@@ -212,7 +210,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel<T> {
     this->rest_ = this->num_ % ZMM_FLOAT_BLOCK;                                \
   }                                                                            \
   template <>                                                                  \
-  void CRFDecodeKernelImpl<float, jit::avx512f, block>::Compute(               \
+  void CRFDecodeKernelImpl<float, platform::avx512f, block>::Compute(          \
       const int seq_len, const float* x, const float* w, float* alpha,         \
       int* track) const {                                                      \
     INIT_ALPHA(ZMM_FLOAT_BLOCK)                                                \
@@ -270,14 +268,14 @@ INTRIAVX_FLOAT(kEQ16);
 INTRIAVX_FLOAT(kGT16);
 #endif
 #ifdef __AVX2__
-INTRIAVX2_FLOAT(jit::avx2, kEQ8);
-INTRIAVX2_FLOAT(jit::avx2, kGT8LT16);
-INTRIAVX2_FLOAT(jit::avx2, kEQ16);
-INTRIAVX2_FLOAT(jit::avx2, kGT16);
+INTRIAVX2_FLOAT(platform::avx2, kEQ8);
+INTRIAVX2_FLOAT(platform::avx2, kGT8LT16);
+INTRIAVX2_FLOAT(platform::avx2, kEQ16);
+INTRIAVX2_FLOAT(platform::avx2, kGT16);
 #endif
 #ifdef __AVX512F__
-INTRIAVX2_FLOAT(jit::avx512f, kEQ8);
-INTRIAVX2_FLOAT(jit::avx512f, kGT8LT16);
+INTRIAVX2_FLOAT(platform::avx512f, kEQ8);
+INTRIAVX2_FLOAT(platform::avx512f, kGT8LT16);
 INTRIAVX512_FLOAT(kEQ16);
 INTRIAVX512_FLOAT(kGT16);
 #endif
diff --git a/paddle/fluid/operators/math/jit_kernel_exp.cc b/paddle/fluid/operators/math/jit_kernel_exp.cc
index 686f3dd9836cb9192088771753065c6add639620..7945cfb253a61b7d1191c39537254126e2bb85dd 100644
--- a/paddle/fluid/operators/math/jit_kernel_exp.cc
+++ b/paddle/fluid/operators/math/jit_kernel_exp.cc
@@ -29,7 +29,6 @@ namespace paddle {
 namespace operators {
 namespace math {
 namespace jitkernel {
-namespace jit = platform::jit;
 
 #ifdef PADDLE_WITH_MKLML
 // try to use MKL to speedup
diff --git a/paddle/fluid/operators/math/jit_kernel_layer_norm.cc b/paddle/fluid/operators/math/jit_kernel_layer_norm.cc
index 49904e6e8c7cd346bcbfb67c3a7574118b36e058..cb49e66488bd69d92430cbf6de1d08348ffe0202 100644
--- a/paddle/fluid/operators/math/jit_kernel_layer_norm.cc
+++ b/paddle/fluid/operators/math/jit_kernel_layer_norm.cc
@@ -22,10 +22,8 @@ namespace operators {
 namespace math {
 namespace jitkernel {
 
-namespace jit = platform::jit;
-
 /* Layer Norm JitKernel */
-template <typename T, platform::jit::cpu_isa_t isa, jit_block>
+template <typename T, platform::cpu_isa_t isa, jit_block>
 class LayerNormKernelImpl : public LayerNormKernel<T> {
  public:
   explicit LayerNormKernelImpl(int right) : LayerNormKernel<T>() {
@@ -81,16 +79,16 @@ class LayerNormKernelImpl : public LayerNormKernel<T> {
   }
 };
 
-#define INTRIAVX_FLOAT(isa, block)                                             \
+#define INTRIAVX_FLOAT(isa, jit_block)                                         \
   template <>                                                                  \
-  LayerNormKernelImpl<float, isa, block>::LayerNormKernelImpl(int right)       \
+  LayerNormKernelImpl<float, isa, jit_block>::LayerNormKernelImpl(int right)   \
       : LayerNormKernel<float>() {                                             \
     this->num_ = right;                                                        \
     this->rest_ = this->num_ % YMM_FLOAT_BLOCK;                                \
     this->end_ = this->num_ - this->rest_;                                     \
   }                                                                            \
   template <>                                                                  \
-  void LayerNormKernelImpl<float, jit::avx, block>::Compute(                   \
+  void LayerNormKernelImpl<float, isa, jit_block>::Compute(                    \
       float* x, float* out, float* mean, float* var, const float* scale,       \
       const float* bias, int height, const float epsilon) const {              \
     __m256 sum;                                                                \
@@ -99,6 +97,7 @@ class LayerNormKernelImpl : public LayerNormKernel<T> {
     __m256 tmp;                                                                \
     size_t offset;                                                             \
     size_t j;                                                                  \
+    size_t block = YMM_FLOAT_BLOCK;                                            \
     __m256 reverse_num_vec =                                                   \
         _mm256_div_ps(_mm256_set1_ps(1.0), _mm256_set1_ps(this->num_));        \
     __m256 epsilon_vec = _mm256_set1_ps(epsilon);                              \
@@ -219,16 +218,18 @@ class LayerNormKernelImpl : public LayerNormKernel<T> {
   }
 
 #ifdef __AVX__
-INTRIAVX_FLOAT(jit::avx, kEQ8);
-INTRIAVX_FLOAT(jit::avx, kGT8LT16);
-INTRIAVX_FLOAT(jit::avx, kEQ16);
-INTRIAVX_FLOAT(jit::avx, kGT16);
-#endif
-#ifdef __AVX2__
-INTRIAVX_FLOAT(jit::avx2, kEQ8);
-INTRIAVX_FLOAT(jit::avx2, kGT8LT16);
-INTRIAVX_FLOAT(jit::avx2, kEQ16);
-INTRIAVX_FLOAT(jit::avx2, kGT16);
+INTRIAVX_FLOAT(platform::avx, kEQ8);
+INTRIAVX_FLOAT(platform::avx, kGT8LT16);
+INTRIAVX_FLOAT(platform::avx, kEQ16);
+INTRIAVX_FLOAT(platform::avx, kGT16);
+INTRIAVX_FLOAT(platform::avx2, kEQ8);
+INTRIAVX_FLOAT(platform::avx2, kGT8LT16);
+INTRIAVX_FLOAT(platform::avx2, kEQ16);
+INTRIAVX_FLOAT(platform::avx2, kGT16);
+INTRIAVX_FLOAT(platform::avx512f, kEQ8);
+INTRIAVX_FLOAT(platform::avx512f, kGT8LT16);
+INTRIAVX_FLOAT(platform::avx512f, kEQ16);
+INTRIAVX_FLOAT(platform::avx512f, kGT16);
 #endif
 
 #undef INTRIAVX_FLOAT
diff --git a/paddle/fluid/operators/math/jit_kernel_macro.h b/paddle/fluid/operators/math/jit_kernel_macro.h
index 5a3efd979f803d396a5084c199b1d71b88a77126..4dba3b56810794cb4839d26386ae77a8f4507977 100644
--- a/paddle/fluid/operators/math/jit_kernel_macro.h
+++ b/paddle/fluid/operators/math/jit_kernel_macro.h
@@ -92,7 +92,6 @@ namespace jitkernel {
                           JITKERNEL_DECLARE, JITKERNEL_FIND_KEY,     \
                           JITKERNEL_IMPL)
 
-namespace jit = platform::jit;
 // TODO(TJ): below defines are deprecated, would be remove recently
 #define SEARCH_BLOCK(macro_, ker, dtype, isa)              \
   if (d < YMM_FLOAT_BLOCK) {                               \
@@ -107,15 +106,15 @@ namespace jit = platform::jit;
     macro_(ker, dtype, isa, kGT16);                        \
   }
 
-#define SEARCH_ISA_BLOCK(macro_, ker, dtype)        \
-  if (jit::MayIUse(jit::avx512f)) {                 \
-    SEARCH_BLOCK(macro_, ker, dtype, jit::avx512f); \
-  } else if (jit::MayIUse(jit::avx2)) {             \
-    SEARCH_BLOCK(macro_, ker, dtype, jit::avx2);    \
-  } else if (jit::MayIUse(jit::avx)) {              \
-    SEARCH_BLOCK(macro_, ker, dtype, jit::avx);     \
-  } else {                                          \
-    SEARCH_BLOCK(macro_, ker, dtype, jit::isa_any); \
+#define SEARCH_ISA_BLOCK(macro_, ker, dtype)             \
+  if (platform::MayIUse(platform::avx512f)) {            \
+    SEARCH_BLOCK(macro_, ker, dtype, platform::avx512f); \
+  } else if (platform::MayIUse(platform::avx2)) {        \
+    SEARCH_BLOCK(macro_, ker, dtype, platform::avx2);    \
+  } else if (platform::MayIUse(platform::avx)) {         \
+    SEARCH_BLOCK(macro_, ker, dtype, platform::avx);     \
+  } else {                                               \
+    SEARCH_BLOCK(macro_, ker, dtype, platform::isa_any); \
   }
 
 #define JITKERNEL_KEY(ker_key, dtype_key) \
@@ -156,10 +155,10 @@ namespace jit = platform::jit;
                                   marco_declare, macro_key, macro_impl)
 
 #define FOR_EACH_ISA(macro_, block) \
-  macro_(jit::avx512f, block);      \
-  macro_(jit::avx2, block);         \
-  macro_(jit::avx, block);          \
-  macro_(jit::isa_any, block)
+  macro_(platform::avx512f, block); \
+  macro_(platform::avx2, block);    \
+  macro_(platform::avx, block);     \
+  macro_(platform::isa_any, block)
 
 #define FOR_EACH_BLOCK(macro_, isa) \
   macro_(isa, kLT8);                \
@@ -168,11 +167,11 @@ namespace jit = platform::jit;
   macro_(isa, kEQ16);               \
   macro_(isa, kGT16)
 
-#define FOR_EACH_ISA_BLOCK(macro_)      \
-  FOR_EACH_BLOCK(macro_, jit::avx512f); \
-  FOR_EACH_BLOCK(macro_, jit::avx2);    \
-  FOR_EACH_BLOCK(macro_, jit::avx);     \
-  FOR_EACH_BLOCK(macro_, jit::isa_any)
+#define FOR_EACH_ISA_BLOCK(macro_)           \
+  FOR_EACH_BLOCK(macro_, platform::avx512f); \
+  FOR_EACH_BLOCK(macro_, platform::avx2);    \
+  FOR_EACH_BLOCK(macro_, platform::avx);     \
+  FOR_EACH_BLOCK(macro_, platform::isa_any)
 
 }  // namespace jitkernel
 }  // namespace math
diff --git a/paddle/fluid/operators/math/jit_kernel_test.cc b/paddle/fluid/operators/math/jit_kernel_test.cc
index ed86a47e159cacd4f5572e22c7633f725aaeb516..19f7bd8909499c12fd5bee4db0d0a71a632e7f19 100644
--- a/paddle/fluid/operators/math/jit_kernel_test.cc
+++ b/paddle/fluid/operators/math/jit_kernel_test.cc
@@ -705,7 +705,7 @@ TEST(JitKernel, pool) {
   jit::lstm_attr_t attr(frame_size, act_gate, act_cand, act_cell, false);
 
   // empty call it to avoid unknown flag 'use_pinned_memory' on Mac
-  paddle::platform::jit::MayIUse(paddle::platform::jit::avx);
+  paddle::platform::MayIUse(paddle::platform::avx);
   const auto& plstm1 =
       jit::KernelPool::Instance()
           .template Get<jit::LSTMKernel<float>, const jit::lstm_attr_t&>(attr);
diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu
index 79b7538ad05b0ff348b8264d50b63211b5254e80..9372d63f0bea2b0c9f37d47376d7b7014e381a33 100644
--- a/paddle/fluid/operators/math/math_function.cu
+++ b/paddle/fluid/operators/math/math_function.cu
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
-#define EIGEN_USE_GPU
 #include <vector>
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/operators/math/blas.h"
diff --git a/paddle/fluid/operators/math/matrix_bit_code.cc b/paddle/fluid/operators/math/matrix_bit_code.cc
index 71b9293eeded77553ca06a8574cca3941fa36b6a..5a6e64b6f87d33249f0153e5f391deaf78e53de5 100644
--- a/paddle/fluid/operators/math/matrix_bit_code.cc
+++ b/paddle/fluid/operators/math/matrix_bit_code.cc
@@ -89,6 +89,8 @@ template <typename T>
 void MatrixBitCodeFunctor<T>::Mul(framework::Tensor* tmat,
                                   const framework::Tensor& weight,
                                   const framework::Tensor& input) {
+  auto blas =
+      GetBlas<platform::CPUDeviceContext, T>(platform::CPUDeviceContext());
   size_t num_samples = tmat->dims()[0];
   size_t tmat_width = tmat->dims()[1];
   size_t input_width = input.dims()[1];
@@ -99,13 +101,12 @@ void MatrixBitCodeFunctor<T>::Mul(framework::Tensor* tmat,
   for (size_t i = 0; i < num_samples; ++i) {
     auto code = code_table_->get_code(i);
     int code_length = code->get_length();
+    const T* input_row = input_value + input_width * i;
     for (int j = 0; j < code_length; ++j) {
       size_t index = code->calc_index(j);
+      const T* weight_row = weight_value + weight_width * index;
       T sum = static_cast<T>(0.0);
-      for (size_t k = 0; k < input_width; ++k) {
-        sum += weight_value[weight_width * index + k] *
-               input_value[input_width * i + k];
-      }
+      sum = blas.DOT(input_width, weight_row, input_row);
       tmat_value[i * tmat_width + j] += sum;
     }
   }
@@ -115,6 +116,8 @@ template <typename T>
 void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor& tmat,
                                             framework::Tensor* weight,
                                             const framework::Tensor& input) {
+  auto blas =
+      GetBlas<platform::CPUDeviceContext, T>(platform::CPUDeviceContext());
   size_t num_samples = tmat.dims()[0];
   size_t input_width = input.dims()[1];
   size_t tmat_width = tmat.dims()[1];
@@ -122,16 +125,25 @@ void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor& tmat,
   auto tmat_value = tmat.data<T>();
   auto weight_value = weight->data<T>();
   auto input_value = input.data<T>();
+
+  std::unordered_map<int, std::vector<std::pair<T, const T*>>> ops;
+
   for (size_t i = 0; i < num_samples; ++i) {
     auto code = code_table_->get_code(i);
     int code_length = code->get_length();
+    const T* input_value_row = input_value + input_width * i;
+    const T* tmat_row = tmat_value + i * tmat_width;
     for (int j = 0; j < code_length; ++j) {
-      size_t index = code->calc_index(j);
-
-      for (size_t k = 0; k < input_width; ++k) {
-        weight_value[weight_width * index + k] +=
-            tmat_value[i * tmat_width + j] * input_value[input_width * i + k];
-      }
+      ops[code->calc_index(j)].emplace_back(tmat_row[j], input_value_row);
+    }
+  }
+  for (auto& op : ops) {
+    auto& op_in_row = op.second;
+    for (auto& pair : op_in_row) {
+      auto& scale = pair.first;
+      auto* input_row = pair.second;
+      T* weight_row = weight_value + op.first * weight_width;
+      blas.AXPY(input_width, scale, input_row, weight_row);
     }
   }
 }
@@ -140,6 +152,8 @@ template <typename T>
 void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor& tmat,
                                             framework::SelectedRows* weight,
                                             const framework::Tensor& input) {
+  auto blas =
+      GetBlas<platform::CPUDeviceContext, T>(platform::CPUDeviceContext());
   size_t num_samples = tmat.dims()[0];
   size_t input_width = input.dims()[1];
   size_t tmat_width = tmat.dims()[1];
@@ -147,17 +161,28 @@ void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor& tmat,
   auto tmat_value = tmat.data<T>();
   auto weight_value = weight->mutable_value()->data<T>();
   auto input_value = input.data<T>();
+
+  std::unordered_map<int, std::vector<std::pair<T, const T*>>> ops;
+  ops.reserve(weight->rows().size());
+
   for (size_t i = 0; i < num_samples; ++i) {
     auto code = code_table_->get_code(i);
     int code_length = code->get_length();
+    const T* input_value_row = input_value + input_width * i;
+    const T* tmat_row = tmat_value + i * tmat_width;
     for (int j = 0; j < code_length; ++j) {
-      size_t index = code->calc_index(j);
-      for (size_t k = 0; k < input_width; ++k) {
-        int64_t row_index = weight->GetIndexFromId(static_cast<int64_t>(index));
-        weight_value[row_index * weight_width + k] +=
-            tmat_value[i * tmat_width + j] * input_value[input_width * i + k];
-      }
+      ops[code->calc_index(j)].emplace_back(tmat_row[j], input_value_row);
+    }
+  }
+
+  for (auto& row : weight->rows()) {
+    auto& op_in_row = ops[row];
+    for (auto& pair : op_in_row) {
+      auto& scale = pair.first;
+      auto* input_row = pair.second;
+      blas.AXPY(input_width, scale, input_row, weight_value);
     }
+    weight_value += weight_width;
   }
 }
 
diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h
index c30bb52641e865efe57659a551bc4b493634c6b9..35ca73802b48982ddf3ed7485b56f50221c9f28c 100644
--- a/paddle/fluid/operators/math/matrix_bit_code.h
+++ b/paddle/fluid/operators/math/matrix_bit_code.h
@@ -13,10 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <unordered_map>
+#include <utility>
+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/platform/device_context.h"
 
 #if defined(_WIN32)
diff --git a/paddle/fluid/operators/math/prelu.cu b/paddle/fluid/operators/math/prelu.cu
new file mode 100644
index 0000000000000000000000000000000000000000..701a802080f65ea32b95402682dc46362ccf0966
--- /dev/null
+++ b/paddle/fluid/operators/math/prelu.cu
@@ -0,0 +1,148 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/math/prelu.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+static const int CUDA_NUM_THREADS = 1024;
+static const int CUDA_MAX_NUM_BLOCKS = 65535;
+inline static int GET_NUM_BLOCKS(const int N) {
+  return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
+}
+
+template <typename T>
+__global__ void PReluChannelWiseKernel(const T *input, const T *alpha,
+                                       T *output, int channel,
+                                       size_t spatial_size) {
+  size_t offset = blockIdx.x * spatial_size;
+  const T *in = input + offset;
+  T *out = output + offset;
+  T scale = alpha[blockIdx.x % channel];
+
+  for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) {
+    T x = in[i];
+    out[i] = (x > 0) ? x : scale * x;
+  }
+}
+
+template <typename T>
+__global__ void PReluElementWiseKernel(const T *input, const T *alpha,
+                                       T *output, size_t spatial_size) {
+  size_t offset = blockIdx.x * spatial_size;
+  const T *in = input + offset;
+  const T *scale = alpha + offset;
+  T *out = output + offset;
+
+  for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) {
+    T x = in[i];
+    out[i] = (x > 0) ? x : scale[i] * x;
+  }
+}
+
+template <typename T>
+__global__ void PReluScalarKernel(const T *input, const T *alpha, T *output,
+                                  size_t spatial_size) {
+  size_t offset = blockIdx.x * spatial_size;
+  const T *in = input + offset;
+  T scale = *alpha;
+  T *out = output + offset;
+
+  for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) {
+    T x = in[i];
+    out[i] = (x > 0) ? x : scale * x;
+  }
+}
+
+template <typename T>
+static inline void PReluChannelWise(cudaStream_t stream, const T *input,
+                                    const T *alpha, T *output,
+                                    std::vector<int> input_shape) {
+  size_t unroll = input_shape[0] * input_shape[1];
+  size_t spatial_size = input_shape[2] * input_shape[3];
+  CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
+  PReluChannelWiseKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
+      input, alpha, output, input_shape[1], spatial_size);
+}
+
+template <typename T>
+static inline void PReluElementWise(cudaStream_t stream, const T *input,
+                                    const T *alpha, T *output,
+                                    std::vector<int> input_shape) {
+  size_t unroll = input_shape[0] * input_shape[1];
+  size_t spatial_size = input_shape[2] * input_shape[3];
+  CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
+  PReluElementWiseKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
+      input, alpha, output, spatial_size);
+}
+
+template <typename T>
+static inline void PReluScalar(cudaStream_t stream, const T *input,
+                               const T *alpha, T *output,
+                               std::vector<int> input_shape) {
+  size_t unroll = input_shape[0] * input_shape[1];
+  size_t spatial_size = input_shape[2] * input_shape[3];
+  CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
+  PReluScalarKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
+      input, alpha, output, spatial_size);
+}
+
+template <typename T>
+void PreluChannelWiseDirectCUDAFunctor<T>::operator()(
+    cudaStream_t stream, const T *input, const T *alpha, T *output,
+    std::vector<int> input_shape) {
+  size_t unroll = input_shape[0] * input_shape[1];
+  size_t spatial_size = input_shape[2] * input_shape[3];
+  CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
+  PReluChannelWiseKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
+      input, alpha, output, input_shape[1], spatial_size);
+}
+
+template <typename T>
+void PreluElementWiseDirectCUDAFunctor<T>::operator()(
+    cudaStream_t stream, const T *input, const T *alpha, T *output,
+    std::vector<int> input_shape) {
+  size_t unroll = input_shape[0] * input_shape[1];
+  size_t spatial_size = input_shape[2] * input_shape[3];
+  CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
+  PReluElementWiseKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
+      input, alpha, output, spatial_size);
+}
+
+template <typename T>
+void PreluScalarDirectCUDAFunctor<T>::operator()(cudaStream_t stream,
+                                                 const T *input, const T *alpha,
+                                                 T *output,
+                                                 std::vector<int> input_shape) {
+  size_t unroll = input_shape[0] * input_shape[1];
+  size_t spatial_size = input_shape[2] * input_shape[3];
+  CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
+  PReluScalarKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
+      input, alpha, output, spatial_size);
+}
+
+template class PreluChannelWiseDirectCUDAFunctor<float>;
+template class PreluChannelWiseDirectCUDAFunctor<double>;
+
+template class PreluElementWiseDirectCUDAFunctor<float>;
+template class PreluElementWiseDirectCUDAFunctor<double>;
+
+template class PreluScalarDirectCUDAFunctor<float>;
+template class PreluScalarDirectCUDAFunctor<double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/math/prelu.h b/paddle/fluid/operators/math/prelu.h
new file mode 100644
index 0000000000000000000000000000000000000000..3237c6d4cbf956aafb4046ea2ffa42efe62e7b28
--- /dev/null
+++ b/paddle/fluid/operators/math/prelu.h
@@ -0,0 +1,49 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <vector>
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/platform/cudnn_helper.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+#ifdef PADDLE_WITH_CUDA
+template <typename T>
+class PreluChannelWiseDirectCUDAFunctor {
+ public:
+  void operator()(cudaStream_t stream, const T *input, const T *alpha,
+                  T *output, std::vector<int> input_shape);
+};
+
+template <typename T>
+class PreluElementWiseDirectCUDAFunctor {
+ public:
+  void operator()(cudaStream_t stream, const T *input, const T *alpha,
+                  T *output, std::vector<int> input_shape);
+};
+
+template <typename T>
+class PreluScalarDirectCUDAFunctor {
+ public:
+  void operator()(cudaStream_t stream, const T *input, const T *alpha,
+                  T *output, std::vector<int> input_shape);
+};
+#endif
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/math/sequence2batch.cu b/paddle/fluid/operators/math/sequence2batch.cu
index be73adfc0cbe37ed8831b5ad34e66bc95e342e9d..9ab13659c1cc5b59d28395bcebcfb43fac5b4544 100644
--- a/paddle/fluid/operators/math/sequence2batch.cu
+++ b/paddle/fluid/operators/math/sequence2batch.cu
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/math/sequence2batch.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/math/softmax.cu b/paddle/fluid/operators/math/softmax.cu
index 2e9669049e36478549b793e3fa76220825888e21..71d137398267f61d8cc01907d6a9498eef8d62dc 100644
--- a/paddle/fluid/operators/math/softmax.cu
+++ b/paddle/fluid/operators/math/softmax.cu
@@ -11,9 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
-#define EIGEN_USE_GPU
-
 #include <vector>
 
 #include "paddle/fluid/operators/math/math_function.h"
diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h
index 0f3e5b20086378da8ef1138a5f5c005b724f7fa2..9e99e44822b2fce971b751967ca8076a1f1384ec 100644
--- a/paddle/fluid/operators/math/softmax_impl.h
+++ b/paddle/fluid/operators/math/softmax_impl.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/tensor.h"
 
@@ -100,11 +101,8 @@ class SoftmaxFunctor<DeviceContext, float, true, enable_if_CPU<DeviceContext>> {
 
     blas.VEXP(num_classes * batch_size, out_data, out_data);
     for (int n = 0; n < batch_size; ++n) {
-      entities[n] = out_data[n * num_classes];
-      for (int c = 1; c < num_classes; ++c) {
-        entities[n] += out_data[n * num_classes + c];
-      }
-      blas.SCAL(num_classes, 1.0f / entities[n], &out_data[n * num_classes]);
+      auto sum = blas.ASUM(num_classes, &out_data[n * num_classes], 1);
+      blas.SCAL(num_classes, 1.0f / sum, &out_data[n * num_classes]);
     }
   }
 };
diff --git a/paddle/fluid/operators/mean_op.cu b/paddle/fluid/operators/mean_op.cu
index 413b8ace67bd0a36849373812950834523b62216..921c2e1298906655767c1e7f30dc34b2c564c671 100644
--- a/paddle/fluid/operators/mean_op.cu
+++ b/paddle/fluid/operators/mean_op.cu
@@ -11,9 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
-#define EIGEN_USE_GPU
-
 #include "paddle/fluid/operators/mean_op.h"
 #include "paddle/fluid/platform/float16.h"
 
diff --git a/paddle/fluid/operators/merge_selected_rows_op.cc b/paddle/fluid/operators/merge_selected_rows_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3c15c839554599104d21a5225c078d41735c4a60
--- /dev/null
+++ b/paddle/fluid/operators/merge_selected_rows_op.cc
@@ -0,0 +1,72 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/merge_selected_rows_op.h"
+
+namespace paddle {
+namespace operators {
+
+class MergeSelectedRowsOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of MergeSelectedRowsOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of MergeSelectedRowsOp should not be null.");
+    ctx->ShareDim("X", /*->*/ "Out");
+  }
+};
+
+class MergeSelectedRowsOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "The input type is SelectedRows, and the selected rows may be "
+             "duplicated.");
+    AddOutput("Out",
+              "The output type is SelectedRows, and the selected rows are not "
+              "duplicated.");
+    AddComment(
+        R"DOC(
+MergeSelectedRows Operator.
+
+MergeSelectedRows is used to merge the duplicated rows of the input.
+)DOC");
+  }
+};
+
+class MergeSelectedRowsOpInferVarType
+    : public framework::PassInDtypeAndVarTypeToOutput {
+ protected:
+  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
+      const override {
+    return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Out"}};
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+REGISTER_OPERATOR(merge_selected_rows, ops::MergeSelectedRowsOp,
+                  ops::MergeSelectedRowsOpMaker,
+                  ops::MergeSelectedRowsOpInferVarType);
+
+REGISTER_OP_CPU_KERNEL(
+    merge_selected_rows,
+    ops::MergeSelectedRowsKernel<plat::CPUDeviceContext, float>,
+    ops::MergeSelectedRowsKernel<plat::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/merge_selected_rows_op.cu.cc b/paddle/fluid/operators/merge_selected_rows_op.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..90d5fb3eaeb1f155eeea29ea0cf3f5ecd610f5f0
--- /dev/null
+++ b/paddle/fluid/operators/merge_selected_rows_op.cu.cc
@@ -0,0 +1,23 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/merge_selected_rows_op.h"
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_CUDA_KERNEL(
+    merge_selected_rows,
+    ops::MergeSelectedRowsKernel<plat::CUDADeviceContext, float>,
+    ops::MergeSelectedRowsKernel<plat::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/merge_selected_rows_op.h b/paddle/fluid/operators/merge_selected_rows_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..4c977e94b175c988e4253b273365b0cabc4b87aa
--- /dev/null
+++ b/paddle/fluid/operators/merge_selected_rows_op.h
@@ -0,0 +1,36 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <string>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/selected_rows_functor.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class MergeSelectedRowsKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* x = context.Input<framework::SelectedRows>("X");
+    auto* out = context.Output<framework::SelectedRows>("Out");
+
+    math::scatter::MergeAdd<DeviceContext, T> merge_func;
+    merge_func(context.template device_context<DeviceContext>(), *x, out);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/metrics/auc_op.h b/paddle/fluid/operators/metrics/auc_op.h
index fb370842d1942c3b3eebecb1fe5e8ffb845cb34b..4ab5cfe53c67eeaa995d7e955eec63a065c5eec5 100644
--- a/paddle/fluid/operators/metrics/auc_op.h
+++ b/paddle/fluid/operators/metrics/auc_op.h
@@ -75,8 +75,13 @@ class AucKernel : public framework::OpKernel<T> {
     const auto *label_data = label->data<int64_t>();
 
     for (size_t i = 0; i < batch_size; i++) {
-      uint32_t binIdx = static_cast<uint32_t>(
-          inference_data[i * inference_width + 1] * num_thresholds);
+      auto predict_data = inference_data[i * inference_width + 1];
+      PADDLE_ENFORCE_LE(predict_data, 1,
+                        "The predict data must less or equal 1.");
+      PADDLE_ENFORCE_GE(predict_data, 0,
+                        "The predict data must gather or equal 0.");
+
+      uint32_t binIdx = static_cast<uint32_t>(predict_data * num_thresholds);
       if (label_data[i]) {
         (*stat_pos)[binIdx] += 1.0;
       } else {
diff --git a/paddle/fluid/operators/optimizers/adadelta_op.cu b/paddle/fluid/operators/optimizers/adadelta_op.cu
index 3fbfee5df05770a1206ab3170d3baffdd20bc77b..562a157f063b44d65254d556d44439eee3636c4c 100644
--- a/paddle/fluid/operators/optimizers/adadelta_op.cu
+++ b/paddle/fluid/operators/optimizers/adadelta_op.cu
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/optimizers/adadelta_op.h"
 
 namespace ops = paddle::operators;
diff --git a/paddle/fluid/operators/optimizers/adagrad_op.cu b/paddle/fluid/operators/optimizers/adagrad_op.cu
index 4efe56855a4bdca41d24f02c29a618a8d4232887..5043468d4c5f721ae0906b1a319eb3ec10b26580 100644
--- a/paddle/fluid/operators/optimizers/adagrad_op.cu
+++ b/paddle/fluid/operators/optimizers/adagrad_op.cu
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
 #include "paddle/fluid/operators/optimizers/adagrad_op.h"
diff --git a/paddle/fluid/operators/optimizers/adam_op.cu b/paddle/fluid/operators/optimizers/adam_op.cu
index e8090ebacfe85153aba9e275c9cd1c55fd7af15e..4eb2db717d45a730798eef48d3d10bce9d387c4b 100644
--- a/paddle/fluid/operators/optimizers/adam_op.cu
+++ b/paddle/fluid/operators/optimizers/adam_op.cu
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/optimizers/adam_op.h"
 
 namespace ops = paddle::operators;
diff --git a/paddle/fluid/operators/optimizers/adamax_op.cu b/paddle/fluid/operators/optimizers/adamax_op.cu
index e54adcb142fe0d50dad23fe5df14bd6f28220d8a..80e0219d4414db2909b5babc22599d8c0d906c7d 100644
--- a/paddle/fluid/operators/optimizers/adamax_op.cu
+++ b/paddle/fluid/operators/optimizers/adamax_op.cu
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/optimizers/adamax_op.h"
 
 namespace ops = paddle::operators;
diff --git a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cu b/paddle/fluid/operators/optimizers/decayed_adagrad_op.cu
index 84d65e39329659f82099011f9ec60468d5db6328..dc568802a2b19fee5c8d7fd8d07c929cba8ab4e3 100644
--- a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cu
+++ b/paddle/fluid/operators/optimizers/decayed_adagrad_op.cu
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/optimizers/decayed_adagrad_op.h"
 
 namespace ops = paddle::operators;
diff --git a/paddle/fluid/operators/optimizers/ftrl_op.cu b/paddle/fluid/operators/optimizers/ftrl_op.cu
index f836b75df93861a0fd670f2a0e786e6a797a4661..acf8e38ca0f5a3cf9899f4898898013e8a2afdd2 100644
--- a/paddle/fluid/operators/optimizers/ftrl_op.cu
+++ b/paddle/fluid/operators/optimizers/ftrl_op.cu
@@ -10,8 +10,6 @@ Unless required by applicable law or agreed to in writing, software distributed
 under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 CONDITIONS OF ANY KIND, either express or implied. See the License for the
 specific language governing permissions and limitations under the License. */
-
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/optimizers/ftrl_op.h"
 
 namespace ops = paddle::operators;
diff --git a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cu b/paddle/fluid/operators/optimizers/proximal_adagrad_op.cu
index d1c1f747b70c3ceb806da06e6786a70b62a32995..591dead3b12763e4cd1b9c390a87816ab121fbf8 100644
--- a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cu
+++ b/paddle/fluid/operators/optimizers/proximal_adagrad_op.cu
@@ -10,8 +10,6 @@ Unless required by applicable law or agreed to in writing, software distributed
 under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 CONDITIONS OF ANY KIND, either express or implied. See the License for the
 specific language governing permissions and limitations under the License. */
-
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/optimizers/proximal_adagrad_op.h"
 
 namespace ops = paddle::operators;
diff --git a/paddle/fluid/operators/optimizers/proximal_gd_op.cu b/paddle/fluid/operators/optimizers/proximal_gd_op.cu
index 7aa0e1015008eba0c1cf63ba1278dc2b8049b20b..d556fa74f19529d0e2f80d4c6dbfca62498c9dcc 100644
--- a/paddle/fluid/operators/optimizers/proximal_gd_op.cu
+++ b/paddle/fluid/operators/optimizers/proximal_gd_op.cu
@@ -10,8 +10,6 @@ Unless required by applicable law or agreed to in writing, software distributed
 under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 CONDITIONS OF ANY KIND, either express or implied. See the License for the
 specific language governing permissions and limitations under the License. */
-
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/optimizers/proximal_gd_op.h"
 
 namespace ops = paddle::operators;
diff --git a/paddle/fluid/operators/optimizers/rmsprop_op.cu b/paddle/fluid/operators/optimizers/rmsprop_op.cu
index 69e35a309e04f61068d9ff1b6d9f1450d2524253..8b17d6a0204045a9b20adb79dbad72dff5ba267e 100644
--- a/paddle/fluid/operators/optimizers/rmsprop_op.cu
+++ b/paddle/fluid/operators/optimizers/rmsprop_op.cu
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/optimizers/rmsprop_op.h"
 
 namespace ops = paddle::operators;
diff --git a/paddle/fluid/operators/pad2d_op.cc b/paddle/fluid/operators/pad2d_op.cc
index a706d05fd7c35ef993f5199f0f893622cb863c5d..a9da21f47902e20cc7460461caca79c3f3292c5a 100644
--- a/paddle/fluid/operators/pad2d_op.cc
+++ b/paddle/fluid/operators/pad2d_op.cc
@@ -319,20 +319,46 @@ void Pad2DGradEdgeNHWC(T* d_in_data, const int num, const int channels,
   }
 }
 
+static inline void GetPaddings(int* paddings,
+                               const framework::ExecutionContext& context) {
+  auto* paddings_t = context.Input<Tensor>("Paddings");
+  if (paddings_t) {
+    auto paddings_data = paddings_t->data<int>();
+    paddings[0] = paddings_data[0];
+    paddings[1] = paddings_data[1];
+    paddings[2] = paddings_data[2];
+    paddings[3] = paddings_data[3];
+  } else {
+    auto pads = context.Attr<std::vector<int>>("paddings");
+    std::copy(pads.begin(), pads.end(), paddings);
+  }
+}
+
 template <typename T>
 class Pad2dCPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto pads = context.Attr<std::vector<int>>("paddings");
+    int pads[4];
+    GetPaddings(pads, context);
     auto mode = context.Attr<std::string>("mode");
     auto data_format = context.Attr<std::string>("data_format");
     T value = context.Attr<T>("pad_value");
+
     auto* x = context.Input<Tensor>("X");
-    auto* out = context.Output<Tensor>("Out");
     auto in_dims = x->dims();
-    auto out_dims = out->dims();
     const T* in_data = x->data<T>();
+
+    auto* out = context.Output<Tensor>("Out");
+    if (data_format == "NCHW") {
+      out->Resize({in_dims[0], in_dims[1], in_dims[2] + pads[0] + pads[1],
+                   in_dims[3] + pads[2] + pads[3]});
+    } else {
+      out->Resize({in_dims[0], in_dims[1] + pads[0] + pads[1],
+                   in_dims[2] + pads[2] + pads[3], in_dims[3]});
+    }
+    auto out_dims = out->dims();
     T* out_data = out->mutable_data<T>(context.GetPlace());
+
     const int pad_top = pads[0];
     const int pad_left = pads[2];
     const int num = in_dims[0];
@@ -376,7 +402,8 @@ template <typename T>
 class Pad2dGradCPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto pads = context.Attr<std::vector<int>>("paddings");
+    int pads[4];
+    GetPaddings(pads, context);
     auto mode = context.Attr<std::string>("mode");
     auto data_format = context.Attr<std::string>("data_format");
     auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
@@ -442,21 +469,35 @@ class Pad2dOp : public framework::OperatorWithKernel {
                    "Output(Out) of Pad2dOp should not be null.");
 
     auto x_dim = ctx->GetInputDim("X");
-    auto paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
     PADDLE_ENFORCE_EQ(x_dim.size(), 4,
-                      "Size of paddings should be equal to 4.");
-    std::vector<int64_t> out_dims(x_dim.size());
+                      "The size of input(X)'s dimension should be equal to 4.");
 
+    std::vector<int64_t> out_dims(x_dim.size());
     auto data_format = ctx->Attrs().Get<std::string>("data_format");
     out_dims[0] = x_dim[0];
-    if (data_format == "NCHW") {
+    if (ctx->HasInput("Paddings")) {
+      auto paddings_dim = ctx->GetInputDim("Paddings");
+      PADDLE_ENFORCE_EQ(
+          paddings_dim.size(), 1,
+          "Size of Input(Paddings)'s dimension should be equal to 1.");
+      PADDLE_ENFORCE_EQ(paddings_dim[0], 4,
+                        "Shape of Input(Paddings) should be equal to [4].");
       out_dims[1] = x_dim[1];
-      out_dims[2] = x_dim[2] + paddings[0] + paddings[1];  // height
-      out_dims[3] = x_dim[3] + paddings[2] + paddings[3];  // width
-    } else {                                               // NHWC
+      out_dims[2] = x_dim[2];
       out_dims[3] = x_dim[3];
-      out_dims[1] = x_dim[1] + paddings[0] + paddings[1];
-      out_dims[2] = x_dim[2] + paddings[2] + paddings[3];
+    } else {
+      auto paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
+      PADDLE_ENFORCE_EQ(paddings.size(), 4,
+                        "Size of paddings should be equal to 4.");
+      if (data_format == "NCHW") {
+        out_dims[1] = x_dim[1];
+        out_dims[2] = x_dim[2] + paddings[0] + paddings[1];  // height
+        out_dims[3] = x_dim[3] + paddings[2] + paddings[3];  // width
+      } else {                                               // NHWC
+        out_dims[3] = x_dim[3];
+        out_dims[1] = x_dim[1] + paddings[0] + paddings[1];
+        out_dims[2] = x_dim[2] + paddings[2] + paddings[3];
+      }
     }
 
     ctx->SetOutputDim("Out", framework::make_ddim(out_dims));
@@ -466,6 +507,13 @@ class Pad2dOp : public framework::OperatorWithKernel {
       ctx->ShareLoD("X", /*->*/ "Out");
     }
   }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace());
+  }
 };
 
 class Pad2dOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -477,6 +525,12 @@ class Pad2dOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("Out",
               "The output of pad2d op. "
               "A tensor with the same shape as X.");
+    AddInput("Paddings",
+             "A 1-D tensor to describe the padding rules."
+             "paddings=[0, 1, 2, 3] means "
+             "padding 0 row to top, 1 row to bottom, 2 columns to left "
+             "and 3 columns to right. Size of paddings must be 4.")
+        .AsDispensable();
     AddAttr<std::vector<int>>(
         "paddings",
         "(vector<int>) "
@@ -554,6 +608,13 @@ class Pad2dOpGrad : public framework::OperatorWithKernel {
       ctx->SetOutputDim(x_grad_name, x_dims);
     }
   }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace());
+  }
 };
 
 class Pad2dOpGradMaker : public framework::SingleGradOpDescMaker {
@@ -564,6 +625,7 @@ class Pad2dOpGradMaker : public framework::SingleGradOpDescMaker {
   std::unique_ptr<framework::OpDesc> Apply() const override {
     auto* bind = new framework::OpDesc();
     bind->SetInput("X", Input("X"));
+    bind->SetInput("Paddings", Input("Paddings"));
     bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
     bind->SetOutput(framework::GradVarName("X"), InputGrad("X"));
     bind->SetAttrMap(Attrs());
diff --git a/paddle/fluid/operators/pad2d_op.cu b/paddle/fluid/operators/pad2d_op.cu
index 9ba0ddbd84a43cfd5f028ce072b5c7606fae343d..72eca08b06b144335424a669241b5754beda758d 100644
--- a/paddle/fluid/operators/pad2d_op.cu
+++ b/paddle/fluid/operators/pad2d_op.cu
@@ -287,20 +287,50 @@ __global__ void Pad2DGradEdgeNHWC(const int out_size, T* d_in_data,
   }
 }
 
+static inline void GetPaddings(int* paddings,
+                               const framework::ExecutionContext& context) {
+  auto* paddings_t = context.Input<Tensor>("Paddings");
+  if (paddings_t) {
+    Tensor pads;
+    framework::TensorCopySync(*paddings_t, platform::CPUPlace(), &pads);
+    auto pads_data = pads.data<int>();
+    paddings[0] = pads_data[0];
+    paddings[1] = pads_data[1];
+    paddings[2] = pads_data[2];
+    paddings[3] = pads_data[3];
+  } else {
+    auto pads = context.Attr<std::vector<int>>("paddings");
+    std::copy(pads.begin(), pads.end(), paddings);
+  }
+}
+
 template <typename T>
 class Pad2dCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto pads = context.Attr<std::vector<int>>("paddings");
+    int pads[4];
+    GetPaddings(pads, context);
     auto mode = context.Attr<std::string>("mode");
     auto data_format = context.Attr<std::string>("data_format");
     T value = context.Attr<T>("pad_value");
+
     auto* x = context.Input<Tensor>("X");
-    auto* out = context.Output<Tensor>("Out");
     auto in_dims = x->dims();
-    auto out_dims = out->dims();
     const T* in_data = x->data<T>();
-    T* out_data = out->mutable_data<T>(context.GetPlace());
+    auto* out = context.Output<Tensor>("Out");
+    auto out_dims = out->dims();
+    if (data_format == "NCHW") {
+      out_dims[0] = in_dims[0];
+      out_dims[1] = in_dims[1];
+      out_dims[2] = in_dims[2] + pads[0] + pads[1];
+      out_dims[3] = in_dims[3] + pads[2] + pads[3];
+    } else {
+      out_dims[0] = in_dims[0];
+      out_dims[1] = in_dims[1] + pads[0] + pads[1];
+      out_dims[2] = in_dims[2] + pads[2] + pads[3];
+      out_dims[3] = in_dims[3];
+    }
+    T* out_data = out->mutable_data<T>(out_dims, context.GetPlace());
     const int pad_top = pads[0];
     const int pad_left = pads[2];
     const int num = in_dims[0];
@@ -356,7 +386,8 @@ template <typename T>
 class Pad2dGradCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto pads = context.Attr<std::vector<int>>("paddings");
+    int pads[4];
+    GetPaddings(pads, context);
     auto mode = context.Attr<std::string>("mode");
     auto data_format = context.Attr<std::string>("data_format");
     auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
diff --git a/paddle/fluid/operators/pad_constant_like_op.cu b/paddle/fluid/operators/pad_constant_like_op.cu
index ea69577904577de353b63491973bf74b7724e18e..9e62a6dc9d34a96c59a08d0e5fd6cdd9f0d6d51d 100644
--- a/paddle/fluid/operators/pad_constant_like_op.cu
+++ b/paddle/fluid/operators/pad_constant_like_op.cu
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/pad_constant_like_op.h"
 
 namespace ops = paddle::operators;
diff --git a/paddle/fluid/operators/pad_op.cu b/paddle/fluid/operators/pad_op.cu
index 9cddef9cf1d3c43701a4f0ed3f70dcb30c1dbd02..95098a8dca36594c3af60ad8488217e71c673a75 100644
--- a/paddle/fluid/operators/pad_op.cu
+++ b/paddle/fluid/operators/pad_op.cu
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/pad_op.h"
 
 namespace ops = paddle::operators;
diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc
index 58cfbb76e93a1c15c9b7cf9f9e596066c29b7ebb..64d94ab6044c1992145062319120b0372f5061c0 100644
--- a/paddle/fluid/operators/prelu_op.cc
+++ b/paddle/fluid/operators/prelu_op.cc
@@ -58,7 +58,7 @@ class PReluOp : public framework::OperatorWithKernel {
       const framework::ExecutionContext &ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<Tensor>("X")->type()),
-        platform::CPUPlace());
+        ctx.device_context());
   }
 };
 
diff --git a/paddle/fluid/operators/prelu_op.cu b/paddle/fluid/operators/prelu_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..36b5259ae5106914f5668625cad535ebc8aa72ec
--- /dev/null
+++ b/paddle/fluid/operators/prelu_op.cu
@@ -0,0 +1,64 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/prelu.h"
+#include "paddle/fluid/operators/prelu_op.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename DeviceContext, typename T>
+class CUDAPReluKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* x = context.Input<Tensor>("X");
+    auto* alpha = context.Input<Tensor>("Alpha");
+    auto* out = context.Output<Tensor>("Out");
+
+    const T* x_ptr = x->data<T>();
+    T* o_ptr = out->mutable_data<T>(context.GetPlace());
+
+    const T* alpha_ptr = alpha->data<T>();
+    auto& mode = context.Attr<std::string>("mode");
+
+    int numel = x->numel();
+    auto dim = x->dims();
+    std::vector<int> input_shape = framework::vectorize2int(dim);
+
+    if (mode == "channel") {
+      math::PreluChannelWiseDirectCUDAFunctor<T> prelu_channel_wise;
+      prelu_channel_wise(context.cuda_device_context().stream(), x_ptr,
+                         alpha_ptr, o_ptr, input_shape);
+    } else if (mode == "element") {
+      math::PreluElementWiseDirectCUDAFunctor<T> prelu_element_wise;
+      prelu_element_wise(context.cuda_device_context().stream(), x_ptr,
+                         alpha_ptr, o_ptr, input_shape);
+    } else {
+      math::PreluScalarDirectCUDAFunctor<T> prelu_scalar;
+      prelu_scalar(context.cuda_device_context().stream(), x_ptr, alpha_ptr,
+                   o_ptr, input_shape);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    prelu, ops::CUDAPReluKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::CUDAPReluKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/psroi_pool_op.cc b/paddle/fluid/operators/psroi_pool_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6978d9c5dc5993e64793f420a63dcca020f47868
--- /dev/null
+++ b/paddle/fluid/operators/psroi_pool_op.cc
@@ -0,0 +1,173 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/psroi_pool_op.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+
+class PSROIPoolOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "(Tensor), "
+             "the input of PSROIPoolOp. "
+             "The format of input tensor is NCHW. Where N is the batch size, "
+             "C is the number of input channels, "
+             "H is the height of the input feature map, and "
+             "W is the width.");
+    AddInput("ROIs",
+             "(LoDTensor), "
+             "ROIs (Regions of Interest) to pool over. "
+             "should be a 2-D LoDTensor of shape (num_rois, 4) "
+             "given as [(x1, y1, x2, y2), ...]. "
+             "where (x1, y1) is the top left coordinates, and "
+             "(x2, y2) is the bottom right coordinates. "
+             "The roi batch index can be calculated from LoD.");
+    AddOutput("Out",
+              "(Tensor), "
+              "the output of PSROIPoolOp is a 4-D Tensor with shape "
+              "(num_rois, output_channels, pooled_h, pooled_w).");
+    AddAttr<int>(
+        "output_channels",
+        "(int), "
+        "the number of channels of the output feature map. "
+        "For a task of C classes of objects, output_channels should be "
+        "(C + 1) for classification only.");
+    AddAttr<float>("spatial_scale",
+                   "(float, default 1.0), "
+                   "Multiplicative spatial scale factor "
+                   "to translate ROI coords from their input scale "
+                   "to the scale used when pooling.")
+        .SetDefault(1.0);
+    AddAttr<int>("pooled_height",
+                 "(int, default 1), "
+                 "the pooled output height.")
+        .SetDefault(1);
+    AddAttr<int>("pooled_width",
+                 "(int, default 1), "
+                 "the pooled output width.")
+        .SetDefault(1);
+    AddComment(R"Doc(
+**PSROIPool Operator**
+
+Position sensitive region of interest pooling (also known as PSROIPooling) is to perform
+position-sensitive average pooling on regions of interest specified by input, takes as 
+input N position-sensitive score maps and a list of num_rois regions of interest. 
+
+PSROIPooling for R-FCN. Please refer to https://arxiv.org/abs/1605.06409 for more details.
+    )Doc");
+  }
+};
+
+class PSROIPoolOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of PSROIPoolOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("ROIs"),
+                   "Input(ROIs) of PSROIPoolOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of PSROIPoolOp should not be null.");
+    auto input_dims = ctx->GetInputDim("X");
+    auto rois_dims = ctx->GetInputDim("ROIs");
+
+    PADDLE_ENFORCE(input_dims.size() == 4,
+                   "The format of input tensor is NCHW");
+    PADDLE_ENFORCE(rois_dims.size() == 2,
+                   "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) "
+                   "given as [(x1, y1, x2, y2), ...]");
+    PADDLE_ENFORCE(rois_dims[1] == 4,
+                   "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) "
+                   "given as [(x1, y1, x2, y2), ...]");
+
+    int pooled_height = ctx->Attrs().Get<int>("pooled_height");
+    int pooled_width = ctx->Attrs().Get<int>("pooled_width");
+    int output_channels = ctx->Attrs().Get<int>("output_channels");
+    float spatial_scale = ctx->Attrs().Get<float>("spatial_scale");
+
+    PADDLE_ENFORCE(
+        input_dims[1] == output_channels * pooled_height * pooled_width,
+        "the channel of X(%d) should be equal to the product of "
+        "output_channels(%d), pooled_height(%d) and pooled_width(%d)",
+        input_dims[1], output_channels, pooled_height, pooled_width);
+
+    PADDLE_ENFORCE_GT(pooled_height, 0,
+                      "The pooled output height must be greater than 0");
+    PADDLE_ENFORCE_GT(pooled_width, 0,
+                      "The pooled output width must be greater than 0");
+    PADDLE_ENFORCE_GT(output_channels, 1,
+                      "The pooled output channels must greater than 1");
+    PADDLE_ENFORCE_GT(spatial_scale, 0.0f,
+                      "The spatial scale must greater than 0.");
+
+    auto out_dims = input_dims;
+    out_dims[0] = rois_dims[0];
+    out_dims[1] =
+        output_channels;  // input_dims[1] / (pooled_height * pooled_width);
+    out_dims[2] = pooled_height;
+    out_dims[3] = pooled_width;
+    ctx->SetOutputDim("Out", out_dims);
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
+        ctx.device_context());
+  }
+};
+
+class PSROIPoolGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "The gradient of Out should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
+                   "The gradient of X should not be null.");
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
+        ctx.device_context());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(psroi_pool, ops::PSROIPoolOp, ops::PSROIPoolOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(psroi_pool_grad, ops::PSROIPoolGradOp);
+REGISTER_OP_CPU_KERNEL(
+    psroi_pool,
+    ops::CPUPSROIPoolOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::CPUPSROIPoolOpKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    psroi_pool_grad,
+    ops::CPUPSROIPoolGradOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::CPUPSROIPoolGradOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/psroi_pool_op.cu b/paddle/fluid/operators/psroi_pool_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..22fec3244fabe5ca466202784c0cce372d0bf6e5
--- /dev/null
+++ b/paddle/fluid/operators/psroi_pool_op.cu
@@ -0,0 +1,294 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/psroi_pool_op.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+
+static constexpr int kNumCUDAThreads = 512;
+static constexpr int kNumMaximumNumBlocks = 4096;
+
+static inline int NumBlocks(const int N) {
+  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
+                  kNumMaximumNumBlocks);
+}
+
+template <typename T>
+__global__ void GPUPSROIPoolForward(
+    const int nthreads, const T* input_data, const T* input_rois,
+    const float spatial_scale, const int input_channels, const int height,
+    const int width, const int output_channels, const int pooled_height,
+    const int pooled_width, const int* rois_batch_id_data, T* output_data) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int offset = blockDim.x * gridDim.x;
+  for (size_t i = index; i < nthreads; i += offset) {
+    // The output is in order (n, c, ph, pw)
+    int pw = i % pooled_width;
+    int ph = (i / pooled_width) % pooled_height;
+    int c = (i / pooled_width / pooled_height) % output_channels;
+    int n = i / pooled_width / pooled_height / output_channels;
+
+    // set roi_batch_id
+    int roi_batch_id = rois_batch_id_data[n];
+
+    // [start, end) interval for spatial sampling
+    const T* offset_input_rois = input_rois + n * 4;
+    T roi_start_w = static_cast<T>(round(offset_input_rois[0])) * spatial_scale;
+    T roi_start_h = static_cast<T>(round(offset_input_rois[1])) * spatial_scale;
+    T roi_end_w =
+        static_cast<T>(round(offset_input_rois[2]) + 1.) * spatial_scale;
+    T roi_end_h =
+        static_cast<T>(round(offset_input_rois[3]) + 1.) * spatial_scale;
+
+    // Force too small ROIs to be 1x1
+    T roi_height = max(roi_end_h - roi_start_h, (T)0.1);  // avoid 0
+    T roi_width = max(roi_end_w - roi_start_w, (T)0.1);
+
+    // Compute w and h at input feature map
+    T bin_size_h = roi_height / static_cast<T>(pooled_height);
+    T bin_size_w = roi_width / static_cast<T>(pooled_width);
+
+    int hstart = floor(bin_size_h * static_cast<T>(ph) + roi_start_h);
+    int wstart = floor(bin_size_w * static_cast<T>(pw) + roi_start_w);
+    int hend = ceil(bin_size_h * static_cast<T>(ph + 1) + roi_start_h);
+    int wend = ceil(bin_size_w * static_cast<T>(pw + 1) + roi_start_w);
+
+    // Add roi offsets and clip to input boundaries
+    hstart = min(max(hstart, 0), height);
+    hend = min(max(hend, 0), height);
+    wstart = min(max(wstart, 0), width);
+    wend = min(max(wend, 0), width);
+    bool is_empty = (hend <= hstart) || (wend <= wstart);
+
+    int input_channel = (c * pooled_height + ph) * pooled_width + pw;
+    const T* offset_input_data =
+        input_data +
+        (roi_batch_id * input_channels + input_channel) * height * width;
+    T outsum = 0;
+
+    for (int ih = hstart; ih < hend; ++ih) {
+      for (int iw = wstart; iw < wend; ++iw) {
+        int input_index = ih * width + iw;
+        outsum += offset_input_data[input_index];
+      }
+    }
+
+    T bin_area = static_cast<T>((hend - hstart) * (wend - wstart));
+    output_data[i] = is_empty ? 0. : outsum / bin_area;
+  }
+}
+
+template <typename T>
+__global__ void GPUPSROIPoolBackward(
+    const int nthreads, const T* input_rois, const T* output_grad_data,
+    const float spatial_scale, const int input_channels, const int height,
+    const int width, const int output_channels, const int pooled_height,
+    const int pooled_width, const int* rois_batch_id_data, T* input_grad_data) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int offset = blockDim.x * gridDim.x;
+  for (int i = index; i < nthreads; i += offset) {
+    // The output is in order (n, c, ph, pw)
+    int pw = i % pooled_width;
+    int ph = (i / pooled_width) % pooled_height;
+    int c = (i / pooled_width / pooled_height) % output_channels;
+    int n = i / pooled_width / pooled_height / output_channels;
+
+    // set roi_batch_id
+    int roi_batch_id = rois_batch_id_data[n];
+    int input_channel = (c * pooled_height + ph) * pooled_width + pw;
+    int input_offset =
+        (roi_batch_id * input_channels + input_channel) * height * width;
+    T* offset_input_grad_data = input_grad_data + input_offset;
+
+    // [start, end) interval for spatial sampling
+    const T* offset_input_rois = input_rois + n * 4;
+    T roi_start_w = static_cast<T>(round(offset_input_rois[0])) * spatial_scale;
+    T roi_start_h = static_cast<T>(round(offset_input_rois[1])) * spatial_scale;
+    T roi_end_w =
+        static_cast<T>(round(offset_input_rois[2]) + 1.) * spatial_scale;
+    T roi_end_h =
+        static_cast<T>(round(offset_input_rois[3]) + 1.) * spatial_scale;
+
+    // Force too small ROIs to be 1x1
+    T roi_height = max(roi_end_h - roi_start_h, (T)0.1);  // avoid 0
+    T roi_width = max(roi_end_w - roi_start_w, (T)0.1);
+
+    // Compute w and h at input feature map
+    T bin_size_h = roi_height / static_cast<T>(pooled_height);
+    T bin_size_w = roi_width / static_cast<T>(pooled_width);
+
+    int hstart = floor(bin_size_h * static_cast<T>(ph) + roi_start_h);
+    int wstart = floor(bin_size_w * static_cast<T>(pw) + roi_start_w);
+    int hend = ceil(bin_size_h * static_cast<T>(ph + 1) + roi_start_h);
+    int wend = ceil(bin_size_w * static_cast<T>(pw + 1) + roi_start_w);
+
+    // Add roi offsets and clip to input boundaries
+    hstart = min(max(hstart, 0), height);
+    hend = min(max(hend, 0), height);
+    wstart = min(max(wstart, 0), width);
+    wend = min(max(wend, 0), width);
+    bool is_empty = (hend <= hstart) || (wend <= wstart);
+
+    // Accumulate diff_val into input data
+    T bin_area = static_cast<T>((hend - hstart) * (wend - wstart));
+    T diff_val = is_empty ? 0. : output_grad_data[i] / bin_area;
+    for (int ih = hstart; ih < hend; ++ih) {
+      for (int iw = wstart; iw < wend; ++iw) {
+        int input_index = ih * width + iw;
+        platform::CudaAtomicAdd(offset_input_grad_data + input_index, diff_val);
+      }
+    }
+  }
+}
+
+template <typename Place, typename T>
+class GPUPSROIPoolOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<Tensor>("X");
+    auto* rois = ctx.Input<LoDTensor>("ROIs");
+    auto* out = ctx.Output<Tensor>("Out");
+
+    auto pooled_height = ctx.Attr<int>("pooled_height");
+    auto pooled_width = ctx.Attr<int>("pooled_width");
+    auto output_channels = ctx.Attr<int>("output_channels");
+    auto spatial_scale = ctx.Attr<float>("spatial_scale");
+
+    auto in_dims = in->dims();
+    int batch_size = in_dims[0];
+    int input_channels = in_dims[1];
+    int height = in_dims[2];
+    int width = in_dims[3];
+
+    PADDLE_ENFORCE_EQ(input_channels,
+                      output_channels * pooled_height * pooled_width,
+                      "the channels of input X should equal the product of "
+                      "output_channels x pooled_height x pooled_width");
+
+    int rois_num = rois->dims()[0];
+    if (rois_num == 0) return;
+
+    auto rois_lod = rois->lod().back();
+    int rois_batch_size = rois_lod.size() - 1;
+    PADDLE_ENFORCE_EQ(
+        rois_batch_size, batch_size,
+        "The rois_batch_size and input(X) batch_size must be the same.");
+    int rois_num_with_lod = rois_lod[rois_batch_size];
+    PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod,
+                      "The rois_num from input and lod must be the same.");
+
+    // set rois batch id
+    framework::Tensor rois_batch_id_list;
+    rois_batch_id_list.Resize({rois_num});
+    int* rois_batch_id_data =
+        rois_batch_id_list.mutable_data<int>(platform::CPUPlace());
+    for (int n = 0; n < rois_batch_size; ++n) {
+      for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
+        rois_batch_id_data[i] = n;
+      }
+    }
+
+    framework::Tensor rois_batch_id_list_gpu;
+    framework::TensorCopy(rois_batch_id_list, ctx.GetPlace(),
+                          ctx.device_context(), &rois_batch_id_list_gpu);
+
+    int output_size = out->numel();
+    int blocks = NumBlocks(output_size);
+    int threads = kNumCUDAThreads;
+
+    // call cuda kernel function
+    GPUPSROIPoolForward<
+        T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
+        output_size, in->data<T>(), rois->data<T>(), spatial_scale,
+        input_channels, height, width, output_channels, pooled_height,
+        pooled_width, rois_batch_id_list_gpu.data<int>(),
+        out->mutable_data<T>(ctx.GetPlace()));
+  }
+};
+
+template <typename Place, typename T>
+class GPUPSROIPoolGradOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<Tensor>("X");
+    auto* rois = ctx.Input<LoDTensor>("ROIs");
+
+    auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
+
+    auto pooled_height = ctx.Attr<int>("pooled_height");
+    auto pooled_width = ctx.Attr<int>("pooled_width");
+    auto output_channels = ctx.Attr<int>("output_channels");
+    auto spatial_scale = ctx.Attr<float>("spatial_scale");
+
+    int rois_num = rois->dims()[0];
+    int input_channels = in->dims()[1];
+    int height = in->dims()[2];
+    int width = in->dims()[3];
+
+    if (input_grad) {
+      // set roi batch id
+      framework::Tensor rois_batch_id_list;
+      rois_batch_id_list.Resize({rois_num});
+      int* rois_batch_id_data =
+          rois_batch_id_list.mutable_data<int>(platform::CPUPlace());
+      auto rois_lod = rois->lod().back();
+      int rois_batch_size = rois_lod.size() - 1;
+      for (int n = 0; n < rois_batch_size; ++n) {
+        for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
+          rois_batch_id_data[i] = n;
+        }
+      }
+
+      framework::Tensor rois_batch_id_list_gpu;
+      framework::TensorCopy(rois_batch_id_list, ctx.GetPlace(),
+                            ctx.device_context(), &rois_batch_id_list_gpu);
+
+      input_grad->mutable_data<T>(ctx.GetPlace());
+      math::SetConstant<Place, T> set_zero;
+      set_zero(ctx.cuda_device_context(), input_grad, static_cast<T>(0));
+
+      int output_grad_size = output_grad->numel();
+      int blocks = NumBlocks(output_grad_size);
+      int threads = kNumCUDAThreads;
+
+      if (output_grad_size > 0) {
+        GPUPSROIPoolBackward<
+            T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
+            output_grad_size, rois->data<T>(), output_grad->data<T>(),
+            spatial_scale, input_channels, height, width, output_channels,
+            pooled_height, pooled_width, rois_batch_id_list_gpu.data<int>(),
+            input_grad->mutable_data<T>(ctx.GetPlace()));
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    psroi_pool,
+    ops::GPUPSROIPoolOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::GPUPSROIPoolOpKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    psroi_pool_grad,
+    ops::GPUPSROIPoolGradOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::GPUPSROIPoolGradOpKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/psroi_pool_op.h b/paddle/fluid/operators/psroi_pool_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..1a424728f7f6c4034242fb998d5121804e38702b
--- /dev/null
+++ b/paddle/fluid/operators/psroi_pool_op.h
@@ -0,0 +1,253 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <algorithm>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class CPUPSROIPoolOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<framework::Tensor>("X");
+    auto* rois = ctx.Input<framework::LoDTensor>("ROIs");
+    auto* out = ctx.Output<framework::Tensor>("Out");
+
+    auto pooled_height = ctx.Attr<int>("pooled_height");
+    auto pooled_width = ctx.Attr<int>("pooled_width");
+    auto spatial_scale = ctx.Attr<float>("spatial_scale");
+    auto output_channels = ctx.Attr<int>("output_channels");
+
+    auto in_dims = in->dims();
+    int batch_size = in_dims[0];
+    int input_channels = in_dims[1];
+    int height = in_dims[2];
+    int width = in_dims[3];
+    int rois_num = rois->dims()[0];
+
+    auto in_stride = framework::stride(in_dims);
+    auto roi_stride = framework::stride(rois->dims());
+    auto out_stride = framework::stride(out->dims());
+
+    const T* input_data = in->data<T>();
+
+    framework::Tensor rois_batch_id_list;
+    rois_batch_id_list.Resize({rois_num});
+    int* rois_batch_id_data =
+        rois_batch_id_list.mutable_data<int>(ctx.GetPlace());
+
+    auto rois_lod = rois->lod().back();
+    int rois_batch_size = rois_lod.size() - 1;
+    PADDLE_ENFORCE_EQ(
+        rois_batch_size, batch_size,
+        "the rois_batch_size and input(X) batch_size should be the same.");
+    int rois_num_with_lod = rois_lod[rois_batch_size];
+    PADDLE_ENFORCE_EQ(rois_num_with_lod, rois_num,
+                      "the rois_num from input and lod must be the same");
+
+    PADDLE_ENFORCE_EQ(input_channels,
+                      output_channels * pooled_height * pooled_width,
+                      "the channels of input X should equal the product of "
+                      "output_channels x pooled_height x pooled_width");
+
+    // calculate batch id index for each roi according to LoD
+    for (int n = 0; n < rois_batch_size; ++n) {
+      for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
+        rois_batch_id_data[i] = n;
+      }
+    }
+
+    T* output_data = out->mutable_data<T>(ctx.GetPlace());
+    const T* input_rois = rois->data<T>();
+
+    // calculate psroipooling, parallel processing can be implemented per ROI
+    for (int n = 0; n < rois_num; ++n) {
+      // set roi batch id
+      int roi_batch_id = rois_batch_id_data[n];
+
+      // [start, end) interval for spatial sampling
+      const T* offset_input_rois = input_rois + n * 4;
+      T roi_start_w =
+          static_cast<T>(round(offset_input_rois[0])) * spatial_scale;
+      T roi_start_h =
+          static_cast<T>(round(offset_input_rois[1])) * spatial_scale;
+      T roi_end_w =
+          static_cast<T>(round(offset_input_rois[2]) + 1.) * spatial_scale;
+      T roi_end_h =
+          static_cast<T>(round(offset_input_rois[3]) + 1.) * spatial_scale;
+
+      // Force too small rois to be 1 x 1
+      T roi_height = std::max(roi_end_h - roi_start_h, (T)0.1);  // avoid 0
+      T roi_width = std::max(roi_end_w - roi_start_w, (T)0.1);
+
+      // Compute bin size w and h at input feature map
+      T bin_size_h = roi_height / static_cast<T>(pooled_height);
+      T bin_size_w = roi_width / static_cast<T>(pooled_width);
+
+      // calculate each pixel of the output feature map.
+      int out_roi_offset = n * out_stride[0];
+      for (int c = 0; c < output_channels; ++c) {
+        // per category
+        int out_plane_offset = out_roi_offset + c * out_stride[1];
+        for (int ph = 0; ph < pooled_height; ++ph) {
+          int out_row_offset = out_plane_offset + ph * out_stride[2];
+          for (int pw = 0; pw < pooled_width; ++pw) {
+            // calculate w and h at input feature map
+            int hstart = floor(static_cast<T>(ph) * bin_size_h + roi_start_h);
+            int wstart = floor(static_cast<T>(pw) * bin_size_w + roi_start_w);
+            int hend = ceil(static_cast<T>(ph + 1) * bin_size_h + roi_start_h);
+            int wend = ceil(static_cast<T>(pw + 1) * bin_size_w + roi_start_w);
+            //  Add roi offsets and clip to input boundaries
+            hstart = std::min(std::max(hstart, 0), height);
+            wstart = std::min(std::max(wstart, 0), width);
+            hend = std::min(std::max(hend, 0), height);
+            wend = std::min(std::max(wend, 0), width);
+
+            int output_index = out_row_offset + pw;
+            int input_channel = (c * pooled_height + ph) * pooled_width + pw;
+            int input_plane_offset =
+                roi_batch_id * in_stride[0] + input_channel * in_stride[1];
+            const T* offset_input_data = input_data + input_plane_offset;
+            T out_sum = 0.;
+            bool is_empty = (hend <= hstart) || (wend <= wstart);
+            for (int ih = hstart; ih < hend; ++ih) {
+              for (int iw = wstart; iw < wend; ++iw) {
+                int input_index = ih * in_stride[2] + iw;
+                out_sum += offset_input_data[input_index];
+              }
+            }
+            T bin_area = (hend - hstart) * (wend - wstart);
+            output_data[output_index] = is_empty ? 0. : out_sum / bin_area;
+          }
+        }
+      }
+    }
+    return;
+  }
+};
+
+template <typename DeviceContext, typename T>
+class CPUPSROIPoolGradOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<framework::Tensor>("X");
+    auto* rois = ctx.Input<framework::LoDTensor>("ROIs");
+    auto* output_grad =
+        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* input_grad =
+        ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+
+    auto pooled_height = ctx.Attr<int>("pooled_height");
+    auto pooled_width = ctx.Attr<int>("pooled_width");
+    auto output_channels = ctx.Attr<int>("output_channels");
+    auto spatial_scale = ctx.Attr<float>("spatial_scale");
+
+    if (input_grad) {
+      auto in_dims = in->dims();
+      int input_channels = in_dims[1];
+      int height = in_dims[2];
+      int width = in_dims[3];
+      int rois_num = rois->dims()[0];
+
+      // set roi batch id
+      framework::Tensor rois_batch_id_list;
+      rois_batch_id_list.Resize({rois_num});
+      int* rois_batch_id_data =
+          rois_batch_id_list.mutable_data<int>(ctx.GetPlace());
+      auto rois_lod = rois->lod().back();
+      int rois_batch_size = rois_lod.size() - 1;
+      // calculate batch id index for each roi according to LoD
+      for (int n = 0; n < rois_batch_size; ++n) {
+        for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
+          rois_batch_id_data[i] = n;
+        }
+      }
+
+      const T* input_rois = rois->data<T>();
+      const T* output_grad_data = output_grad->data<T>();
+      T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
+
+      // set gradient of X to be 0. before backpropagate.
+      math::SetConstant<DeviceContext, T> set_zero;
+      set_zero(ctx.template device_context<DeviceContext>(), input_grad,
+               static_cast<T>(0));
+
+      // backpropagate gradient per output pixel
+      int output_grad_size = output_grad->numel();
+      for (int i = 0; i < output_grad_size; ++i) {
+        // The output is in order (n, c, ph, pw)
+        int pw = i % pooled_width;
+        int ph = (i / pooled_width) % pooled_height;
+        int c = (i / pooled_width / pooled_height) % output_channels;
+        int n = i / pooled_width / pooled_height / output_channels;
+
+        // set roi_batch_id
+        int roi_batch_id = rois_batch_id_data[n];
+        int input_channel = (c * pooled_height + ph) * pooled_width + pw;
+        int input_offset =
+            (roi_batch_id * input_channels + input_channel) * height * width;
+        T* offset_input_grad_data = input_grad_data + input_offset;
+
+        // [start, end) interval for spatial sampling
+        const T* offset_input_rois = input_rois + n * 4;
+        T roi_start_w =
+            static_cast<T>(round(offset_input_rois[0])) * spatial_scale;
+        T roi_start_h =
+            static_cast<T>(round(offset_input_rois[1])) * spatial_scale;
+        T roi_end_w =
+            static_cast<T>(round(offset_input_rois[2]) + 1.) * spatial_scale;
+        T roi_end_h =
+            static_cast<T>(round(offset_input_rois[3]) + 1.) * spatial_scale;
+
+        // Force too small ROIs to be 1x1
+        T roi_height = std::max(roi_end_h - roi_start_h, (T)0.1);  // avoid 0
+        T roi_width = std::max(roi_end_w - roi_start_w, (T)0.1);
+
+        // Compute w and h at input feature map
+        T bin_size_h = roi_height / static_cast<T>(pooled_height);
+        T bin_size_w = roi_width / static_cast<T>(pooled_width);
+
+        int hstart = floor(bin_size_h * static_cast<T>(ph) + roi_start_h);
+        int wstart = floor(bin_size_w * static_cast<T>(pw) + roi_start_w);
+        int hend = ceil(bin_size_h * static_cast<T>(ph + 1) + roi_start_h);
+        int wend = ceil(bin_size_w * static_cast<T>(pw + 1) + roi_start_w);
+
+        // Add roi offsets and clip to input boundaries
+        hstart = std::min(std::max(hstart, 0), height);
+        hend = std::min(std::max(hend, 0), height);
+        wstart = std::min(std::max(wstart, 0), width);
+        wend = std::min(std::max(wend, 0), width);
+        bool is_empty = (hend <= hstart) || (wend <= wstart);
+
+        // Accumulate diff_val into input data
+        T bin_area = static_cast<T>((hend - hstart) * (wend - wstart));
+        T diff_val = is_empty ? 0. : output_grad_data[i] / bin_area;
+        for (int ih = hstart; ih < hend; ++ih) {
+          for (int iw = wstart; iw < wend; ++iw) {
+            int input_index = ih * width + iw;
+            offset_input_grad_data[input_index] += diff_val;
+          }
+        }
+      }
+    }
+    return;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/reader/ctr_reader.h b/paddle/fluid/operators/reader/ctr_reader.h
index 9b2a11bae12d242880829628faa089e1638424b0..7fc07efe7304701794595c9fa63f4a306d61e230 100644
--- a/paddle/fluid/operators/reader/ctr_reader.h
+++ b/paddle/fluid/operators/reader/ctr_reader.h
@@ -16,6 +16,7 @@
 
 #include <sys/time.h>
 
+#include <algorithm>
 #include <chrono>  // NOLINT
 #include <cstdlib>
 #include <fstream>
@@ -55,8 +56,7 @@ class CTRReader : public framework::FileReader {
     PADDLE_ENFORCE_GT(thread_num, 0, "thread num should be larger then 0!");
     PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null");
     PADDLE_ENFORCE_GT(file_list.size(), 0, "file list should not be empty");
-    thread_num_ =
-        file_list_.size() > thread_num ? thread_num : file_list_.size();
+    thread_num_ = std::min<size_t>(file_list_.size(), thread_num);
     queue_ = queue;
     SplitFiles();
     for (size_t i = 0; i < thread_num_; ++i) {
@@ -95,10 +95,10 @@ class CTRReader : public framework::FileReader {
     queue_->ReOpen();
     VLOG(3) << "reopen success";
     VLOG(3) << "thread_num " << thread_num_;
-    for (int thread_id = 0; thread_id < thread_num_; thread_id++) {
-      read_threads_.emplace_back(new std::thread(
-          std::bind(&ReadThread, file_groups_[thread_id], slots_, batch_size_,
-                    thread_id, &read_thread_status_, queue_)));
+    for (size_t thread_id = 0; thread_id < thread_num_; thread_id++) {
+      read_threads_.emplace_back(new std::thread(std::bind(
+          &ReadThread, file_groups_[thread_id], slots_, batch_size_,
+          static_cast<int>(thread_id), &read_thread_status_, queue_)));
     }
     monitor_thread_.reset(new std::thread(
         std::bind(&MonitorThread, &read_thread_status_, queue_)));
diff --git a/paddle/fluid/operators/sequence_ops/sequence_mask_op.h b/paddle/fluid/operators/sequence_ops/sequence_mask_op.h
index 18acb735cecabd1e01f7821c880fd8ed5e52971f..8fceed3558b4357b7863368c18add329ea9922b3 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_mask_op.h
+++ b/paddle/fluid/operators/sequence_ops/sequence_mask_op.h
@@ -36,12 +36,10 @@ class SequenceMaskOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must exist");
     PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) must exist");
 
-    auto maxlen = ctx->Attrs().Get<int>("maxlen");
-    if (maxlen > 0) {  // We can only infershape when maxlen > 0
-      auto dim = framework::vectorize2int(ctx->GetInputDim("X"));
-      dim.push_back(maxlen);
-      ctx->SetOutputDim("Y", framework::make_ddim(dim));
-    }
+    int maxlen = ctx->Attrs().Get<int>("maxlen");
+    auto dim = framework::vectorize2int(ctx->GetInputDim("X"));
+    dim.push_back(maxlen > 0 ? maxlen : -1);
+    ctx->SetOutputDim("Y", framework::make_ddim(dim));
   }
 };
 
diff --git a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cu b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cu
index 63cd47a38a0ff6413c430c6be6284c5f4bfc2595..4897474a485d8417854ffb53aa8ee64321c78ae7 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cu
+++ b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cu
@@ -11,9 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
-#define EIGEN_USE_GPU
-
 #include "paddle/fluid/operators/sequence_ops/sequence_pool_op.h"
 
 namespace ops = paddle::operators;
diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
index 193de05422bb78572c0e5eaf4cd46744c3bcb113..14746fa95159d707be7c10c69a4ffc2211e17a93 100644
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
@@ -18,6 +18,7 @@ namespace paddle {
 namespace operators {
 
 using framework::Tensor;
+const int kIgnoreIndex = -100;
 
 class SigmoidCrossEntropyWithLogitsOp : public framework::OperatorWithKernel {
  public:
@@ -100,6 +101,11 @@ class SigmoidCrossEntropyWithLogitsOpMaker
     AddOutput("Out",
               "(Tensor, default Tensor<float>), a 2-D tensor with shape N x D "
               " of elementwise logistic losses.");
+    AddAttr<int>("ignore_index",
+                 "(int, default kIgnoreIndex), Specifies a target value that "
+                 "is ignored and"
+                 "does not contribute to the input gradient.")
+        .SetDefault(kIgnoreIndex);
     AddComment(R"DOC(
 SigmoidCrossEntropyWithLogits Operator.
 
diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu
index 9aadac1a416034a3510dea2916d7577efbc2f8c2..a1fbc7e5fab71df486b53c31464c99e9c4557ccd 100644
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h"
 
 namespace ops = paddle::operators;
diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h
index faef72866eb491887bbf221d32a8121b21fc3c66..b8731c232753074fa9e76b028485d3598c9a7295 100644
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h
@@ -15,33 +15,72 @@ limitations under the License. */
 #pragma once
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/legacy/utils/Logging.h"
 
 namespace paddle {
 namespace operators {
 
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
+template <typename T>
+struct SigmoidCrossEntropyWithLogitsForward {
+  HOSTDEVICE SigmoidCrossEntropyWithLogitsForward(const int &ignore_index)
+      : ignore_index(ignore_index) {}
+
+  HOSTDEVICE T operator()(const T &x, const T &label) const {
+    if (static_cast<int>(label) == ignore_index) {
+      return static_cast<T>(0.);
+    }
+    T term1 = (x > 0) ? x : 0;
+    T term2 = x * label;
+    T term3 = std::log(static_cast<T>(1) + std::exp(-(std::abs(x))));
+    return term1 - term2 + term3;
+  }
+
+  int ignore_index;
+};
+
+template <typename T>
+struct SigmoidCrossEntropyWithLogitsBackward {
+  HOSTDEVICE SigmoidCrossEntropyWithLogitsBackward(const int &ignore_index)
+      : ignore_index(ignore_index) {}
+
+  HOSTDEVICE T operator()(const T &x, const T &label) const {
+    if (static_cast<int>(label) == ignore_index) {
+      return static_cast<T>(0.);
+    }
+    T simoid_x = static_cast<T>(1) / (static_cast<T>(1) + std::exp(-x));
+    return simoid_x - label;
+  }
+
+  int ignore_index;
+};
+
 // Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X)))
 template <typename DeviceContext, typename T>
 class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &context) const override {
-    const framework::Tensor *X = context.Input<framework::Tensor>("X");
-    const framework::Tensor *Labels = context.Input<framework::Tensor>("Label");
-    framework::Tensor *Out = context.Output<framework::Tensor>("Out");
+    const Tensor *X = context.Input<Tensor>("X");
+    const Tensor *Labels = context.Input<Tensor>("Label");
+    Tensor *Out = context.Output<Tensor>("Out");
     Out->mutable_data<T>(context.GetPlace());
+    int ignore_index = context.Attr<int>("ignore_index");
 
-    auto x = framework::EigenVector<T>::Flatten(*X);
-    auto labels = framework::EigenVector<T>::Flatten(*Labels);
-    auto out = framework::EigenVector<T>::Flatten(*Out);
+    auto x = EigenVector<T>::Flatten(*X);
+    auto labels = EigenVector<T>::Flatten(*Labels);
+    auto out = EigenVector<T>::Flatten(*Out);
     auto &place = *context.device_context<DeviceContext>().eigen_device();
 
-    // term1 = max(x, 0)
-    auto term1 = x.cwiseMax(static_cast<T>(0));
-    // term2 = x * labels
-    auto term2 = x * labels;
-    // term3 = log(1 + exp(-abs(x)))
-    auto term3 = (static_cast<T>(1) + (-(x.abs())).exp()).log();
-
-    out.device(place) = term1 - term2 + term3;
+    out.device(place) = x.binaryExpr(
+        labels, SigmoidCrossEntropyWithLogitsForward<T>(ignore_index));
   }
 };
 
@@ -50,23 +89,23 @@ template <typename DeviceContext, typename T>
 class SigmoidCrossEntropyWithLogitsGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &context) const override {
-    const framework::Tensor *X = context.Input<framework::Tensor>("X");
-    const framework::Tensor *Labels = context.Input<framework::Tensor>("Label");
-    const framework::Tensor *dOut =
-        context.Input<framework::Tensor>(framework::GradVarName("Out"));
-    framework::Tensor *dX =
-        context.Output<framework::Tensor>(framework::GradVarName("X"));
+    const Tensor *X = context.Input<Tensor>("X");
+    const Tensor *Labels = context.Input<Tensor>("Label");
+    const Tensor *dOut = context.Input<Tensor>(framework::GradVarName("Out"));
+    Tensor *dX = context.Output<Tensor>(framework::GradVarName("X"));
     dX->mutable_data<T>(context.GetPlace());
 
-    auto x = framework::EigenVector<T>::Flatten(*X);
-    auto labels = framework::EigenVector<T>::Flatten(*Labels);
-    auto dout = framework::EigenVector<T>::Flatten(*dOut);
-    auto dx = framework::EigenVector<T>::Flatten(*dX);
+    auto ignore_index = context.Attr<int>("ignore_index");
+    auto x = EigenVector<T>::Flatten(*X);
+    auto labels = EigenVector<T>::Flatten(*Labels);
+    auto dout = EigenVector<T>::Flatten(*dOut);
+    auto dx = EigenVector<T>::Flatten(*dX);
     auto &place =
         *context.template device_context<DeviceContext>().eigen_device();
 
-    auto sigmoid_x = static_cast<T>(1) / (static_cast<T>(1) + (-x).exp());
-    dx.device(place) = dout * (sigmoid_x - labels);
+    auto diff = x.binaryExpr(labels, SigmoidCrossEntropyWithLogitsBackward<T>(
+                                         static_cast<int>(ignore_index)));
+    dx.device(place) = dout * diff;
   }
 };
 
diff --git a/paddle/fluid/operators/smooth_l1_loss_op.cu b/paddle/fluid/operators/smooth_l1_loss_op.cu
index dfbb5c905884b57413587a4f6c33b0238b740c73..e5df479090fabe926f65f58e2300e3ee2027e54d 100644
--- a/paddle/fluid/operators/smooth_l1_loss_op.cu
+++ b/paddle/fluid/operators/smooth_l1_loss_op.cu
@@ -11,9 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
-#define EIGEN_USE_GPU
-
 #include "paddle/fluid/operators/smooth_l1_loss_op.h"
 
 namespace ops = paddle::operators;
diff --git a/paddle/fluid/operators/softmax_mkldnn_op.cc b/paddle/fluid/operators/softmax_mkldnn_op.cc
index 01819f53e3ab0973f6140c5a81f18f954b6a0376..d2b149535426d097fea4b8fffa9efe82bd6edc64 100644
--- a/paddle/fluid/operators/softmax_mkldnn_op.cc
+++ b/paddle/fluid/operators/softmax_mkldnn_op.cc
@@ -15,7 +15,7 @@ limitations under the License. */
 #include <iostream>
 #include "mkldnn.hpp"
 #include "paddle/fluid/operators/softmax_op.h"
-#include "paddle/fluid/platform/mkldnn_helper.h"
+#include "paddle/fluid/platform/mkldnn_reuse.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/softmax_op.h b/paddle/fluid/operators/softmax_op.h
index 8eb5c7691efe930e9f79ad6a381cb290107d1a14..91829d5761bfdd1f9806af6589a2967fe866fec8 100644
--- a/paddle/fluid/operators/softmax_op.h
+++ b/paddle/fluid/operators/softmax_op.h
@@ -36,9 +36,7 @@ class SoftmaxKernel : public framework::OpKernel<T> {
     Tensor Out_2d = framework::ReshapeToMatrix(*Out, rank - 1);
 
 #ifdef PADDLE_ON_INFERENCE
-    math::SoftmaxFunctor<
-        DeviceContext, T,
-        std::is_same<DeviceContext, platform::CPUDeviceContext>::value>()(
+    math::SoftmaxFunctor<DeviceContext, T, true>()(
         context.template device_context<DeviceContext>(), &X_2d, &Out_2d);
 #else
     math::SoftmaxFunctor<DeviceContext, T, false>()(
diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
index 6d48796191dd13a45f0c7267bfaf05489f528a9d..cee3e87037e0f1439a08b7b275eedefe357a4b13 100644
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
@@ -11,9 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
-#define EIGEN_USE_GPU
-
 #include <cub/cub.cuh>
 #include "paddle/fluid/operators/math/cross_entropy.h"
 #include "paddle/fluid/operators/softmax_with_cross_entropy_op.h"
diff --git a/paddle/fluid/operators/split_selected_rows_op.h b/paddle/fluid/operators/split_selected_rows_op.h
index af64607fafc6544047714e731846a2440be219b8..1fef2b3d378c96d087118d0136885e7e29aa237c 100644
--- a/paddle/fluid/operators/split_selected_rows_op.h
+++ b/paddle/fluid/operators/split_selected_rows_op.h
@@ -72,10 +72,11 @@ class SplitSelectedRowsOpKernel : public framework::OpKernel<T> {
     for (size_t i = 0; i < outs_rows_idx.size(); ++i) {
       auto rows_idx = outs_rows_idx[i];
       outs[i]->set_height(height_sections[i]);
+      auto dims = x->GetCompleteDims();
+      dims[0] = rows_idx.size();
+      outs[i]->mutable_value()->mutable_data<T>(dims, x->place());
+      outs[i]->mutable_rows()->clear();
       if (rows_idx.size() > 0) {
-        auto dims = x->GetCompleteDims();
-        dims[0] = rows_idx.size();
-        outs[i]->mutable_value()->mutable_data<T>(dims, x->place());
         for (auto idx : rows_idx) {
           outs[i]->mutable_rows()->push_back(idx - abs_sections[i]);
         }
@@ -98,6 +99,8 @@ class SplitSelectedRowsOpKernel : public framework::OpKernel<T> {
           }
         }
       }
+      PADDLE_ENFORCE_EQ(rows_idx.size(), outs[i]->rows().size(),
+                        "rows should has the same size with tensor dim 0");
     }
   }
 };
diff --git a/paddle/fluid/operators/squared_l2_distance_op.cu b/paddle/fluid/operators/squared_l2_distance_op.cu
index 3e80ae8dd22077c0f9bbdedc24e84f6c339c5a26..c9264da838246efded7d9f85664faf0dc1cec282 100644
--- a/paddle/fluid/operators/squared_l2_distance_op.cu
+++ b/paddle/fluid/operators/squared_l2_distance_op.cu
@@ -11,9 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
-#define EIGEN_USE_GPU
-
 #include "paddle/fluid/operators/squared_l2_distance_op.h"
 
 namespace ops = paddle::operators;
diff --git a/paddle/fluid/operators/squared_l2_norm_op.cu b/paddle/fluid/operators/squared_l2_norm_op.cu
index 87830413da3f141f01a97966ae0e2b0501ed600a..e31cfeb78ab8a8d1b55a198fe7a2c647a3dce665 100644
--- a/paddle/fluid/operators/squared_l2_norm_op.cu
+++ b/paddle/fluid/operators/squared_l2_norm_op.cu
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/squared_l2_norm_op.h"
 
 namespace ops = paddle::operators;
diff --git a/paddle/fluid/operators/sum_op.cu b/paddle/fluid/operators/sum_op.cu
index db4c2d6c115f04b436db00854ca4b02fea09866b..6125ed07b6d0f92fa317c581a06117dcfa7359ae 100644
--- a/paddle/fluid/operators/sum_op.cu
+++ b/paddle/fluid/operators/sum_op.cu
@@ -8,8 +8,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/sum_op.h"
 #include "paddle/fluid/platform/float16.h"
 
diff --git a/paddle/fluid/operators/yolov3_loss_op.cc b/paddle/fluid/operators/yolov3_loss_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e7597f732430038a4a180297e730340d1bc47b8c
--- /dev/null
+++ b/paddle/fluid/operators/yolov3_loss_op.cc
@@ -0,0 +1,221 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/fluid/operators/yolov3_loss_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class Yolov3LossOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of Yolov3LossOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("GTBox"),
+                   "Input(GTBox) of Yolov3LossOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("GTLabel"),
+                   "Input(GTLabel) of Yolov3LossOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Loss"),
+                   "Output(Loss) of Yolov3LossOp should not be null.");
+
+    auto dim_x = ctx->GetInputDim("X");
+    auto dim_gtbox = ctx->GetInputDim("GTBox");
+    auto dim_gtlabel = ctx->GetInputDim("GTLabel");
+    auto anchors = ctx->Attrs().Get<std::vector<int>>("anchors");
+    auto class_num = ctx->Attrs().Get<int>("class_num");
+    PADDLE_ENFORCE_EQ(dim_x.size(), 4, "Input(X) should be a 4-D tensor.");
+    PADDLE_ENFORCE_EQ(dim_x[2], dim_x[3],
+                      "Input(X) dim[3] and dim[4] should be euqal.");
+    PADDLE_ENFORCE_EQ(dim_x[1], anchors.size() / 2 * (5 + class_num),
+                      "Input(X) dim[1] should be equal to (anchor_number * (5 "
+                      "+ class_num)).");
+    PADDLE_ENFORCE_EQ(dim_gtbox.size(), 3,
+                      "Input(GTBox) should be a 3-D tensor");
+    PADDLE_ENFORCE_EQ(dim_gtbox[2], 4, "Input(GTBox) dim[2] should be 5");
+    PADDLE_ENFORCE_EQ(dim_gtlabel.size(), 2,
+                      "Input(GTBox) should be a 2-D tensor");
+    PADDLE_ENFORCE_EQ(dim_gtlabel[0], dim_gtbox[0],
+                      "Input(GTBox) and Input(GTLabel) dim[0] should be same");
+    PADDLE_ENFORCE_EQ(dim_gtlabel[1], dim_gtbox[1],
+                      "Input(GTBox) and Input(GTLabel) dim[1] should be same");
+    PADDLE_ENFORCE_GT(anchors.size(), 0,
+                      "Attr(anchors) length should be greater then 0.");
+    PADDLE_ENFORCE_EQ(anchors.size() % 2, 0,
+                      "Attr(anchors) length should be even integer.");
+    PADDLE_ENFORCE_GT(class_num, 0,
+                      "Attr(class_num) should be an integer greater then 0.");
+
+    std::vector<int64_t> dim_out({1});
+    ctx->SetOutputDim("Loss", framework::make_ddim(dim_out));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
+        platform::CPUPlace());
+  }
+};
+
+class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "The input tensor of YOLO v3 loss operator, "
+             "This is a 4-D tensor with shape of [N, C, H, W]."
+             "H and W should be same, and the second dimention(C) stores"
+             "box locations, confidence score and classification one-hot"
+             "key of each anchor box");
+    AddInput("GTBox",
+             "The input tensor of ground truth boxes, "
+             "This is a 3-D tensor with shape of [N, max_box_num, 5], "
+             "max_box_num is the max number of boxes in each image, "
+             "In the third dimention, stores x, y, w, h coordinates, "
+             "x, y is the center cordinate of boxes and w, h is the "
+             "width and height and x, y, w, h should be divided by "
+             "input image height to scale to [0, 1].");
+    AddInput("GTLabel",
+             "The input tensor of ground truth label, "
+             "This is a 2-D tensor with shape of [N, max_box_num], "
+             "and each element shoudl be an integer to indicate the "
+             "box class id.");
+    AddOutput("Loss",
+              "The output yolov3 loss tensor, "
+              "This is a 1-D tensor with shape of [1]");
+
+    AddAttr<int>("class_num", "The number of classes to predict.");
+    AddAttr<std::vector<int>>("anchors",
+                              "The anchor width and height, "
+                              "it will be parsed pair by pair.");
+    AddAttr<float>("ignore_thresh",
+                   "The ignore threshold to ignore confidence loss.");
+    AddAttr<float>("loss_weight_xy", "The weight of x, y location loss.")
+        .SetDefault(1.0);
+    AddAttr<float>("loss_weight_wh", "The weight of w, h location loss.")
+        .SetDefault(1.0);
+    AddAttr<float>(
+        "loss_weight_conf_target",
+        "The weight of confidence score loss in locations with target object.")
+        .SetDefault(1.0);
+    AddAttr<float>("loss_weight_conf_notarget",
+                   "The weight of confidence score loss in locations without "
+                   "target object.")
+        .SetDefault(1.0);
+    AddAttr<float>("loss_weight_class", "The weight of classification loss.")
+        .SetDefault(1.0);
+    AddComment(R"DOC(
+         This operator generate yolov3 loss by given predict result and ground
+         truth boxes.
+         
+         The output of previous network is in shape [N, C, H, W], while H and W
+         should be the same, specify the grid size, each grid point predict given
+         number boxes, this given number is specified by anchors, it should be 
+         half anchors length, which following will be represented as S. In the 
+         second dimention(the channel dimention), C should be S * (class_num + 5),
+         class_num is the box categoriy number of source dataset(such as coco), 
+         so in the second dimention, stores 4 box location coordinates x, y, w, h 
+         and confidence score of the box and class one-hot key of each anchor box.
+
+         While the 4 location coordinates if $$tx, ty, tw, th$$, the box predictions
+         correspnd to:
+
+         $$
+         b_x = \sigma(t_x) + c_x
+         b_y = \sigma(t_y) + c_y
+         b_w = p_w e^{t_w}
+         b_h = p_h e^{t_h}
+         $$
+
+         While $$c_x, c_y$$ is the left top corner of current grid and $$p_w, p_h$$
+         is specified by anchors.
+
+         As for confidence score, it is the logistic regression value of IoU between
+         anchor boxes and ground truth boxes, the score of the anchor box which has 
+         the max IoU should be 1, and if the anchor box has IoU bigger then ignore 
+         thresh, the confidence score loss of this anchor box will be ignored.
+
+         Therefore, the yolov3 loss consist of three major parts, box location loss,
+         confidence score loss, and classification loss. The MSE loss is used for 
+         box location, and binary cross entropy loss is used for confidence score 
+         loss and classification loss.
+
+         Final loss will be represented as follow.
+
+         $$
+         loss = \loss_weight_{xy} * loss_{xy} + \loss_weight_{wh} * loss_{wh}
+              + \loss_weight_{conf_target} * loss_{conf_target}
+              + \loss_weight_{conf_notarget} * loss_{conf_notarget}
+              + \loss_weight_{class} * loss_{class}
+         $$
+         )DOC");
+  }
+};
+
+class Yolov3LossOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Loss")),
+                   "Input(Loss@GRAD) should not be null");
+    auto dim_x = ctx->GetInputDim("X");
+    if (ctx->HasOutput(framework::GradVarName("X"))) {
+      ctx->SetOutputDim(framework::GradVarName("X"), dim_x);
+    }
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
+        platform::CPUPlace());
+  }
+};
+
+class Yolov3LossGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto* op = new framework::OpDesc();
+    op->SetType("yolov3_loss_grad");
+    op->SetInput("X", Input("X"));
+    op->SetInput("GTBox", Input("GTBox"));
+    op->SetInput("GTLabel", Input("GTLabel"));
+    op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss"));
+
+    op->SetAttrMap(Attrs());
+
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetOutput(framework::GradVarName("GTBox"), {});
+    op->SetOutput(framework::GradVarName("GTLabel"), {});
+    return std::unique_ptr<framework::OpDesc>(op);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(yolov3_loss, ops::Yolov3LossOp, ops::Yolov3LossOpMaker,
+                  ops::Yolov3LossGradMaker);
+REGISTER_OPERATOR(yolov3_loss_grad, ops::Yolov3LossOpGrad);
+REGISTER_OP_CPU_KERNEL(yolov3_loss, ops::Yolov3LossKernel<float>,
+                       ops::Yolov3LossKernel<double>);
+REGISTER_OP_CPU_KERNEL(yolov3_loss_grad, ops::Yolov3LossGradKernel<float>,
+                       ops::Yolov3LossGradKernel<double>);
diff --git a/paddle/fluid/operators/yolov3_loss_op.h b/paddle/fluid/operators/yolov3_loss_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..0bb285722ddedf721d98237760ec9868e2134442
--- /dev/null
+++ b/paddle/fluid/operators/yolov3_loss_op.h
@@ -0,0 +1,483 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include <algorithm>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, size_t D, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
+using Array5 = Eigen::DSizes<int64_t, 5>;
+
+template <typename T>
+static inline bool isZero(T x) {
+  return fabs(x) < 1e-6;
+}
+
+template <typename T>
+static inline T sigmoid(T x) {
+  return 1.0 / (exp(-1.0 * x) + 1.0);
+}
+
+template <typename T>
+static inline T CalcMaskPointNum(const Tensor& mask) {
+  auto mask_t = EigenVector<int>::Flatten(mask);
+  T count = 0.0;
+  for (int i = 0; i < mask_t.dimensions()[0]; i++) {
+    if (mask_t(i)) {
+      count += 1.0;
+    }
+  }
+  return count;
+}
+
+template <typename T>
+static inline T CalcMSEWithMask(const Tensor& x, const Tensor& y,
+                                const Tensor& mask) {
+  auto x_t = EigenVector<T>::Flatten(x);
+  auto y_t = EigenVector<T>::Flatten(y);
+  auto mask_t = EigenVector<int>::Flatten(mask);
+
+  T error_sum = 0.0;
+  T points = 0.0;
+  for (int i = 0; i < x_t.dimensions()[0]; i++) {
+    if (mask_t(i)) {
+      error_sum += pow(x_t(i) - y_t(i), 2);
+      points += 1;
+    }
+  }
+  return (error_sum / points);
+}
+
+template <typename T>
+static void CalcMSEGradWithMask(Tensor* grad, const Tensor& x, const Tensor& y,
+                                const Tensor& mask, T mf) {
+  auto grad_t = EigenVector<T>::Flatten(*grad).setConstant(0.0);
+  auto x_t = EigenVector<T>::Flatten(x);
+  auto y_t = EigenVector<T>::Flatten(y);
+  auto mask_t = EigenVector<int>::Flatten(mask);
+
+  for (int i = 0; i < x_t.dimensions()[0]; i++) {
+    if (mask_t(i)) {
+      grad_t(i) = 2.0 * (x_t(i) - y_t(i)) / mf;
+    }
+  }
+}
+
+template <typename T>
+static inline T CalcBCEWithMask(const Tensor& x, const Tensor& y,
+                                const Tensor& mask) {
+  auto x_t = EigenVector<T>::Flatten(x);
+  auto y_t = EigenVector<T>::Flatten(y);
+  auto mask_t = EigenVector<int>::Flatten(mask);
+
+  T error_sum = 0.0;
+  T points = 0.0;
+  for (int i = 0; i < x_t.dimensions()[0]; i++) {
+    if (mask_t(i)) {
+      error_sum +=
+          -1.0 * (y_t(i) * log(x_t(i)) + (1.0 - y_t(i)) * log(1.0 - x_t(i)));
+      points += 1;
+    }
+  }
+  return (error_sum / points);
+}
+
+template <typename T>
+static inline void CalcBCEGradWithMask(Tensor* grad, const Tensor& x,
+                                       const Tensor& y, const Tensor& mask,
+                                       T mf) {
+  auto grad_t = EigenVector<T>::Flatten(*grad).setConstant(0.0);
+  auto x_t = EigenVector<T>::Flatten(x);
+  auto y_t = EigenVector<T>::Flatten(y);
+  auto mask_t = EigenVector<int>::Flatten(mask);
+
+  for (int i = 0; i < x_t.dimensions()[0]; i++) {
+    if (mask_t(i)) {
+      grad_t(i) = ((1.0 - y_t(i)) / (1.0 - x_t(i)) - y_t(i) / x_t(i)) / mf;
+    }
+  }
+}
+
+template <typename T>
+static void CalcPredResult(const Tensor& input, Tensor* pred_conf,
+                           Tensor* pred_class, Tensor* pred_x, Tensor* pred_y,
+                           Tensor* pred_w, Tensor* pred_h, const int anchor_num,
+                           const int class_num) {
+  const int n = input.dims()[0];
+  const int h = input.dims()[2];
+  const int w = input.dims()[3];
+  const int box_attr_num = 5 + class_num;
+
+  auto input_t = EigenTensor<T, 4>::From(input);
+  auto pred_conf_t = EigenTensor<T, 4>::From(*pred_conf);
+  auto pred_class_t = EigenTensor<T, 5>::From(*pred_class);
+  auto pred_x_t = EigenTensor<T, 4>::From(*pred_x);
+  auto pred_y_t = EigenTensor<T, 4>::From(*pred_y);
+  auto pred_w_t = EigenTensor<T, 4>::From(*pred_w);
+  auto pred_h_t = EigenTensor<T, 4>::From(*pred_h);
+
+  for (int i = 0; i < n; i++) {
+    for (int an_idx = 0; an_idx < anchor_num; an_idx++) {
+      for (int j = 0; j < h; j++) {
+        for (int k = 0; k < w; k++) {
+          pred_x_t(i, an_idx, j, k) =
+              sigmoid(input_t(i, box_attr_num * an_idx, j, k));
+          pred_y_t(i, an_idx, j, k) =
+              sigmoid(input_t(i, box_attr_num * an_idx + 1, j, k));
+          pred_w_t(i, an_idx, j, k) =
+              input_t(i, box_attr_num * an_idx + 2, j, k);
+          pred_h_t(i, an_idx, j, k) =
+              input_t(i, box_attr_num * an_idx + 3, j, k);
+
+          pred_conf_t(i, an_idx, j, k) =
+              sigmoid(input_t(i, box_attr_num * an_idx + 4, j, k));
+
+          for (int c = 0; c < class_num; c++) {
+            pred_class_t(i, an_idx, j, k, c) =
+                sigmoid(input_t(i, box_attr_num * an_idx + 5 + c, j, k));
+          }
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+static T CalcBoxIoU(std::vector<T> box1, std::vector<T> box2) {
+  T b1_x1 = box1[0] - box1[2] / 2;
+  T b1_x2 = box1[0] + box1[2] / 2;
+  T b1_y1 = box1[1] - box1[3] / 2;
+  T b1_y2 = box1[1] + box1[3] / 2;
+  T b2_x1 = box2[0] - box2[2] / 2;
+  T b2_x2 = box2[0] + box2[2] / 2;
+  T b2_y1 = box2[1] - box2[3] / 2;
+  T b2_y2 = box2[1] + box2[3] / 2;
+
+  T b1_area = (b1_x2 - b1_x1) * (b1_y2 - b1_y1);
+  T b2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1);
+
+  T inter_rect_x1 = std::max(b1_x1, b2_x1);
+  T inter_rect_y1 = std::max(b1_y1, b2_y1);
+  T inter_rect_x2 = std::min(b1_x2, b2_x2);
+  T inter_rect_y2 = std::min(b1_y2, b2_y2);
+  T inter_area = std::max(inter_rect_x2 - inter_rect_x1, static_cast<T>(0.0)) *
+                 std::max(inter_rect_y2 - inter_rect_y1, static_cast<T>(0.0));
+
+  return inter_area / (b1_area + b2_area - inter_area);
+}
+
+template <typename T>
+static void PreProcessGTBox(const Tensor& gt_box, const Tensor& gt_label,
+                            const float ignore_thresh, std::vector<int> anchors,
+                            const int grid_size, Tensor* obj_mask,
+                            Tensor* noobj_mask, Tensor* tx, Tensor* ty,
+                            Tensor* tw, Tensor* th, Tensor* tconf,
+                            Tensor* tclass) {
+  const int n = gt_box.dims()[0];
+  const int b = gt_box.dims()[1];
+  const int anchor_num = anchors.size() / 2;
+  auto gt_box_t = EigenTensor<T, 3>::From(gt_box);
+  auto gt_label_t = EigenTensor<int, 2>::From(gt_label);
+  auto obj_mask_t = EigenTensor<int, 4>::From(*obj_mask).setConstant(0);
+  auto noobj_mask_t = EigenTensor<int, 4>::From(*noobj_mask).setConstant(1);
+  auto tx_t = EigenTensor<T, 4>::From(*tx).setConstant(0.0);
+  auto ty_t = EigenTensor<T, 4>::From(*ty).setConstant(0.0);
+  auto tw_t = EigenTensor<T, 4>::From(*tw).setConstant(0.0);
+  auto th_t = EigenTensor<T, 4>::From(*th).setConstant(0.0);
+  auto tconf_t = EigenTensor<T, 4>::From(*tconf).setConstant(0.0);
+  auto tclass_t = EigenTensor<T, 5>::From(*tclass).setConstant(0.0);
+
+  for (int i = 0; i < n; i++) {
+    for (int j = 0; j < b; j++) {
+      if (isZero<T>(gt_box_t(i, j, 0)) && isZero<T>(gt_box_t(i, j, 1)) &&
+          isZero<T>(gt_box_t(i, j, 2)) && isZero<T>(gt_box_t(i, j, 3))) {
+        continue;
+      }
+
+      int cur_label = gt_label_t(i, j);
+      T gx = gt_box_t(i, j, 0) * grid_size;
+      T gy = gt_box_t(i, j, 1) * grid_size;
+      T gw = gt_box_t(i, j, 2) * grid_size;
+      T gh = gt_box_t(i, j, 3) * grid_size;
+      int gi = static_cast<int>(gx);
+      int gj = static_cast<int>(gy);
+
+      T max_iou = static_cast<T>(0);
+      T iou;
+      int best_an_index = -1;
+      std::vector<T> gt_box_shape({0, 0, gw, gh});
+      for (int an_idx = 0; an_idx < anchor_num; an_idx++) {
+        std::vector<T> anchor_shape({0, 0, static_cast<T>(anchors[2 * an_idx]),
+                                     static_cast<T>(anchors[2 * an_idx + 1])});
+        iou = CalcBoxIoU<T>(gt_box_shape, anchor_shape);
+        if (iou > max_iou) {
+          max_iou = iou;
+          best_an_index = an_idx;
+        }
+        if (iou > ignore_thresh) {
+          noobj_mask_t(i, an_idx, gj, gi) = 0;
+        }
+      }
+      obj_mask_t(i, best_an_index, gj, gi) = 1;
+      noobj_mask_t(i, best_an_index, gj, gi) = 0;
+      tx_t(i, best_an_index, gj, gi) = gx - gi;
+      ty_t(i, best_an_index, gj, gi) = gy - gj;
+      tw_t(i, best_an_index, gj, gi) = log(gw / anchors[2 * best_an_index]);
+      th_t(i, best_an_index, gj, gi) = log(gh / anchors[2 * best_an_index + 1]);
+      tclass_t(i, best_an_index, gj, gi, cur_label) = 1;
+      tconf_t(i, best_an_index, gj, gi) = 1;
+    }
+  }
+}
+
+static void ExpandObjMaskByClassNum(Tensor* obj_mask_expand,
+                                    const Tensor& obj_mask) {
+  const int n = obj_mask_expand->dims()[0];
+  const int an_num = obj_mask_expand->dims()[1];
+  const int h = obj_mask_expand->dims()[2];
+  const int w = obj_mask_expand->dims()[3];
+  const int class_num = obj_mask_expand->dims()[4];
+  auto obj_mask_expand_t = EigenTensor<int, 5>::From(*obj_mask_expand);
+  auto obj_mask_t = EigenTensor<int, 4>::From(obj_mask);
+
+  obj_mask_expand_t = obj_mask_t.reshape(Array5(n, an_num, h, w, 1))
+                          .broadcast(Array5(1, 1, 1, 1, class_num));
+}
+
+template <typename T>
+static void AddAllGradToInputGrad(
+    Tensor* grad, T loss, const Tensor& pred_x, const Tensor& pred_y,
+    const Tensor& pred_conf, const Tensor& pred_class, const Tensor& grad_x,
+    const Tensor& grad_y, const Tensor& grad_w, const Tensor& grad_h,
+    const Tensor& grad_conf_target, const Tensor& grad_conf_notarget,
+    const Tensor& grad_class, const int class_num, const float loss_weight_xy,
+    const float loss_weight_wh, const float loss_weight_conf_target,
+    const float loss_weight_conf_notarget, const float loss_weight_class) {
+  const int n = pred_x.dims()[0];
+  const int an_num = pred_x.dims()[1];
+  const int h = pred_x.dims()[2];
+  const int w = pred_x.dims()[3];
+  const int attr_num = class_num + 5;
+  auto grad_t = EigenTensor<T, 4>::From(*grad).setConstant(0.0);
+  auto pred_x_t = EigenTensor<T, 4>::From(pred_x);
+  auto pred_y_t = EigenTensor<T, 4>::From(pred_y);
+  auto pred_conf_t = EigenTensor<T, 4>::From(pred_conf);
+  auto pred_class_t = EigenTensor<T, 5>::From(pred_class);
+  auto grad_x_t = EigenTensor<T, 4>::From(grad_x);
+  auto grad_y_t = EigenTensor<T, 4>::From(grad_y);
+  auto grad_w_t = EigenTensor<T, 4>::From(grad_w);
+  auto grad_h_t = EigenTensor<T, 4>::From(grad_h);
+  auto grad_conf_target_t = EigenTensor<T, 4>::From(grad_conf_target);
+  auto grad_conf_notarget_t = EigenTensor<T, 4>::From(grad_conf_notarget);
+  auto grad_class_t = EigenTensor<T, 5>::From(grad_class);
+
+  for (int i = 0; i < n; i++) {
+    for (int j = 0; j < an_num; j++) {
+      for (int k = 0; k < h; k++) {
+        for (int l = 0; l < w; l++) {
+          grad_t(i, j * attr_num, k, l) =
+              grad_x_t(i, j, k, l) * pred_x_t(i, j, k, l) *
+              (1.0 - pred_x_t(i, j, k, l)) * loss * loss_weight_xy;
+          grad_t(i, j * attr_num + 1, k, l) =
+              grad_y_t(i, j, k, l) * pred_y_t(i, j, k, l) *
+              (1.0 - pred_y_t(i, j, k, l)) * loss * loss_weight_xy;
+          grad_t(i, j * attr_num + 2, k, l) =
+              grad_w_t(i, j, k, l) * loss * loss_weight_wh;
+          grad_t(i, j * attr_num + 3, k, l) =
+              grad_h_t(i, j, k, l) * loss * loss_weight_wh;
+          grad_t(i, j * attr_num + 4, k, l) =
+              grad_conf_target_t(i, j, k, l) * pred_conf_t(i, j, k, l) *
+              (1.0 - pred_conf_t(i, j, k, l)) * loss * loss_weight_conf_target;
+          grad_t(i, j * attr_num + 4, k, l) +=
+              grad_conf_notarget_t(i, j, k, l) * pred_conf_t(i, j, k, l) *
+              (1.0 - pred_conf_t(i, j, k, l)) * loss *
+              loss_weight_conf_notarget;
+
+          for (int c = 0; c < class_num; c++) {
+            grad_t(i, j * attr_num + 5 + c, k, l) =
+                grad_class_t(i, j, k, l, c) * pred_class_t(i, j, k, l, c) *
+                (1.0 - pred_class_t(i, j, k, l, c)) * loss * loss_weight_class;
+          }
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+class Yolov3LossKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<Tensor>("X");
+    auto* gt_box = ctx.Input<Tensor>("GTBox");
+    auto* gt_label = ctx.Input<Tensor>("GTLabel");
+    auto* loss = ctx.Output<Tensor>("Loss");
+    auto anchors = ctx.Attr<std::vector<int>>("anchors");
+    int class_num = ctx.Attr<int>("class_num");
+    float ignore_thresh = ctx.Attr<float>("ignore_thresh");
+    float loss_weight_xy = ctx.Attr<float>("loss_weight_xy");
+    float loss_weight_wh = ctx.Attr<float>("loss_weight_wh");
+    float loss_weight_conf_target = ctx.Attr<float>("loss_weight_conf_target");
+    float loss_weight_conf_notarget =
+        ctx.Attr<float>("loss_weight_conf_notarget");
+    float loss_weight_class = ctx.Attr<float>("loss_weight_class");
+
+    const int n = input->dims()[0];
+    const int h = input->dims()[2];
+    const int w = input->dims()[3];
+    const int an_num = anchors.size() / 2;
+
+    Tensor pred_x, pred_y, pred_w, pred_h;
+    Tensor pred_conf, pred_class;
+    pred_x.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
+    pred_y.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
+    pred_w.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
+    pred_h.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
+    pred_conf.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
+    pred_class.mutable_data<T>({n, an_num, h, w, class_num}, ctx.GetPlace());
+    CalcPredResult<T>(*input, &pred_conf, &pred_class, &pred_x, &pred_y,
+                      &pred_w, &pred_h, an_num, class_num);
+
+    Tensor obj_mask, noobj_mask;
+    Tensor tx, ty, tw, th, tconf, tclass;
+    obj_mask.mutable_data<int>({n, an_num, h, w}, ctx.GetPlace());
+    noobj_mask.mutable_data<int>({n, an_num, h, w}, ctx.GetPlace());
+    tx.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
+    ty.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
+    tw.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
+    th.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
+    tconf.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
+    tclass.mutable_data<T>({n, an_num, h, w, class_num}, ctx.GetPlace());
+    PreProcessGTBox<T>(*gt_box, *gt_label, ignore_thresh, anchors, h, &obj_mask,
+                       &noobj_mask, &tx, &ty, &tw, &th, &tconf, &tclass);
+
+    Tensor obj_mask_expand;
+    obj_mask_expand.mutable_data<int>({n, an_num, h, w, class_num},
+                                      ctx.GetPlace());
+    ExpandObjMaskByClassNum(&obj_mask_expand, obj_mask);
+
+    T loss_x = CalcMSEWithMask<T>(pred_x, tx, obj_mask);
+    T loss_y = CalcMSEWithMask<T>(pred_y, ty, obj_mask);
+    T loss_w = CalcMSEWithMask<T>(pred_w, tw, obj_mask);
+    T loss_h = CalcMSEWithMask<T>(pred_h, th, obj_mask);
+    T loss_conf_target = CalcBCEWithMask<T>(pred_conf, tconf, obj_mask);
+    T loss_conf_notarget = CalcBCEWithMask<T>(pred_conf, tconf, noobj_mask);
+    T loss_class = CalcBCEWithMask<T>(pred_class, tclass, obj_mask_expand);
+
+    auto* loss_data = loss->mutable_data<T>({1}, ctx.GetPlace());
+    loss_data[0] = loss_weight_xy * (loss_x + loss_y) +
+                   loss_weight_wh * (loss_w + loss_h) +
+                   loss_weight_conf_target * loss_conf_target +
+                   loss_weight_conf_notarget * loss_conf_notarget +
+                   loss_weight_class * loss_class;
+  }
+};
+
+template <typename T>
+class Yolov3LossGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<Tensor>("X");
+    auto* gt_box = ctx.Input<Tensor>("GTBox");
+    auto* gt_label = ctx.Input<Tensor>("GTLabel");
+    auto anchors = ctx.Attr<std::vector<int>>("anchors");
+    int class_num = ctx.Attr<int>("class_num");
+    float ignore_thresh = ctx.Attr<float>("ignore_thresh");
+    auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Loss"));
+    const T loss = output_grad->data<T>()[0];
+    float loss_weight_xy = ctx.Attr<float>("loss_weight_xy");
+    float loss_weight_wh = ctx.Attr<float>("loss_weight_wh");
+    float loss_weight_conf_target = ctx.Attr<float>("loss_weight_conf_target");
+    float loss_weight_conf_notarget =
+        ctx.Attr<float>("loss_weight_conf_notarget");
+    float loss_weight_class = ctx.Attr<float>("loss_weight_class");
+
+    const int n = input->dims()[0];
+    const int c = input->dims()[1];
+    const int h = input->dims()[2];
+    const int w = input->dims()[3];
+    const int an_num = anchors.size() / 2;
+
+    Tensor pred_x, pred_y, pred_w, pred_h;
+    Tensor pred_conf, pred_class;
+    pred_x.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
+    pred_y.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
+    pred_w.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
+    pred_h.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
+    pred_conf.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
+    pred_class.mutable_data<T>({n, an_num, h, w, class_num}, ctx.GetPlace());
+    CalcPredResult<T>(*input, &pred_conf, &pred_class, &pred_x, &pred_y,
+                      &pred_w, &pred_h, an_num, class_num);
+
+    Tensor obj_mask, noobj_mask;
+    Tensor tx, ty, tw, th, tconf, tclass;
+    obj_mask.mutable_data<int>({n, an_num, h, w}, ctx.GetPlace());
+    noobj_mask.mutable_data<int>({n, an_num, h, w}, ctx.GetPlace());
+    tx.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
+    ty.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
+    tw.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
+    th.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
+    tconf.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
+    tclass.mutable_data<T>({n, an_num, h, w, class_num}, ctx.GetPlace());
+    PreProcessGTBox<T>(*gt_box, *gt_label, ignore_thresh, anchors, h, &obj_mask,
+                       &noobj_mask, &tx, &ty, &tw, &th, &tconf, &tclass);
+
+    Tensor obj_mask_expand;
+    obj_mask_expand.mutable_data<int>({n, an_num, h, w, class_num},
+                                      ctx.GetPlace());
+    ExpandObjMaskByClassNum(&obj_mask_expand, obj_mask);
+
+    Tensor grad_x, grad_y, grad_w, grad_h;
+    Tensor grad_conf_target, grad_conf_notarget, grad_class;
+    grad_x.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
+    grad_y.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
+    grad_w.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
+    grad_h.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
+    grad_conf_target.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
+    grad_conf_notarget.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
+    grad_class.mutable_data<T>({n, an_num, h, w, class_num}, ctx.GetPlace());
+    T obj_mf = CalcMaskPointNum<int>(obj_mask);
+    T noobj_mf = CalcMaskPointNum<int>(noobj_mask);
+    T obj_expand_mf = CalcMaskPointNum<int>(obj_mask_expand);
+    CalcMSEGradWithMask<T>(&grad_x, pred_x, tx, obj_mask, obj_mf);
+    CalcMSEGradWithMask<T>(&grad_y, pred_y, ty, obj_mask, obj_mf);
+    CalcMSEGradWithMask<T>(&grad_w, pred_w, tw, obj_mask, obj_mf);
+    CalcMSEGradWithMask<T>(&grad_h, pred_h, th, obj_mask, obj_mf);
+    CalcBCEGradWithMask<T>(&grad_conf_target, pred_conf, tconf, obj_mask,
+                           obj_mf);
+    CalcBCEGradWithMask<T>(&grad_conf_notarget, pred_conf, tconf, noobj_mask,
+                           noobj_mf);
+    CalcBCEGradWithMask<T>(&grad_class, pred_class, tclass, obj_mask_expand,
+                           obj_expand_mf);
+
+    input_grad->mutable_data<T>({n, c, h, w}, ctx.GetPlace());
+    AddAllGradToInputGrad<T>(
+        input_grad, loss, pred_x, pred_y, pred_conf, pred_class, grad_x, grad_y,
+        grad_w, grad_h, grad_conf_target, grad_conf_notarget, grad_class,
+        class_num, loss_weight_xy, loss_weight_wh, loss_weight_conf_target,
+        loss_weight_conf_notarget, loss_weight_class);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index 93cb5eb2dc0b3480ebd05dcc6b36d8915d057bab..23c7ebe84221986a5f7ac7583c3a8e17d04fe4af 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -56,9 +56,16 @@ ELSE()
     set(MKLDNN_CTX_DEPS)
 ENDIF()
 
+nv_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce) 
+IF(WITH_GPU)
+  set(STREAM_CALLBACK_DEPS stream_callback_manager)
+ELSE()
+  set(STREAM_CALLBACK_DEPS)
+ENDIF()
+
 # memcpy depends on device_context, here add deps individually for
 # avoiding cycle dependencies
-cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc
+cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc ${STREAM_CALLBACK_DEPS}
     place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS})
 nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
 
diff --git a/paddle/fluid/platform/assert.h b/paddle/fluid/platform/assert.h
index 2ce9b31bb81de867ff4ed6ee14afddecd95317b9..2e8fa7c1b8f7f7b8f3154aae691bb100375981dd 100644
--- a/paddle/fluid/platform/assert.h
+++ b/paddle/fluid/platform/assert.h
@@ -36,6 +36,15 @@ limitations under the License. */
       asm("trap;");                                                     \
     }                                                                   \
   } while (0)
+
+#define PADDLE_ASSERT_MSG_CODE(e, m, c)                                    \
+  do {                                                                     \
+    if (!(e)) {                                                            \
+      printf("%s:%d Assertion `%s` failed (%s %d).\n", __FILE__, __LINE__, \
+             TOSTRING(e), m, c);                                           \
+      asm("trap;");                                                        \
+    }                                                                      \
+  } while (0)
 #else
 #include <assert.h>
 // For cuda, the assertions can affect performance and it is therefore
@@ -43,4 +52,5 @@ limitations under the License. */
 // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#assertion
 #define PADDLE_ASSERT(e) assert((e))
 #define PADDLE_ASSERT_MSG(e, m) assert((e) && (m))
+#define PADDLE_ASSERT_MSG_CODE(e, m, c) assert((e) && (m) && (c || 1))
 #endif
diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc
index d466f28d1ea0a8327f8d7a45c3e55c5aacd61544..f9a32bfa4c15261ba6b79fc4efd3a1961f7c6d4d 100644
--- a/paddle/fluid/platform/cpu_info.cc
+++ b/paddle/fluid/platform/cpu_info.cc
@@ -123,7 +123,6 @@ size_t CUDAPinnedMaxChunkSize() {
   return CUDAPinnedMaxAllocSize() / 256;
 }
 
-namespace jit {
 #ifdef PADDLE_WITH_XBYAK
 static Xbyak::util::Cpu cpu;
 bool MayIUse(const cpu_isa_t cpu_isa) {
@@ -165,6 +164,5 @@ bool MayIUse(const cpu_isa_t cpu_isa) {
 }
 #endif
 
-}  // namespace jit
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h
index fd31ef77b46d5b5b641983a0421da31914c87c18..55dba545ff133b1c219ee58f6d1bb2d2130d1a59 100644
--- a/paddle/fluid/platform/cpu_info.h
+++ b/paddle/fluid/platform/cpu_info.h
@@ -39,7 +39,6 @@ size_t CUDAPinnedMinChunkSize();
 //! Get the maximum chunk size for buddy allocator.
 size_t CUDAPinnedMaxChunkSize();
 
-namespace jit {
 typedef enum {
   isa_any,
   sse42,
@@ -55,7 +54,5 @@ typedef enum {
 // May I use some instruction
 bool MayIUse(const cpu_isa_t cpu_isa);
 
-}  // namespace jit
-
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/cuda_helper_test.cu b/paddle/fluid/platform/cuda_helper_test.cu
index ee45afab93d079374aefe366425502890854c28d..466bf90c63c1496883995819cdcb19f846e4a302 100644
--- a/paddle/fluid/platform/cuda_helper_test.cu
+++ b/paddle/fluid/platform/cuda_helper_test.cu
@@ -93,7 +93,7 @@ TEST(CudaAtomic, float16) {
 
 // unalignment of uint8
 void TestUnalign(size_t num, const int shift_bit) {
-  PADDLE_ENFORCE(num % 2 == 0, "must be a multiple of 2");
+  ASSERT_EQ(num % 2, 0);
   float16 *in1, *in2, *out;
   float16 *d_in1, *d_in2;
   size_t size = sizeof(uint8_t) * (num + shift_bit);
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index d0a108f905f46135bcd2b68be19ab396ab897272..bd81d4dd1f1073edffcb9fd4a02b455db27361d5 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -120,15 +120,24 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
   }
 
   void* allocate(size_t num_bytes) const override {
+    if (UNLIKELY(num_bytes == 0)) {
+      return nullptr;
+    }
     auto buf = paddle::memory::Alloc(place_, num_bytes,
                                      memory::Allocator::kScratchpad);
     void* retv = buf->ptr();
-    allocations_[buf->ptr()] = std::move(buf);
+    {
+      std::lock_guard<std::mutex> lock(mtx_);
+      allocations_.emplace(retv, std::move(buf));
+    }
     return retv;
   }
 
   void deallocate(void* buffer) const override {
-    allocations_.erase(allocations_.find(buffer));
+    if (LIKELY(buffer)) {
+      std::lock_guard<std::mutex> lock(mtx_);
+      allocations_.erase(buffer);
+    }
   }
 
   void* scratchpad() const override {
@@ -155,6 +164,7 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
   const cudaDeviceProp* device_prop_;  // not owned;
   mutable void* scratch_;
   mutable unsigned int* semaphore_;
+  mutable std::mutex mtx_;  // to protect allocations_
   mutable std::unordered_map<void*, memory::AllocationPtr> allocations_;
 };
 
@@ -210,6 +220,40 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
   LOG_FIRST_N(WARNING, 1) << "device: " << place_.device
                           << ", cuDNN Version: " << cudnn_dso_ver / 1000 << "."
                           << (cudnn_dso_ver % 100) / 10 << ".";
+
+  {
+    // Check CUDA/CUDNN version compatiblity
+    auto local_cuda_version = runtime_version_ / 100;
+    auto compile_cuda_version = CUDA_VERSION / 100;
+    if (local_cuda_version < compile_cuda_version) {
+      LOG_FIRST_N(WARNING, 1)
+          << "WARNING: device: " << place_.device
+          << ". The installed Paddle is compiled with CUDA "
+          << compile_cuda_version / 10 << "." << compile_cuda_version % 10
+          << ", but CUDA runtime version in your machine is "
+          << local_cuda_version / 10 << "." << local_cuda_version % 10
+          << ", which may cause serious incompatible bug. "
+          << "Please recompile or reinstall Paddle with compatible CUDA "
+             "version.";
+    }
+
+    if (dynload::HasCUDNN()) {
+      auto local_cudnn_version = cudnn_dso_ver / 100;
+      auto compile_cudnn_version = CUDNN_VERSION / 100;
+      if (local_cuda_version < compile_cuda_version) {
+        LOG_FIRST_N(WARNING, 1)
+            << "WARNING: device: " << place_.device
+            << ". The installed Paddle is compiled with CUDNN "
+            << compile_cudnn_version / 10 << "." << compile_cudnn_version % 10
+            << ", but CUDNN version in your machine is "
+            << local_cudnn_version / 10 << "." << local_cudnn_version % 10
+            << ", which may cause serious incompatible bug. "
+            << "Please recompile or reinstall Paddle with compatible CUDNN "
+               "version.";
+      }
+    }
+  }
+
   callback_manager_.reset(new StreamCallbackManager(stream_));
 }
 
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index 3edd727978010e20203ab994562ce922b6ee0bad..812e56f1f966d03207cf83ad47cb88e9fa5d55bb 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -21,7 +21,6 @@ limitations under the License. */
 #include "paddle/fluid/platform/dynload/cublas.h"
 #include "paddle/fluid/platform/dynload/cudnn.h"
 #include "paddle/fluid/platform/gpu_info.h"
-#define EIGEN_USE_GPU
 #endif
 
 #ifdef PADDLE_WITH_MKLDNN
@@ -223,14 +222,10 @@ class CUDADeviceContext : public DeviceContext {
 
   template <typename Callback>
   void AddStreamCallback(Callback&& callback) const {
-    std::lock_guard<std::mutex> guard(callback_mtx_);
     callback_manager_->AddCallback(callback);
   }
 
-  void WaitStreamCallback() const {
-    std::lock_guard<std::mutex> guard(callback_mtx_);
-    callback_manager_->Wait();
-  }
+  void WaitStreamCallback() const { callback_manager_->Wait(); }
 
 #if CUDA_VERSION >= 9000
   /*! \brief CublasCall may need to change cublas's config,
@@ -261,9 +256,7 @@ class CUDADeviceContext : public DeviceContext {
 
   mutable std::mutex mtx_;
 
-  // This lock is only used by callback
-  // If we use mtx_ for StreamCallbackManager, deadlock may occur sometimes
-  mutable std::mutex callback_mtx_;
+  // StreamCallbackManager is thread-safe
   std::unique_ptr<StreamCallbackManager> callback_manager_;
 
   mutable std::mutex cublas_mtx_;
diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc
index dc1d751141187edb7738e42c41514614d4d399b0..0a4563ead65b1e45adca1d1a1fce066a1a55d932 100644
--- a/paddle/fluid/platform/device_tracer.cc
+++ b/paddle/fluid/platform/device_tracer.cc
@@ -143,7 +143,7 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
           case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL: {
             auto *kernel =
                 reinterpret_cast<const CUpti_ActivityKernel3 *>(record);
-            tracer->AddKernelRecords(kernel->start, kernel->end,
+            tracer->AddKernelRecords(kernel->name, kernel->start, kernel->end,
                                      kernel->deviceId, kernel->streamId,
                                      kernel->correlationId);
             break;
@@ -224,8 +224,9 @@ class DeviceTracerImpl : public DeviceTracer {
                                      stream_id, correlation_id, bytes});
   }
 
-  void AddKernelRecords(uint64_t start, uint64_t end, int64_t device_id,
-                        int64_t stream_id, uint32_t correlation_id) {
+  void AddKernelRecords(std::string name, uint64_t start, uint64_t end,
+                        int64_t device_id, int64_t stream_id,
+                        uint32_t correlation_id) {
     // 0 means timestamp information could not be collected for the kernel.
     if (start == 0 || end == 0) {
       VLOG(3) << correlation_id << " cannot be traced";
@@ -233,7 +234,7 @@ class DeviceTracerImpl : public DeviceTracer {
     }
     std::lock_guard<std::mutex> l(trace_mu_);
     kernel_records_.push_back(
-        KernelRecord{start, end, device_id, stream_id, correlation_id});
+        KernelRecord{name, start, end, device_id, stream_id, correlation_id});
   }
 
   bool IsEnabled() {
@@ -276,13 +277,13 @@ class DeviceTracerImpl : public DeviceTracer {
     profile_pb.set_start_ns(start_ns_);
     profile_pb.set_end_ns(end_ns_);
     for (const KernelRecord &r : kernel_records_) {
-      if (correlations_.find(r.correlation_id) == correlations_.end()) {
-        fprintf(stderr, "cannot relate a kernel activity\n");
-        continue;
-      }
       auto *event = profile_pb.add_events();
       event->set_type(proto::Event::GPUKernel);
-      event->set_name(correlations_.at(r.correlation_id));
+      if (correlations_.find(r.correlation_id) != correlations_.end()) {
+        event->set_name(correlations_.at(r.correlation_id));
+      } else {
+        event->set_name(r.name);
+      }
       event->set_start_ns(r.start_ns);
       event->set_end_ns(r.end_ns);
       event->set_sub_device_id(r.stream_id);
diff --git a/paddle/fluid/platform/device_tracer.h b/paddle/fluid/platform/device_tracer.h
index eaf047d4744762f69d50bff8d467da8e3b8317cc..bf0786be2d0fafbf4b610d16ef587ac219399203 100644
--- a/paddle/fluid/platform/device_tracer.h
+++ b/paddle/fluid/platform/device_tracer.h
@@ -39,6 +39,7 @@ inline uint64_t PosixInNsec() {
 class DeviceTracer {
  public:
   struct KernelRecord {
+    std::string name;
     uint64_t start_ns;
     uint64_t end_ns;
     int64_t device_id;
@@ -84,8 +85,9 @@ class DeviceTracer {
 
   // Add a cuda kernel stats. `correlation_id` will be mapped to annotation
   // added before for human readability.
-  virtual void AddKernelRecords(uint64_t start, uint64_t end, int64_t device_id,
-                                int64_t stream_id, uint32_t correlation_id) = 0;
+  virtual void AddKernelRecords(std::string name, uint64_t start, uint64_t end,
+                                int64_t device_id, int64_t stream_id,
+                                uint32_t correlation_id) = 0;
 
   // Generate a proto after done (Disabled).
   virtual proto::Profile GenProfile(const std::string& profile_path) = 0;
diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h
index db62377898339def415a13d185f85f34de326d7f..550fe2edee13d628e761eca194809823537a4024 100644
--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
@@ -111,7 +111,22 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
   __macro(cudnnFindConvolutionForwardAlgorithmEx);        \
   __macro(cudnnFindConvolutionBackwardFilterAlgorithmEx); \
   __macro(cudnnFindConvolutionBackwardDataAlgorithmEx);   \
-  __macro(cudnnGetErrorString);
+  __macro(cudnnGetErrorString);                           \
+  __macro(cudnnCreateDropoutDescriptor);                  \
+  __macro(cudnnDropoutGetStatesSize);                     \
+  __macro(cudnnSetDropoutDescriptor);                     \
+  __macro(cudnnCreateRNNDescriptor);                      \
+  __macro(cudnnSetRNNDescriptor);                         \
+  __macro(cudnnGetRNNParamsSize);                         \
+  __macro(cudnnGetRNNWorkspaceSize);                      \
+  __macro(cudnnGetRNNTrainingReserveSize);                \
+  __macro(cudnnRNNForwardTraining);                       \
+  __macro(cudnnRNNBackwardData);                          \
+  __macro(cudnnRNNBackwardWeights);                       \
+  __macro(cudnnRNNForwardInference);                      \
+  __macro(cudnnDestroyDropoutDescriptor);                 \
+  __macro(cudnnDestroyRNNDescriptor);
+
 CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 
 #define CUDNN_DNN_ROUTINE_EACH_R2(__macro) \
@@ -149,6 +164,12 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 CUDNN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 #endif
 
+// APIs in R6
+#if CUDNN_VERSION >= 6000
+#define CUDNN_DNN_ROUTINE_EACH_R6(__macro) __macro(cudnnSetRNNDescriptor_v6);
+CUDNN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
+#endif
+
 #if CUDNN_VERSION >= 7001
 #define CUDNN_DNN_ROUTINE_EACH_R7(__macro)        \
   __macro(cudnnSetConvolutionGroupCount);         \
diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h
index 9273e9b1e72f0ad7abd6c20d4a34283fbe24378a..f0a973662360fd9ff35e1006cce937d86f3e563c 100644
--- a/paddle/fluid/platform/dynload/mklml.h
+++ b/paddle/fluid/platform/dynload/mklml.h
@@ -68,6 +68,8 @@ extern void* mklml_dso_handle;
   __macro(cblas_dgemm_batch);       \
   __macro(cblas_sdot);              \
   __macro(cblas_ddot);              \
+  __macro(cblas_sasum);             \
+  __macro(cblas_dasum);             \
   __macro(cblas_sscal);             \
   __macro(cblas_dscal);             \
   __macro(vsAdd);                   \
diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h
index a85972bdb72ca3119cc14f9e2b810c3875443538..01ee67fd07f848356e801be95d53a61bb5b08e37 100644
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -62,45 +62,54 @@ inline std::string demangle(std::string name) { return name; }
 #endif
 
 struct EnforceNotMet : public std::exception {
-  std::exception_ptr exp_;
   std::string err_str_;
-  EnforceNotMet(std::exception_ptr e, const char* f, int l) : exp_(e) {
-    static constexpr int TRACE_STACK_LIMIT = 100;
+  EnforceNotMet(std::exception_ptr e, const char* f, int l) {
     try {
-      std::rethrow_exception(exp_);
-    } catch (const std::exception& exp) {
-      std::ostringstream sout;
+      std::rethrow_exception(e);
+    } catch (std::exception& e) {
+      Init(e.what(), f, l);
+    }
+  }
 
-      sout << string::Sprintf("%s at [%s:%d]", exp.what(), f, l) << std::endl;
-      sout << "PaddlePaddle Call Stacks: " << std::endl;
+  template <typename... ARGS>
+  EnforceNotMet(const char* f, int l, ARGS... args) {
+    Init(string::Sprintf(args...), f, l);
+  }
+
+  const char* what() const noexcept override { return err_str_.c_str(); }
+
+ private:
+  template <typename StrType>
+  inline void Init(StrType what, const char* f, int l) {
+    static constexpr int TRACE_STACK_LIMIT = 100;
+    std::ostringstream sout;
+
+    sout << string::Sprintf("%s at [%s:%d]", what, f, l) << std::endl;
+    sout << "PaddlePaddle Call Stacks: " << std::endl;
 #if !defined(_WIN32)
-      void* call_stack[TRACE_STACK_LIMIT];
-      auto size = backtrace(call_stack, TRACE_STACK_LIMIT);
-      auto symbols = backtrace_symbols(call_stack, size);
-
-      Dl_info info;
-      for (int i = 0; i < size; ++i) {
-        if (dladdr(call_stack[i], &info) && info.dli_sname) {
-          auto demangled = demangle(info.dli_sname);
-          auto addr_offset = static_cast<char*>(call_stack[i]) -
-                             static_cast<char*>(info.dli_saddr);
-          sout << string::Sprintf("%-3d %*0p %s + %zd\n", i,
-                                  2 + sizeof(void*) * 2, call_stack[i],
-                                  demangled, addr_offset);
-        } else {
-          sout << string::Sprintf("%-3d %*0p\n", i, 2 + sizeof(void*) * 2,
-                                  call_stack[i]);
-        }
+    void* call_stack[TRACE_STACK_LIMIT];
+    auto size = backtrace(call_stack, TRACE_STACK_LIMIT);
+    auto symbols = backtrace_symbols(call_stack, size);
+    Dl_info info;
+    for (int i = 0; i < size; ++i) {
+      if (dladdr(call_stack[i], &info) && info.dli_sname) {
+        auto demangled = demangle(info.dli_sname);
+        auto addr_offset = static_cast<char*>(call_stack[i]) -
+                           static_cast<char*>(info.dli_saddr);
+        sout << string::Sprintf("%-3d %*0p %s + %zd\n", i,
+                                2 + sizeof(void*) * 2, call_stack[i], demangled,
+                                addr_offset);
+      } else {
+        sout << string::Sprintf("%-3d %*0p\n", i, 2 + sizeof(void*) * 2,
+                                call_stack[i]);
       }
-      free(symbols);
+    }
+    free(symbols);
 #else
-      sout << "Windows not support stack backtrace yet.";
+    sout << "Windows not support stack backtrace yet.";
 #endif
-      err_str_ = sout.str();
-    }
+    err_str_ = sout.str();
   }
-
-  const char* what() const noexcept { return err_str_.c_str(); }
 };
 
 struct EOFException : public std::exception {
@@ -242,13 +251,8 @@ inline void throw_on_error(T e) {
   throw_on_error(e, "");
 }
 
-#define PADDLE_THROW(...)                                              \
-  do {                                                                 \
-    throw ::paddle::platform::EnforceNotMet(                           \
-        std::make_exception_ptr(                                       \
-            std::runtime_error(paddle::string::Sprintf(__VA_ARGS__))), \
-        __FILE__, __LINE__);                                           \
-  } while (false)
+#define PADDLE_THROW(...) \
+  throw ::paddle::platform::EnforceNotMet(__FILE__, __LINE__, __VA_ARGS__)
 
 #ifndef REPLACE_ENFORCE_GLOG
 #define PADDLE_ENFORCE(...)                                             \
diff --git a/paddle/fluid/platform/float16.h b/paddle/fluid/platform/float16.h
index 9d48557caf75f3571ead3df43a1a93cf65e4b8cb..98afe843c0035ec14ad874508dc02b8d1d3d359c 100644
--- a/paddle/fluid/platform/float16.h
+++ b/paddle/fluid/platform/float16.h
@@ -71,9 +71,6 @@ struct float16;
 }  // namespace platform
 }  // namespace paddle
 
-// NOTE():
-// Do not move the eigen.h header, otherwise the eigen_vector<bool> will failed.
-#include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/platform/hostdevice.h"
 #include "unsupported/Eigen/CXX11/Tensor"
 
diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc
index 6954e4c6a9df8dea01ec2b0f193965d835503b17..ca89d91aadb2d3e9005e6dd06cef124428d7e250 100644
--- a/paddle/fluid/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 
 #include "gflags/gflags.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/string/split.h"
 
 #ifndef _WIN32
 constexpr static float fraction_of_gpu_memory_to_use = 0.92f;
@@ -45,6 +46,15 @@ DEFINE_bool(
     "input and output must be half precision) and recurrent neural networks "
     "(RNNs).");
 
+DEFINE_string(selected_gpus, "",
+              "A list of device ids separated by comma, like: 0,1,2,3. "
+              "This option is useful when doing multi process training and "
+              "each process have only one device (GPU). If you want to use "
+              "all visible devices, set this to empty string. NOTE: the "
+              "reason of doing this is that we want to use P2P communication"
+              "between GPU devices, use CUDA_VISIBLE_DEVICES can only use"
+              "share-memory only.");
+
 namespace paddle {
 namespace platform {
 
@@ -121,6 +131,24 @@ int GetCurrentDeviceId() {
   return device_id;
 }
 
+//! Get a list of device ids from environment variable or use all.
+std::vector<int> GetSelectedDevices() {
+  // use user specified GPUs in single-node multi-process mode.
+  std::vector<int> devices;
+  if (!FLAGS_selected_gpus.empty()) {
+    auto devices_str = paddle::string::Split(FLAGS_selected_gpus, ',');
+    for (auto id : devices_str) {
+      devices.push_back(atoi(id.c_str()));
+    }
+  } else {
+    int count = GetCUDADeviceCount();
+    for (int i = 0; i < count; ++i) {
+      devices.push_back(i);
+    }
+  }
+  return devices;
+}
+
 void SetDeviceId(int id) {
   // TODO(qijun): find a better way to cache the cuda device count
   PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
diff --git a/paddle/fluid/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h
index 6a0b3c8e02d49068c2dbe14c7feea7e139947694..1e1ab2503f53fe20bbe62c48f65d8535947f1aa8 100644
--- a/paddle/fluid/platform/gpu_info.h
+++ b/paddle/fluid/platform/gpu_info.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <cuda_runtime.h>
 #include <stddef.h>
 #include <string>
+#include <vector>
 
 namespace paddle {
 namespace platform {
@@ -47,6 +48,9 @@ int GetCUDAMaxThreadsPerMultiProcessor(int i);
 //! Get the current GPU device id in system.
 int GetCurrentDeviceId();
 
+//! Get a list of device ids from environment variable or use all.
+std::vector<int> GetSelectedDevices();
+
 //! Set the GPU device id for next execution.
 void SetDeviceId(int device_id);
 
diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc
index 258779ba51026d0cc418257a37b78f346fa48efa..0d10d82d74a2011b1b2bc088fe88cbfdb49600b8 100644
--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
@@ -19,6 +19,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/fluid/string/split.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #endif
@@ -82,10 +83,8 @@ void InitDevices(bool init_p2p) {
   std::vector<int> devices;
 #ifdef PADDLE_WITH_CUDA
   try {
-    int count = platform::GetCUDADeviceCount();
-    for (int i = 0; i < count; ++i) {
-      devices.push_back(i);
-    }
+    // use user specified GPUs in single-node multi-process mode.
+    devices = platform::GetSelectedDevices();
   } catch (const std::exception &exp) {
     LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime.";
   }
@@ -95,20 +94,15 @@ void InitDevices(bool init_p2p) {
 
 void InitDevices(bool init_p2p, const std::vector<int> devices) {
   std::vector<platform::Place> places;
-  int count = 0;
-#ifdef PADDLE_WITH_CUDA
-  try {
-    count = platform::GetCUDADeviceCount();
-  } catch (const std::exception &exp) {
-    LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime.";
-  }
-#endif
 
   for (size_t i = 0; i < devices.size(); ++i) {
-    if (devices[i] >= count || devices[i] < 0) {
+    // In multi process multi gpu mode, we may have gpuid = 7
+    // but count = 1.
+    if (devices[i] < 0) {
       LOG(WARNING) << "Invalid devices id.";
       continue;
     }
+
     places.emplace_back(platform::CUDAPlace(devices[i]));
   }
   if (init_p2p) {
@@ -122,7 +116,7 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
 #endif
 
 #if !defined(_WIN32) && !defined(__APPLE__) && !defined(__OSX__)
-  if (platform::jit::MayIUse(platform::jit::avx)) {
+  if (platform::MayIUse(platform::avx)) {
 #ifndef __AVX__
     LOG(WARNING) << "AVX is available, Please re-compile on local machine";
 #endif
@@ -137,10 +131,10 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
          " version or compile from source code."
 
 #ifdef __AVX512F__
-  if (!platform::jit::MayIUse(platform::jit::avx512f)) {
-    if (platform::jit::MayIUse(platform::jit::avx2)) {
+  if (!platform::MayIUse(platform::avx512f)) {
+    if (platform::MayIUse(platform::avx2)) {
       AVX_GUIDE(AVX512, AVX2);
-    } else if (platform::jit::MayIUse(platform::jit::avx)) {
+    } else if (platform::MayIUse(platform::avx)) {
       AVX_GUIDE(AVX512, AVX);
     } else {
       AVX_GUIDE(AVX512, NonAVX);
@@ -149,8 +143,8 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
 #endif
 
 #ifdef __AVX2__
-  if (!platform::jit::MayIUse(platform::jit::avx2)) {
-    if (platform::jit::MayIUse(platform::jit::avx)) {
+  if (!platform::MayIUse(platform::avx2)) {
+    if (platform::MayIUse(platform::avx)) {
       AVX_GUIDE(AVX2, AVX);
     } else {
       AVX_GUIDE(AVX2, NonAVX);
@@ -159,7 +153,7 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
 #endif
 
 #ifdef __AVX__
-  if (!platform::jit::MayIUse(platform::jit::avx)) {
+  if (!platform::MayIUse(platform::avx)) {
     AVX_GUIDE(AVX, NonAVX);
   }
 #endif
diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h
index 761a9815e098098cb4c4080bd8605dde7f6870a4..e53064893ee89f663a76483b92de32b318b6c61f 100644
--- a/paddle/fluid/platform/mkldnn_helper.h
+++ b/paddle/fluid/platform/mkldnn_helper.h
@@ -107,176 +107,24 @@ inline mkldnn::memory::format GetMKLDNNFormat(
       memory.dst_primitive_desc().desc().data.format);
 }
 
-class MKLDNNHandler {
- public:
-  MKLDNNHandler(const MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine,
-                const std::string& base_key)
-      : dev_ctx_(dev_ctx),
-        engine_(engine),
-        key_(base_key),
-        is_reusing_(false) {}
-
-  std::shared_ptr<mkldnn::memory> AcquireSrcMemory(
-      const mkldnn::memory::desc& md, void* ptr) {
-    return this->AcquireMemory(md, ptr, "@user_src_mem_p");
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireWeightsMemory(
-      const mkldnn::memory::desc& md, void* ptr) {
-    return this->AcquireMemory(md, ptr, "@user_weights_mem_p");
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireBiasMemory(
-      const mkldnn::memory::desc& md, void* ptr) {
-    return this->AcquireMemory(md, ptr, "@user_bias_mem_p");
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireDstMemory(
-      const mkldnn::memory::desc& md, void* ptr) {
-    return this->AcquireMemory(md, ptr, "@user_dst_mem_p");
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireDiffDstMemory(
-      const mkldnn::memory::desc& md, void* ptr) {
-    return this->AcquireMemory(md, ptr, "@user_diff_dst_mem_p");
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireDiffSrcMemory(
-      const mkldnn::memory::desc& md, void* ptr) {
-    return this->AcquireMemory(md, ptr, "@user_diff_src_mem_p");
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireMemoryFromPrimitive(
-      mkldnn::memory::primitive_desc mdp, void* ptr,
-      const std::string& suffix) {
-    auto local_key = key_ + suffix;
-    auto mem_p =
-        std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
-    PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false),
-                   "Fail to find mem primitive in device context");
-    if (mem_p == nullptr) {
-      mem_p = std::make_shared<mkldnn::memory>(mdp, ptr);
-      dev_ctx_.SetBlob(local_key, mem_p);
-    } else {
-      mem_p->set_data_handle(ptr);
-      // Mark that reusing happenned. All primitives from operator instance
-      // should be reused or none of them. So we check consistency
-      is_reusing_ = true;
-    }
-    return mem_p;
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireMemory(const mkldnn::memory::desc& md,
-                                                void* ptr,
-                                                const std::string& suffix) {
-    /*Generate key*/
-    auto local_key = key_ + suffix;
-    auto mem_p =
-        std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
-    PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false),
-                   "Fail to find mem primitive in device context");
-    if (mem_p == nullptr) {
-      mem_p = std::make_shared<mkldnn::memory>(
-          mkldnn::memory::primitive_desc{md, engine_}, ptr);
-      dev_ctx_.SetBlob(local_key, mem_p);
-    } else {
-      mem_p->set_data_handle(ptr);
-      // Mark that reusing happenned. All primitives from operator instance
-      // should be reused or none of them. So we check consistency
-      is_reusing_ = true;
-    }
-    return mem_p;
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireMemory(
-      const std::shared_ptr<mkldnn::memory>& user_memory_p,
-      const std::shared_ptr<mkldnn::memory>& target_memory_p,
-      const std::string& suffix,
-      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
-    auto local_key = key_ + suffix;
-    auto key_reorder_p = key_ + suffix + "reorder_p";
-
-    auto stored_reorder_p = std::static_pointer_cast<mkldnn::reorder>(
-        dev_ctx_.GetBlob(key_reorder_p));
-
-    if (stored_reorder_p) {
-      pipeline.push_back(*stored_reorder_p);
-    } else {
-      auto reorder_p =
-          std::make_shared<mkldnn::reorder>(*user_memory_p, *target_memory_p);
-      dev_ctx_.SetBlob(key_reorder_p, reorder_p);
-      pipeline.push_back(*reorder_p);
-    }
-
-    return target_memory_p;
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireMemory(
-      mkldnn::memory::primitive_desc& mpd,       // NOLINT
-      mkldnn::memory::primitive_desc& user_mpd,  // NOLINT
-      const std::shared_ptr<mkldnn::memory> user_memory_p,
-      const std::string& suffix,
-      std::vector<mkldnn::primitive>& pipeline,  // NOLINT
-      bool is_persistent = false) {
-    // create reorder primitive if the input format is not the preferred one
-    auto local_key = key_ + suffix;
-    auto key_reorder_p = key_ + suffix + "reorder_p";
-
-    auto target_memory_p =
-        std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
-    PADDLE_ENFORCE((target_memory_p != nullptr) || (is_reusing_ == false),
-                   "Fail to find mem primitive in device context");
-    if (target_memory_p == nullptr) {
-      target_memory_p = user_memory_p;
-      std::shared_ptr<mkldnn::primitive> reorder_p;
-      if (mpd != user_mpd) {
-        target_memory_p = std::make_shared<mkldnn::memory>(mpd);
-
-        auto reorder_p =
-            std::make_shared<mkldnn::reorder>(*user_memory_p, *target_memory_p);
-        dev_ctx_.SetBlob(key_reorder_p, reorder_p);
-        pipeline.push_back(*reorder_p);
-      }
-      dev_ctx_.SetBlob(local_key, target_memory_p);
-    } else if (!is_persistent) {
-      // Make reorder if needed
-      auto reorder_p = std::static_pointer_cast<mkldnn::reorder>(
-          dev_ctx_.GetBlob(key_reorder_p));
-      if (reorder_p != nullptr) {
-        pipeline.push_back(*reorder_p);
-      }
-      is_reusing_ = true;
-    }
-    return target_memory_p;
-  }
-
-  static std::string GetHash(mkldnn::memory::dims& operand_dims,  // NOLINT
-                             const std::string& suffix) {
-    return dims2str(operand_dims) + suffix;
-  }
-
- protected:
-  static std::string dims2str(const mkldnn::memory::dims& operand_dims) {
-    std::string dstr = "";
-    for (size_t i = 0; i < operand_dims.size(); ++i) {
-      dstr += std::to_string(operand_dims[i]) + "-";
-    }
-    return dstr;
-  }
-
- protected:
-  const MKLDNNDeviceContext& dev_ctx_;
-  mkldnn::engine engine_;
-  std::string key_;
-  bool is_reusing_;
-};
-
 inline mkldnn::memory::format MKLDNNFormatForSize(
     size_t dims_size, mkldnn::memory::format data_format) {
   if (dims_size == 1) {
     return mkldnn::memory::format::x;
   } else if (dims_size == 2) {
     return mkldnn::memory::format::nc;
+  } else if (dims_size == 3) {
+    if (data_format == mkldnn::memory::format::nchw) {
+      return mkldnn::memory::format::ncw;
+    } else if (data_format == mkldnn::memory::format::nhwc) {
+      return mkldnn::memory::format::nwc;
+    }
+  } else if (dims_size == 5) {
+    if (data_format == mkldnn::memory::format::nchw) {
+      return mkldnn::memory::format::ncdhw;
+    } else if (data_format == mkldnn::memory::format::nhwc) {
+      return mkldnn::memory::format::ndhwc;
+    }
   }
   return data_format;
 }
diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h
new file mode 100644
index 0000000000000000000000000000000000000000..1c6421f3fa6ffbe7d3c682611def9e87d2fae5b0
--- /dev/null
+++ b/paddle/fluid/platform/mkldnn_reuse.h
@@ -0,0 +1,458 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#include "paddle/fluid/platform/place.h"
+
+namespace paddle {
+namespace platform {
+
+using user_function = std::function<std::shared_ptr<float>(const float*)>;
+
+class MKLDNNHandler {
+ public:
+  MKLDNNHandler(const MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine,
+                const std::string& base_key)
+      : dev_ctx_(dev_ctx),
+        engine_(engine),
+        key_(base_key),
+        is_reusing_(false) {}
+
+  std::shared_ptr<mkldnn::memory> AcquireSrcMemory(
+      const mkldnn::memory::desc& md, void* ptr) {
+    return this->AcquireMemory(md, ptr, "@user_src_mem_p");
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireWeightsMemory(
+      const mkldnn::memory::desc& md, void* ptr,
+      user_function custom_func = {}) {
+    return this->AcquireMemory(md, ptr, "@user_weights_mem_p", custom_func);
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireBiasMemory(
+      const mkldnn::memory::desc& md, void* ptr) {
+    return this->AcquireMemory(md, ptr, "@user_bias_mem_p");
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireDstMemory(
+      const mkldnn::memory::desc& md, void* ptr) {
+    return this->AcquireMemory(md, ptr, "@user_dst_mem_p");
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireDiffDstMemory(
+      const mkldnn::memory::desc& md, void* ptr) {
+    return this->AcquireMemory(md, ptr, "@user_diff_dst_mem_p");
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireDiffSrcMemory(
+      const mkldnn::memory::desc& md, void* ptr) {
+    return this->AcquireMemory(md, ptr, "@user_diff_src_mem_p");
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireMemoryFromPrimitive(
+      mkldnn::memory::primitive_desc mdp, void* ptr,
+      const std::string& suffix) {
+    auto local_key = key_ + suffix;
+    auto mem_p =
+        std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
+    PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false),
+                   "Fail to find mem primitive in device context");
+    if (mem_p == nullptr) {
+      mem_p = std::make_shared<mkldnn::memory>(mdp, ptr);
+      dev_ctx_.SetBlob(local_key, mem_p);
+    } else {
+      mem_p->set_data_handle(ptr);
+      // Mark that reusing happenned. All primitives from operator instance
+      // should be reused or none of them. So we check consistency
+      is_reusing_ = true;
+    }
+    return mem_p;
+  }
+
+  // This incarnation of AcquireMemory can call user function eg. custom reorder
+  // or preprocessing routine if needed
+  std::shared_ptr<mkldnn::memory> AcquireMemory(
+      const mkldnn::memory::desc& md, void* ptr, const std::string& suffix,
+      user_function custom_func = {}) {
+    /*Generate key*/
+    auto local_key = key_ + suffix;
+    auto mem_p =
+        std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
+    PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false),
+                   "Fail to find mem primitive in device context");
+    if (mem_p == nullptr) {
+      // Call custom reorder/preprocessing func if available
+      if (custom_func) {
+        auto reordered_data = custom_func(reinterpret_cast<const float*>(ptr));
+        dev_ctx_.SetBlob(local_key + "-custom_reorder", reordered_data);
+        ptr = reinterpret_cast<void*>(reordered_data.get());
+      }
+
+      mem_p = std::make_shared<mkldnn::memory>(
+          mkldnn::memory::primitive_desc{md, engine_}, ptr);
+      dev_ctx_.SetBlob(local_key, mem_p);
+    } else {
+      mem_p->set_data_handle(ptr);
+      // Mark that reusing happenned. All primitives from operator instance
+      // should be reused or none of them. So we check consistency
+      is_reusing_ = true;
+    }
+    return mem_p;
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireMemory(
+      const std::shared_ptr<mkldnn::memory>& user_memory_p,
+      const std::shared_ptr<mkldnn::memory>& target_memory_p,
+      const std::string& suffix,
+      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
+    auto local_key = key_ + suffix;
+    auto key_reorder_p = key_ + suffix + "reorder_p";
+
+    auto stored_reorder_p = std::static_pointer_cast<mkldnn::reorder>(
+        dev_ctx_.GetBlob(key_reorder_p));
+
+    if (stored_reorder_p) {
+      pipeline.push_back(*stored_reorder_p);
+    } else {
+      auto reorder_p =
+          std::make_shared<mkldnn::reorder>(*user_memory_p, *target_memory_p);
+      dev_ctx_.SetBlob(key_reorder_p, reorder_p);
+      pipeline.push_back(*reorder_p);
+    }
+
+    return target_memory_p;
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireMemory(
+      mkldnn::memory::primitive_desc& mpd,       // NOLINT
+      mkldnn::memory::primitive_desc& user_mpd,  // NOLINT
+      const std::shared_ptr<mkldnn::memory> user_memory_p,
+      const std::string& suffix,
+      std::vector<mkldnn::primitive>& pipeline,  // NOLINT
+      bool is_persistent = false) {
+    // create reorder primitive if the input format is not the preferred one
+    auto local_key = key_ + suffix;
+    auto key_reorder_p = key_ + suffix + "reorder_p";
+
+    auto target_memory_p =
+        std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
+    PADDLE_ENFORCE((target_memory_p != nullptr) || (is_reusing_ == false),
+                   "Fail to find mem primitive in device context");
+    if (target_memory_p == nullptr) {
+      target_memory_p = user_memory_p;
+      std::shared_ptr<mkldnn::primitive> reorder_p;
+      if (mpd != user_mpd) {
+        target_memory_p = std::make_shared<mkldnn::memory>(mpd);
+        auto reorder_p =
+            std::make_shared<mkldnn::reorder>(*user_memory_p, *target_memory_p);
+        dev_ctx_.SetBlob(key_reorder_p, reorder_p);
+        pipeline.push_back(*reorder_p);
+      }
+      dev_ctx_.SetBlob(local_key, target_memory_p);
+    } else if (!is_persistent) {
+      // Make reorder if needed
+      auto reorder_p = std::static_pointer_cast<mkldnn::reorder>(
+          dev_ctx_.GetBlob(key_reorder_p));
+      if (reorder_p != nullptr) {
+        pipeline.push_back(*reorder_p);
+      }
+      is_reusing_ = true;
+    }
+    return target_memory_p;
+  }
+
+  static std::string GetHash(mkldnn::memory::dims& operand_dims,  // NOLINT
+                             const std::string& suffix) {
+    return dims2str(operand_dims) + suffix;
+  }
+
+ protected:
+  static std::string dims2str(const mkldnn::memory::dims& operand_dims) {
+    std::string dstr = "";
+    for (size_t i = 0; i < operand_dims.size(); ++i) {
+      dstr += std::to_string(operand_dims[i]) + "-";
+    }
+    return dstr;
+  }
+
+ protected:
+  const MKLDNNDeviceContext& dev_ctx_;
+  mkldnn::engine engine_;
+  std::string key_;
+  bool is_reusing_;
+};
+
+template <class forward_t, class backward_data_t, class backward_weights_t>
+class ConvMKLDNNTemplateHandler : public MKLDNNHandler {
+ public:
+  ConvMKLDNNTemplateHandler(
+      std::shared_ptr<typename forward_t::primitive_desc> conv_pd,
+      const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine,
+      const std::string& base_key)
+      : platform::MKLDNNHandler(dev_ctx, engine, base_key) {
+    conv_pd_ = conv_pd;
+  }
+
+  ConvMKLDNNTemplateHandler(
+      std::shared_ptr<typename forward_t::primitive_desc> conv_pd,
+      std::shared_ptr<typename backward_data_t::primitive_desc>
+          conv_bwd_data_pd,
+      std::shared_ptr<typename backward_weights_t::primitive_desc>
+          conv_bwd_weights_pd,
+      const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine,
+      const std::string& base_key)
+      : platform::MKLDNNHandler(dev_ctx, engine, base_key),
+        conv_pd_(conv_pd),
+        conv_bwd_weights_pd_(conv_bwd_weights_pd),
+        conv_bwd_data_pd_(conv_bwd_data_pd) {
+    // If we are in Grad operatgor then update a key with BWD suffix to
+    // distinguish from FWD memory primitives
+    key_ += "-BWD";
+  }
+
+  size_t GetDstMemorySize() const {
+    return conv_pd_->dst_primitive_desc().get_size();
+  }
+
+  mkldnn::memory::format GetDstFormat() const {
+    return static_cast<mkldnn::memory::format>(
+        conv_pd_->dst_primitive_desc().desc().data.format);
+  }
+
+  size_t GetDiffWeightsMemorySize() const {
+    return conv_bwd_weights_pd_->diff_weights_primitive_desc().get_size();
+  }
+
+  size_t GetDiffSourceMemorySize() const {
+    return conv_bwd_data_pd_->diff_src_primitive_desc().get_size();
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireSrcMemoryFromWeightsPrimitive(
+      const std::shared_ptr<mkldnn::memory> user_memory_p,
+      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
+    auto src_pd = conv_bwd_weights_pd_->src_primitive_desc();
+    auto user_pd = user_memory_p->get_primitive_desc();
+    return this->AcquireMemory(src_pd, user_pd, user_memory_p,
+                               "@weights-src_mem_p", pipeline);
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireDiffDstMemoryFromWeightsPrimitive(
+      const std::shared_ptr<mkldnn::memory> user_memory_p,
+      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
+    auto diff_dst_pd = conv_bwd_weights_pd_->diff_dst_primitive_desc();
+    auto user_pd = user_memory_p->get_primitive_desc();
+    return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p,
+                               "@weights-diff_dst_mem_p", pipeline);
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireDiffWeightsMemoryFromWeightsPrimitive(
+      void* ptr) {
+    return this->AcquireMemoryFromPrimitive(
+        conv_bwd_weights_pd_->diff_weights_primitive_desc(), ptr,
+        "@diff_weights_mem_p");
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireDiffDstMemoryFromDataPrimitive(
+      const std::shared_ptr<mkldnn::memory> user_memory_p,
+      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
+    auto diff_dst_pd = conv_bwd_data_pd_->diff_dst_primitive_desc();
+    auto user_pd = user_memory_p->get_primitive_desc();
+    return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p,
+                               "@data-diff_dst_mem_p", pipeline);
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireWeightsMemoryFromDataPrimitive(
+      const std::shared_ptr<mkldnn::memory> user_weights_memory_p,
+      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
+    auto weights_pd = conv_bwd_data_pd_->weights_primitive_desc();
+    auto user_pd = user_weights_memory_p->get_primitive_desc();
+    return this->AcquireMemory(weights_pd, user_pd, user_weights_memory_p,
+                               "@data-weights_mem_p", pipeline);
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireResidualDataMemory(
+      const mkldnn::memory::desc& md, void* ptr) {
+    return this->AcquireMemory(md, ptr, "@user_residual_data_mem_p");
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireDstMemoryFromResidualDataMemory(
+      const std::shared_ptr<mkldnn::memory>& user_residual_memory_p,
+      void* dst_ptr,
+      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
+    return this->AcquireMemory(user_residual_memory_p,
+                               this->AcquireDstMemoryFromPrimitive(dst_ptr),
+                               "@residual_data_mem_p", pipeline);
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireDiffSrcMemoryFromDataPrimitive(
+      void* ptr) {
+    return this->AcquireMemoryFromPrimitive(
+        conv_bwd_data_pd_->diff_src_primitive_desc(), ptr, "@diff_src_mem_p");
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireDstMemoryFromPrimitive(void* ptr) {
+    return this->AcquireMemoryFromPrimitive(conv_pd_->dst_primitive_desc(), ptr,
+                                            "@dst_mem_p");
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireSrcMemoryFromPrimitive(
+      const std::shared_ptr<mkldnn::memory> user_memory_p,
+      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
+    auto src_pd = conv_pd_->src_primitive_desc();
+    auto user_pd = user_memory_p->get_primitive_desc();
+    return this->AcquireMemory(src_pd, user_pd, user_memory_p, "@src_mem_p",
+                               pipeline);
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireWeightsMemoryFromPrimitive(
+      const std::shared_ptr<mkldnn::memory> user_weights_memory_p,
+      std::vector<mkldnn::primitive>& pipeline,  // NOLINT
+      bool is_persistent = false) {
+    auto user_weights_pd = user_weights_memory_p->get_primitive_desc();
+    auto weights_pd = conv_pd_->weights_primitive_desc();
+    return this->AcquireMemory(weights_pd, user_weights_pd,
+                               user_weights_memory_p, "@weights_mem_p",
+                               pipeline, is_persistent);
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireBiasMemoryFromPrimitive(
+      const std::shared_ptr<mkldnn::memory> user_bias_memory_p,
+      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
+    auto user_bias_pd = user_bias_memory_p->get_primitive_desc();
+    auto bias_pd = conv_pd_->bias_primitive_desc();
+    return this->AcquireMemory(bias_pd, user_bias_pd, user_bias_memory_p,
+                               "@bias_mem_p", pipeline);
+  }
+
+  std::shared_ptr<forward_t> AcquireConvolution(
+      std::shared_ptr<mkldnn::memory> src_memory_p,
+      std::shared_ptr<mkldnn::memory> weights_memory_p,
+      std::shared_ptr<mkldnn::memory> dst_memory_p) {
+    auto prim_key = key_ + "@conv_p";
+    auto conv_p =
+        std::static_pointer_cast<forward_t>(dev_ctx_.GetBlob(prim_key));
+    PADDLE_ENFORCE((conv_p != nullptr) || (is_reusing_ == false),
+                   "Fail to find convolution primitive in device context");
+    if (conv_p == nullptr) {
+      conv_p = std::make_shared<forward_t>(*conv_pd_, *(src_memory_p),
+                                           *(weights_memory_p.get()),
+                                           *(dst_memory_p.get()));
+
+      dev_ctx_.SetBlob(prim_key, conv_p);
+    } else {
+      is_reusing_ = true;
+    }
+    return conv_p;
+  }
+
+  std::shared_ptr<forward_t> AcquireConvolution(
+      std::shared_ptr<mkldnn::memory> src_memory_p,
+      std::shared_ptr<mkldnn::memory> weights_memory_p,
+      std::shared_ptr<mkldnn::memory> bias_memory_p,
+      std::shared_ptr<mkldnn::memory> dst_memory_p) {
+    auto prim_key = key_ + "@conv_p";
+    auto conv_p =
+        std::static_pointer_cast<forward_t>(dev_ctx_.GetBlob(prim_key));
+    PADDLE_ENFORCE((conv_p != nullptr) || (is_reusing_ == false),
+                   "Fail to find convolution primitive in device context");
+    if (conv_p == nullptr) {
+      conv_p = std::make_shared<forward_t>(
+          *conv_pd_, *(src_memory_p), *(weights_memory_p.get()),
+          *(bias_memory_p.get()), *(dst_memory_p.get()));
+
+      dev_ctx_.SetBlob(prim_key, conv_p);
+    } else {
+      is_reusing_ = true;
+    }
+    return conv_p;
+  }
+
+  std::shared_ptr<backward_weights_t> AcquireConvolutionBackwardWeights(
+      std::shared_ptr<mkldnn::memory> src_memory_p,
+      std::shared_ptr<mkldnn::memory> diff_dst_memory_p,
+      std::shared_ptr<mkldnn::memory> diff_weights_memory_p) {
+    auto prim_key = key_ + "@conv_bwd_weights_p";
+    auto conv_bwd_weights_p = std::static_pointer_cast<backward_weights_t>(
+        dev_ctx_.GetBlob(prim_key));
+    PADDLE_ENFORCE(
+        (conv_bwd_weights_p != nullptr) || (is_reusing_ == false),
+        "Fail to find convolution bwd weights primitive in device context");
+    if (conv_bwd_weights_p == nullptr) {
+      // create backward conv primitive for weights
+      conv_bwd_weights_p = std::make_shared<backward_weights_t>(
+          *conv_bwd_weights_pd_, *src_memory_p, *diff_dst_memory_p,
+          *diff_weights_memory_p);
+      dev_ctx_.SetBlob(prim_key, conv_bwd_weights_p);
+    } else {
+      is_reusing_ = true;
+    }
+    return conv_bwd_weights_p;
+  }
+
+  std::shared_ptr<backward_data_t> AcquireConvolutionBackwardData(
+      std::shared_ptr<mkldnn::memory> diff_dst_memory_p,
+      std::shared_ptr<mkldnn::memory> weights_memory_p,
+      std::shared_ptr<mkldnn::memory> diff_src_memory_p) {
+    auto prim_key = key_ + "@conv_bwd_data_p";
+    auto conv_bwd_data_p =
+        std::static_pointer_cast<backward_data_t>(dev_ctx_.GetBlob(prim_key));
+    PADDLE_ENFORCE(
+        (conv_bwd_data_p != nullptr) || (is_reusing_ == false),
+        "Fail to find convolution bwd data primitive in device context");
+    if (conv_bwd_data_p == nullptr) {
+      conv_bwd_data_p = std::make_shared<backward_data_t>(
+          *conv_bwd_data_pd_, *diff_dst_memory_p, *weights_memory_p,
+          *diff_src_memory_p);
+      dev_ctx_.SetBlob(prim_key, conv_bwd_data_p);
+    } else {
+      is_reusing_ = true;
+    }
+    return conv_bwd_data_p;
+  }
+
+  // Generate keys for storing/retriving primitives for this operator
+  // TODO(jczaja): Make hashing function more optimial
+  static std::string GetHash(mkldnn::memory::dims& input_dims,    // NOLINT
+                             mkldnn::memory::dims& weights_dims,  // NOLINT
+                             std::vector<int>& strides,           // NOLINT
+                             std::vector<int>& paddings,          // NOLINT
+                             std::vector<int>& dilations,         // NOLINT
+                             int groups, const std::string& suffix) {
+    return dims2str(input_dims) + dims2str(weights_dims) + dims2str(strides) +
+           dims2str(paddings) + dims2str(dilations) + std::to_string(groups) +
+           suffix;
+  }
+
+ private:
+  std::shared_ptr<typename forward_t::primitive_desc> conv_pd_;
+  std::shared_ptr<typename backward_weights_t::primitive_desc>
+      conv_bwd_weights_pd_;
+  std::shared_ptr<typename backward_data_t::primitive_desc> conv_bwd_data_pd_;
+};
+
+using ConvMKLDNNHandler =
+    ConvMKLDNNTemplateHandler<mkldnn::convolution_forward,
+                              mkldnn::convolution_backward_data,
+                              mkldnn::convolution_backward_weights>;
+
+using ConvTransposeMKLDNNHandler =
+    ConvMKLDNNTemplateHandler<mkldnn::deconvolution_forward,
+                              mkldnn::deconvolution_backward_data,
+                              mkldnn::deconvolution_backward_weights>;
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h
index fc903b548c70e9b72c6121dd24c014973e3cd1d4..7c539d25f6dd02fc09aa1234d7bf0164b54a610f 100644
--- a/paddle/fluid/platform/nccl_helper.h
+++ b/paddle/fluid/platform/nccl_helper.h
@@ -97,7 +97,7 @@ struct NCCLContextMap {
         order_.size(), contexts_.size(),
         "NCCL Context Map does not support contain two or more same device");
 
-    if (places.size() <= 1) {
+    if (places.size() <= 1 && num_trainers == 1) {
       return;
     }
     std::unique_ptr<ncclComm_t[]> comms(new ncclComm_t[order_.size()]);
@@ -111,12 +111,19 @@ struct NCCLContextMap {
       {
         int nranks = num_trainers * order_.size();
         NCCLGroupGuard gurad;
-        for (auto &gpu_id : order_) {
-          int rank = trainer_id * order_.size() + gpu_id;
-          VLOG(3) << "init nccl rank: " << rank << " nranks: " << nranks;
+        for (size_t i = 0; i < order_.size(); ++i) {
+          int gpu_id = order_[i];
+          int rank;
+          if (order_.size() > 1) {
+            rank = trainer_id * order_.size() + i;
+          } else {
+            rank = trainer_id;
+          }
+          VLOG(30) << "init nccl rank: " << rank << " nranks: " << nranks
+                   << "gpu id: " << gpu_id;
           PADDLE_ENFORCE(cudaSetDevice(gpu_id));
           PADDLE_ENFORCE(platform::dynload::ncclCommInitRank(
-              comms.get() + gpu_id, nranks, *nccl_id, rank));
+              comms.get() + i, nranks, *nccl_id, rank));
         }
       }
     }
diff --git a/paddle/fluid/platform/stream_callback_manager.cc b/paddle/fluid/platform/stream_callback_manager.cc
new file mode 100644
index 0000000000000000000000000000000000000000..466c77469ef256179c52442d21c1d62dfc4ef1bb
--- /dev/null
+++ b/paddle/fluid/platform/stream_callback_manager.cc
@@ -0,0 +1,63 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/platform/stream_callback_manager.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace platform {
+
+#if CUDA_VERSION >= 10000
+static void CUDART_CB StreamCallbackFunc(void *user_data);
+#else
+static void CUDART_CB StreamCallbackFunc(cudaStream_t stream,
+                                         cudaError_t status, void *user_data)
+#endif
+{
+  std::unique_ptr<std::function<void()>> func(
+      reinterpret_cast<std::function<void()> *>(user_data));
+  (*func)();
+}
+
+StreamCallbackManager::StreamCallbackManager(const cudaStream_t stream)
+    : stream_(stream), thread_pool_(1) {}
+
+void StreamCallbackManager::AddCallback(std::function<void()> callback) const {
+  auto *callback_func = new std::function<void()>(std::move(callback));
+  auto *func = new std::function<void()>([this, callback_func] {
+    std::lock_guard<std::mutex> lock(mtx_);
+    last_future_ = thread_pool_.enqueue([callback_func] {
+      std::unique_ptr<std::function<void()>> releaser(callback_func);
+      (*callback_func)();
+    });
+  });
+#if CUDA_VERSION >= 10000
+  PADDLE_ENFORCE(cudaLaunchHostFunc(stream_, StreamCallbackFunc, func));
+#else
+  PADDLE_ENFORCE(cudaStreamAddCallback(stream_, StreamCallbackFunc, func, 0));
+#endif
+}
+
+void StreamCallbackManager::Wait() const {
+  PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
+  {
+    std::lock_guard<std::mutex> lock(mtx_);
+    if (last_future_.valid()) {
+      last_future_.wait();
+    }
+  }
+}
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/stream_callback_manager.h b/paddle/fluid/platform/stream_callback_manager.h
index ed8734c98cb996721a3002523c9276e6d7f492ae..8668bcb1131719e882ecbccb08ad00b63409eb28 100644
--- a/paddle/fluid/platform/stream_callback_manager.h
+++ b/paddle/fluid/platform/stream_callback_manager.h
@@ -18,67 +18,32 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <functional>
+#include <future>  // NOLINT
 #include <memory>
+#include <mutex>  // NOLINT
+
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
 namespace platform {
 
-class StreamCallbackManager;
-
-struct StreamCallbackContext {
-  template <typename Callback>
-  inline StreamCallbackContext(const StreamCallbackManager *manager,
-                               Callback &&callback)
-      : manager_(manager), callback_(callback) {}
-
-  const StreamCallbackManager *manager_;  // do not own
-  std::function<void()> callback_;
-};
-
+// NOTE(zjl): clean StreamCallbackManager to make compilation faster
+// Make StreamCallbackManager thread-safe
 class StreamCallbackManager {
  public:
-  explicit inline StreamCallbackManager(cudaStream_t stream = nullptr)
-      : stream_(stream), thread_pool_(new ThreadPool(1)) {}
+  explicit StreamCallbackManager(const cudaStream_t stream);
+
+  ~StreamCallbackManager() = default;
 
-  template <typename Callback>
-  inline void AddCallback(Callback &&callback) const {
-    auto *stream_callback_context =
-        new StreamCallbackContext(this, std::forward<Callback>(callback));
-#if CUDA_VERSION >= 10000
-    PADDLE_ENFORCE(cudaLaunchHostFunc(stream_,
-                                      StreamCallbackManager::StreamCallbackFunc,
-                                      stream_callback_context));  // NOLINT
-#else
-    PADDLE_ENFORCE(cudaStreamAddCallback(
-        stream_, StreamCallbackManager::StreamCallbackFunc,
-        stream_callback_context, 0));  // NOLINT
-#endif
-  }
+  void AddCallback(std::function<void()> callback) const;
 
-  void Wait() const { thread_pool_.reset(new ThreadPool(1)); }
+  void Wait() const;
 
  private:
   const cudaStream_t stream_;
-  mutable std::unique_ptr<ThreadPool> thread_pool_;
-
-// cudaStreamCallback cannot call CUDA API inside, so we have to use
-// thread_pool here
-#if CUDA_VERSION >= 10000
-  static void CUDART_CB StreamCallbackFunc(void *user_data)
-#else
-  static void CUDART_CB StreamCallbackFunc(cudaStream_t stream,
-                                           cudaError_t status, void *user_data)
-#endif
-  {
-    auto *callback_context_ptr =
-        reinterpret_cast<StreamCallbackContext *>(user_data);
-    callback_context_ptr->manager_->thread_pool_->enqueue([=]() {
-      std::unique_ptr<StreamCallbackContext> callback_context(
-          callback_context_ptr);
-      callback_context->callback_();
-    });
-  }
+  mutable ::ThreadPool thread_pool_;
+  mutable std::mutex mtx_;
+  mutable std::future<void> last_future_;
 };
 
 }  // namespace platform
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index 25d241d9768c16e1da304a78f259d5a626f702fc..b8954cb12628d1f4f333956e0213ddf9c01e592c 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -1,6 +1,7 @@
 
-set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method pass_builder parallel_executor profiler)
-set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc)
+set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler layer)
+set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc imperative.cc)
+
 if(WITH_PYTHON)
   if(WITH_AMD_GPU)
     hip_library(paddle_pybind SHARED
diff --git a/paddle/fluid/pybind/async_executor_py.cc b/paddle/fluid/pybind/async_executor_py.cc
new file mode 100644
index 0000000000000000000000000000000000000000..470e8b050808295d49728bbdb757b6a612df9a01
--- /dev/null
+++ b/paddle/fluid/pybind/async_executor_py.cc
@@ -0,0 +1,53 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <fcntl.h>
+
+// To avoid conflicting definition in gcc-4.8.2 headers and pyconfig.h (2.7.3)
+#ifdef _POSIX_C_SOURCE
+#undef _POSIX_C_SOURCE
+#endif
+
+#ifdef _XOPEN_SOURCE
+#undef _XOPEN_SOURCE
+#endif
+#include <string>
+#include <vector>
+
+#include "google/protobuf/io/zero_copy_stream_impl.h"
+#include "google/protobuf/text_format.h"
+#include "paddle/fluid/framework/async_executor.h"
+#include "paddle/fluid/framework/data_feed.h"
+#include "paddle/fluid/framework/data_feed.pb.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/inference/io.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/platform/variant.h"
+#include "paddle/fluid/pybind/async_executor_py.h"
+
+namespace py = pybind11;
+namespace pd = paddle::framework;
+
+namespace paddle {
+namespace pybind {
+using set_name_func = void (pd::DataFeedDesc::*)(const std::string&);
+void BindAsyncExecutor(py::module* m) {
+  py::class_<framework::AsyncExecutor>(*m, "AsyncExecutor")
+      .def(py::init([](framework::Scope* scope, const platform::Place& place) {
+        return std::unique_ptr<framework::AsyncExecutor>(
+            new framework::AsyncExecutor(scope, place));
+      }))
+      .def("run_from_files", &framework::AsyncExecutor::RunFromFile);
+}  // end BindAsyncExecutor
+}  // end namespace pybind
+}  // end namespace paddle
diff --git a/paddle/fluid/pybind/async_executor_py.h b/paddle/fluid/pybind/async_executor_py.h
new file mode 100644
index 0000000000000000000000000000000000000000..a99d6e04218c9310ede00de7d9bdfc015889bd22
--- /dev/null
+++ b/paddle/fluid/pybind/async_executor_py.h
@@ -0,0 +1,28 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "pybind11/pybind11.h"
+#include "pybind11/stl.h"
+
+namespace py = pybind11;
+
+namespace paddle {
+namespace pybind {
+
+void BindAsyncExecutor(py::module* m);
+
+}  // namespace pybind
+}  // namespace paddle
diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc
new file mode 100644
index 0000000000000000000000000000000000000000..34e9c897d9e95feb185083b7c0a6a824d8dc809c
--- /dev/null
+++ b/paddle/fluid/pybind/imperative.cc
@@ -0,0 +1,36 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/pybind/imperative.h"
+#include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/imperative/tracer.h"
+
+namespace paddle {
+namespace pybind {
+
+// Bind Methods
+void BindTracer(pybind11::module *m) {
+  pybind11::class_<imperative::Tracer>(*m, "Tracer", "")
+      .def("__init__",
+           [](imperative::Tracer &self, framework::BlockDesc *root_block) {
+             new (&self) imperative::Tracer(root_block);
+           })
+      .def("trace", &imperative::Tracer::Trace)
+      .def("get_scope", &imperative::Tracer::GetScope,
+           pybind11::return_value_policy::reference);
+}
+
+}  // namespace pybind
+}  // namespace paddle
diff --git a/paddle/fluid/pybind/imperative.h b/paddle/fluid/pybind/imperative.h
new file mode 100644
index 0000000000000000000000000000000000000000..7a9d3a01ea81f11ac85000c3d0153f20e108789a
--- /dev/null
+++ b/paddle/fluid/pybind/imperative.h
@@ -0,0 +1,53 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#include <Python.h>
+#include <vector>
+#include "paddle/fluid/imperative/layer.h"
+#include "pybind11/pybind11.h"
+#include "pybind11/stl.h"
+
+namespace paddle {
+namespace pybind {
+
+class PyLayer : public imperative::Layer {
+ public:
+  using imperative::Layer::Layer;  // Inherit constructors
+
+  std::vector<imperative::VarBase> Forward(
+      const std::vector<imperative::VarBase>& inputs) override {
+    PYBIND11_OVERLOAD(std::vector<imperative::VarBase>, Layer, Forward,
+                      inputs);  // NOLINT
+  }
+
+  void Backward() override {
+    PYBIND11_OVERLOAD(void, Layer, Backward, );  // NOLINT
+  }
+};
+
+class PyOpBase : public imperative::OpBase {
+ public:
+  using imperative::OpBase::OpBase;  // Inherit constructors
+};
+
+class PyVarBase : public imperative::VarBase {
+ public:
+  using imperative::VarBase::VarBase;  // Inherit constructors
+};
+
+void BindTracer(pybind11::module* m);
+
+}  // namespace pybind
+}  // namespace paddle
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index e31c2f217322be8ef8b131189504b54cf6b4ad80..74b4f2e937b3d3715b13b03e8d3618c0afafb69c 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -34,6 +34,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/reader.h"
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/version.h"
+#include "paddle/fluid/imperative/layer.h"
 #include "paddle/fluid/memory/allocation/allocator_strategy.h"
 #include "paddle/fluid/operators/activation_op.h"
 #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
@@ -42,8 +43,10 @@ limitations under the License. */
 #include "paddle/fluid/platform/init.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"
+#include "paddle/fluid/pybind/async_executor_py.h"
 #include "paddle/fluid/pybind/const_value.h"
 #include "paddle/fluid/pybind/exception.h"
+#include "paddle/fluid/pybind/imperative.h"
 #include "paddle/fluid/pybind/protobuf.h"
 #include "paddle/fluid/pybind/pybind.h"  // NOLINT
 #include "paddle/fluid/pybind/recordio.h"
@@ -99,6 +102,42 @@ PYBIND11_MODULE(core, m) {
 
   BindException(&m);
 
+  py::class_<imperative::VarBase, PyVarBase>(m, "VarBase", R"DOC()DOC")
+      .def(py::init<>())
+      .def("_run_backward",
+           [](imperative::VarBase &self, framework::Scope *scope) {
+             self.RunBackward(scope);
+           })
+      .def("_grad", &imperative::VarBase::Grad)
+      .def_property(
+          "desc",
+          [](const imperative::VarBase &self) { return self.var_desc_; },
+          [](imperative::VarBase &self, framework::VarDesc *var_desc) {
+            self.var_desc_ = var_desc;
+          },
+          py::return_value_policy::reference);
+
+  py::class_<imperative::OpBase, PyOpBase>(m, "OpBase", R"DOC()DOC")
+      .def(py::init<>())
+      .def_property(
+          "desc", [](const imperative::OpBase &self) { return self.op_desc_; },
+          [](imperative::OpBase &self, framework::OpDesc *op_desc) {
+            if (op_desc) {
+              self.op_desc_ = op_desc;
+            }
+          },
+          py::return_value_policy::reference);
+
+  py::class_<imperative::Layer, PyLayer /* <--- trampoline*/> layer(m, "Layer");
+  layer.def(py::init<>())
+      .def("forward",
+           [](imperative::Layer &self,
+              const std::vector<imperative::VarBase> &inputs) {
+             return self.Forward(inputs);
+           })
+      .def("backward", &imperative::Layer::Backward);
+  BindTracer(&m);
+
   py::class_<Tensor>(m, "Tensor", py::buffer_protocol())
       .def_buffer(
           [](Tensor &self) -> py::buffer_info { return CastToPyBuffer(self); })
@@ -297,6 +336,8 @@ PYBIND11_MODULE(core, m) {
       .def("get_tensor",
            [](SelectedRows &self) { return self.mutable_value(); },
            py::return_value_policy::reference)
+      .def("numel",
+           [](SelectedRows &self) -> int64_t { return self.value().numel(); })
       .def("set_height", &SelectedRows::set_height)
       .def("height", &SelectedRows::height)
       .def("set_rows",
@@ -398,7 +439,26 @@ All parameter, weight, gradient are variables in Paddle.
             },
         py::return_value_policy::copy);
 
-  py::class_<Scope>(m, "Scope", "")
+  py::class_<Scope>(m, "Scope", R"DOC(
+    Scope is an association of a name to Variable. All variables belong to Scope.
+
+    Variables in a parent scope can be retrieved from local scope.
+
+    You need to specify a scope to run a Net, i.e., `exe.Run(&scope)`.
+    One net can run in different scopes and update different variable in the
+    scope.
+
+    You can create var in a scope and get it from the scope.
+
+    Examples:
+        .. code-block:: python
+
+          # create tensor from a scope and set value to it.
+          param = scope.var('Param').get_tensor()
+          param_array = np.full((height, row_numel), 5.0).astype("float32")
+          param.set(param_array, place)
+
+        )DOC")
       .def("var",
            [](Scope &self, const std::string &name) -> Variable * {
              return self.Var(name);
@@ -581,6 +641,7 @@ All parameter, weight, gradient are variables in Paddle.
 
   m.def("set_feed_variable", framework::SetFeedVariable);
   m.def("get_fetch_variable", framework::GetFetchVariable);
+  m.def("get_variable_tensor", framework::GetVariableTensor);
 
   m.def("_is_program_version_supported", IsProgramVersionSupported);
 
@@ -866,6 +927,18 @@ All parameter, weight, gradient are variables in Paddle.
           [](BuildStrategy &self, int num_trainers) {
             self.num_trainers_ = num_trainers;
           })
+      .def_property(
+          "trainers_endpoints",
+          [](const BuildStrategy &self) { return self.trainers_endpoints_; },
+          [](BuildStrategy &self,
+             const std::vector<std::string> &trainers_endpoints) {
+            self.trainers_endpoints_ = trainers_endpoints;
+          })
+      .def_property("trainer_id",
+                    [](const BuildStrategy &self) { return self.trainer_id_; },
+                    [](BuildStrategy &self, int trainer_id) {
+                      self.trainer_id_ = trainer_id;
+                    })
       .def_property(
           "fuse_elewise_add_act_ops",
           [](const BuildStrategy &self) {
@@ -913,6 +986,7 @@ All parameter, weight, gradient are variables in Paddle.
       });
 
   BindRecordIOWriter(&m);
+  BindAsyncExecutor(&m);
 }
 }  // namespace pybind
 }  // namespace paddle
diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h
index 02a75236f6c7c7a64f2aa110ca7a7e3d92832fe9..24800e17098759082fd047e51a10fa40ff48b961 100644
--- a/paddle/fluid/pybind/tensor_py.h
+++ b/paddle/fluid/pybind/tensor_py.h
@@ -162,7 +162,7 @@ void PyCPUTensorSetFromArray(
     paddle::platform::CPUPlace place) {
   std::vector<int64_t> dims;
   dims.reserve(array.ndim());
-  for (size_t i = 0; i < array.ndim(); ++i) {
+  for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) {
     dims.push_back(static_cast<int>(array.shape()[i]));
   }
 
@@ -182,7 +182,7 @@ inline void PyCPUTensorSetFromArray(
     paddle::platform::CPUPlace place) {
   std::vector<int64_t> dims;
   dims.reserve(array.ndim());
-  for (size_t i = 0; i < array.ndim(); ++i) {
+  for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) {
     dims.push_back(static_cast<int>(array.shape()[i]));
   }
 
@@ -200,7 +200,7 @@ void PyCUDATensorSetFromArray(
     paddle::platform::CUDAPlace place) {
   std::vector<int64_t> dims;
   dims.reserve(array.ndim());
-  for (size_t i = 0; i < array.ndim(); ++i) {
+  for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) {
     dims.push_back(static_cast<int>(array.shape()[i]));
   }
 
@@ -221,7 +221,7 @@ inline void PyCUDATensorSetFromArray(
     paddle::platform::CUDAPlace place) {
   std::vector<int64_t> dims;
   dims.reserve(array.ndim());
-  for (size_t i = 0; i < array.ndim(); ++i) {
+  for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) {
     dims.push_back(static_cast<int>(array.shape()[i]));
   }
 
@@ -240,7 +240,7 @@ void PyCUDAPinnedTensorSetFromArray(
     const paddle::platform::CUDAPinnedPlace &place) {
   std::vector<int64_t> dims;
   dims.reserve(array.ndim());
-  for (size_t i = 0; i < array.ndim(); ++i) {
+  for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) {
     dims.push_back(static_cast<int>(array.shape()[i]));
   }
 
@@ -260,7 +260,7 @@ inline void PyCUDAPinnedTensorSetFromArray(
     const paddle::platform::CUDAPinnedPlace &place) {
   std::vector<int64_t> dims;
   dims.reserve(array.ndim());
-  for (size_t i = 0; i < array.ndim(); ++i) {
+  for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) {
     dims.push_back(static_cast<int>(array.shape()[i]));
   }
 
diff --git a/paddle/fluid/string/CMakeLists.txt b/paddle/fluid/string/CMakeLists.txt
index 8572dc1e8e543b552e3ed5a180ec942faf90a624..169a925d12328e7d1df744635445b5674c19b125 100644
--- a/paddle/fluid/string/CMakeLists.txt
+++ b/paddle/fluid/string/CMakeLists.txt
@@ -3,3 +3,4 @@ cc_library(pretty_log SRCS pretty_log.cc)
 cc_test(stringpiece_test SRCS piece_test.cc DEPS stringpiece glog gflags)
 cc_test(stringprintf_test SRCS printf_test.cc DEPS glog gflags)
 cc_test(to_string_test SRCS to_string_test.cc)
+cc_test(split_test SRCS split_test.cc)
diff --git a/paddle/fluid/string/split.h b/paddle/fluid/string/split.h
new file mode 100644
index 0000000000000000000000000000000000000000..ccb96b8a9cb68f03acbca592a2149ba5001f34d2
--- /dev/null
+++ b/paddle/fluid/string/split.h
@@ -0,0 +1,37 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <sstream>
+#include <string>
+#include <vector>
+
+namespace paddle {
+namespace string {
+
+static inline std::vector<std::string> Split(std::string const& original,
+                                             char separator) {
+  std::vector<std::string> results;
+  std::string token;
+  std::istringstream is(original);
+  while (std::getline(is, token, separator)) {
+    if (!token.empty()) {
+      results.push_back(token);
+    }
+  }
+  return results;
+}
+
+}  // namespace string
+}  // namespace paddle
diff --git a/paddle/fluid/string/split_test.cc b/paddle/fluid/string/split_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c85dc1eed40dbe25d922c0f4810a747d1bd2d60f
--- /dev/null
+++ b/paddle/fluid/string/split_test.cc
@@ -0,0 +1,28 @@
+//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/string/split.h"
+
+#include <string>
+
+#include "gtest/gtest.h"
+
+TEST(StringSplit, StringSplit) {
+  std::string to_split = "0,1,2,3,4,5";
+  int i = 0;
+  for (auto s : paddle::string::Split(to_split, ',')) {
+    EXPECT_EQ(atoi(s.c_str()), i);
+    i++;
+  }
+}
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index a6720fa798ec5cf60a8806a7f72fe6febaf4f7ac..6299b166af8a5f65cf587ae282c955f33db0044b 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -437,14 +437,32 @@ EOF
         export http_proxy=
         export https_proxy=
         # TODO: jiabin need to refine this part when these tests fixed on mac
-        ctest --output-on-failure -j $1
+        ctest --output-on-failure -j $2
         # make install should also be test when unittest
         make install -j 8
-        pip install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
+        if [ "$1" == "cp27-cp27m" ]; then
+            pip install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
+        elif [ "$1" == "cp35-cp35m" ]; then
+            pip3.5 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
+        elif [ "$1" == "cp36-cp36m" ]; then
+            pip3.6 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
+        elif [ "$1" == "cp37-cp37m" ]; then
+            pip3.7 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
+        fi
+      
         if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then
             paddle version
         fi
-        pip uninstall -y paddlepaddle
+
+        if [ "$1" == "cp27-cp27m" ]; then
+            pip uninstall -y paddlepaddle
+        elif [ "$1" == "cp35-cp35m" ]; then
+            pip3.5 uninstall -y paddlepaddle
+        elif [ "$1" == "cp36-cp36m" ]; then
+            pip3.6 uninstall -y paddlepaddle
+        elif [ "$1" == "cp37-cp37m" ]; then
+            pip3.7 uninstall -y paddlepaddle
+        fi
     fi
 }
 
@@ -454,12 +472,15 @@ function assert_api_not_changed() {
     virtualenv .env
     source .env/bin/activate
     pip install ${PADDLE_ROOT}/build/python/dist/*whl
-    python ${PADDLE_ROOT}/tools/print_signatures.py paddle.fluid > new.spec
+    python ${PADDLE_ROOT}/tools/print_signatures.py paddle.fluid,paddle.reader > new.spec
     if [ "$1" == "cp35-cp35m" ] || [ "$1" == "cp36-cp36m" ] || [ "$1" == "cp37-cp37m" ]; then
         # Use sed to make python2 and python3 sepc keeps the same
         sed -i 's/arg0: str/arg0: unicode/g' new.spec
         sed -i "s/\(.*Transpiler.*\).__init__ ArgSpec(args=\['self'].*/\1.__init__ /g" new.spec
     fi
+    # ComposeNotAligned has significant difference between py2 and py3
+    sed -i '/.*ComposeNotAligned.*/d' new.spec
+
     python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API.spec new.spec
     deactivate
 }
@@ -469,7 +490,19 @@ function assert_api_spec_approvals() {
         BRANCH="develop"
     fi
 
-    API_FILES=("paddle/fluid/API.spec" "paddle/fluid/framework/operator.h")
+    API_FILES=("paddle/fluid/API.spec"
+               "paddle/fluid/framework/operator.h"
+               "paddle/fluid/framework/tensor.h"
+               "paddle/fluid/framework/lod_tensor.h"
+               "paddle/fluid/framework/selected_rows.h"
+               "paddle/fluid/framework/op_desc.h"
+               "paddle/fluid/framework/block_desc.h"
+               "paddle/fluid/framework/var_desc.h"
+               "paddle/fluid/framework/scope.h"
+               "paddle/fluid/framework/ir/node.h"
+               "paddle/fluid/framework/ir/graph.h"
+               "paddle/fluid/framework/framework.proto"
+               "paddle/fluid/operators/distributed/send_recv.proto.in")
     for API_FILE in ${API_FILES[*]}; do
       API_CHANGE=`git diff --name-only upstream/$BRANCH | grep "${API_FILE}" || true`
       echo "checking ${API_FILE} change, PR: ${GIT_PR_ID}, changes: ${API_CHANGE}"
@@ -883,7 +916,7 @@ function main() {
       maccheck)
         cmake_gen ${PYTHON_ABI:-""}
         build_mac
-        run_mac_test ${PROC_RUN:-1}
+        run_mac_test ${PYTHON_ABI:-""} ${PROC_RUN:-1}
         ;;
       macbuild)
         cmake_gen ${PYTHON_ABI:-""}
diff --git a/python/paddle/dataset/image.py b/python/paddle/dataset/image.py
index 19fc229e6fa84792f58aeeb00be09eb2401b19c7..57547f1867a937d16fb2dfc9b84e1a30759a527e 100644
--- a/python/paddle/dataset/image.py
+++ b/python/paddle/dataset/image.py
@@ -32,11 +32,28 @@ the image layout as follows.
 
 from __future__ import print_function
 
+import six
 import numpy as np
-try:
-    import cv2
-except ImportError:
-    cv2 = None
+# FIXME(minqiyang): this is an ugly fix for the numpy bug reported here
+# https://github.com/numpy/numpy/issues/12497
+if six.PY3:
+    import subprocess
+    import sys
+    import_cv2_proc = subprocess.Popen(
+        [sys.executable, "-c", "import cv2"],
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE)
+    out, err = import_cv2_proc.communicate()
+    retcode = import_cv2_proc.poll()
+    if retcode != 0:
+        cv2 = None
+    else:
+        import cv2
+else:
+    try:
+        import cv2
+    except ImportError:
+        cv2 = None
 import os
 import tarfile
 import six.moves.cPickle as pickle
diff --git a/python/paddle/dataset/wmt16.py b/python/paddle/dataset/wmt16.py
index aa66696fae7d3adb44511417edf4a92b82a9151b..1052d24c57b79e1db921f59bb6ea6ecdc87a7f81 100644
--- a/python/paddle/dataset/wmt16.py
+++ b/python/paddle/dataset/wmt16.py
@@ -71,15 +71,16 @@ def __build_dict(tar_file, dict_size, save_path, lang):
             for w in sen.split():
                 word_dict[w] += 1
 
-    with open(save_path, "w") as fout:
-        fout.write("%s\n%s\n%s\n" % (START_MARK, END_MARK, UNK_MARK))
+    with open(save_path, "wb") as fout:
+        fout.write(
+            cpt.to_bytes("%s\n%s\n%s\n" % (START_MARK, END_MARK, UNK_MARK)))
         for idx, word in enumerate(
                 sorted(
                     six.iteritems(word_dict), key=lambda x: x[1],
                     reverse=True)):
             if idx + 3 == dict_size: break
-            fout.write(word[0].encode('utf-8'))
-            fout.write('\n')
+            fout.write(cpt.to_bytes(word[0]))
+            fout.write(cpt.to_bytes('\n'))
 
 
 def __load_dict(tar_file, dict_size, lang, reverse=False):
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index f7fefb3e5b767e25373665058d4fd6a298fb3d60..e0bb0d1152b258f9b16bb9063acf0ac6012d3432 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -20,6 +20,13 @@ from .framework import *
 # import all class inside executor into fluid module
 from . import executor
 from .executor import *
+
+from . import data_feed_desc
+from .data_feed_desc import *
+
+from . import async_executor
+from .async_executor import *
+
 from . import trainer
 from . import inferencer
 
@@ -27,6 +34,7 @@ from . import io
 from . import evaluator
 from . import initializer
 from . import layers
+from . import imperative
 from . import contrib
 from . import nets
 from . import optimizer
@@ -54,11 +62,13 @@ Tensor = LoDTensor
 
 __all__ = framework.__all__ + executor.__all__ + \
     trainer.__all__ + inferencer.__all__ + transpiler.__all__ + \
-    parallel_executor.__all__ + lod_tensor.__all__ + [
+    parallel_executor.__all__ + lod_tensor.__all__ + \
+    data_feed_desc.__all__ + async_executor.__all__ + [
         'io',
         'initializer',
         'layers',
         'contrib',
+        'imperative',
         'transpiler',
         'nets',
         'optimizer',
@@ -116,8 +126,9 @@ def __bootstrap__():
         'check_nan_inf', 'benchmark', 'eager_delete_scope', 'use_mkldnn',
         'use_ngraph', 'initial_cpu_memory_in_mb', 'init_allocated_mem',
         'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size",
-        'eager_delete_tensor_gb', 'allocator_strategy',
-        'reader_queue_speed_test_mode', 'print_sub_graph_dir'
+        'eager_delete_tensor_gb', 'fast_eager_deletion_mode',
+        'allocator_strategy', 'reader_queue_speed_test_mode',
+        'print_sub_graph_dir', 'pe_profile_fname'
     ]
     if 'Darwin' not in sysstr:
         read_env_flags.append('use_pinned_memory')
@@ -139,7 +150,7 @@ def __bootstrap__():
         read_env_flags += [
             'fraction_of_gpu_memory_to_use', 'cudnn_deterministic',
             'enable_cublas_tensor_op_math', 'conv_workspace_size_limit',
-            'cudnn_exhaustive_search'
+            'cudnn_exhaustive_search', 'selected_gpus'
         ]
     core.init_gflags([sys.argv[0]] +
                      ["--tryfromenv=" + ",".join(read_env_flags)])
diff --git a/python/paddle/fluid/async_executor.py b/python/paddle/fluid/async_executor.py
new file mode 100644
index 0000000000000000000000000000000000000000..2664a7301db3bf471126ff26504e7042f02b7d84
--- /dev/null
+++ b/python/paddle/fluid/async_executor.py
@@ -0,0 +1,151 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import contextlib
+import six
+from .framework import Program, default_main_program, Variable
+from . import core
+from .executor import global_scope, Executor
+from paddle.fluid.proto import data_feed_pb2
+from google.protobuf import text_format
+from . import io
+from .data_feed_desc import DataFeedDesc
+
+__all__ = ['AsyncExecutor']
+
+
+class AsyncExecutor(object):
+    """
+    An asynchronous Executor in Python. Through exploiting the power of
+    multi-core processor and data queueing, AsyncExecutor makes data reading
+    and cosuming decoupled, each run in multiple threads in parallel.
+
+    Instead of reading data in python side, AsyncExecutor accepts a training
+    file list, which will be retrieved in C++, then training inputs will be
+    read, parsed and fed to training network within C++ code.
+
+    AsyncExecutor is in active development and the API might change in the near
+    future.
+
+    Example:
+        >>> data_feed = fluid.DataFeedDesc('data.proto')
+        >>> startup_program = fluid.default_startup_program()
+        >>> main_program = fluid.default_main_program()
+        >>> filelist = ["train_data/part-%d" % i for i in range(100)]
+        >>> thread_num = len(filelist) / 4
+        >>>
+        >>> place = fluid.CPUPlace()
+        >>> async_executor = fluid.AsyncExecutor(place)
+        >>>
+        >>> async_executor.run_startup_program(startup_program)
+        >>>
+        >>> epoch = 10
+        >>> for i in range(epoch):
+        >>>     async_executor.run(main_program,
+        >>>                        data_feed,
+        >>>                        filelist,
+        >>>                        thread_num,
+        >>>                        [acc],
+        >>>                        debug=False)
+
+    Args:
+        place(fluid.CPUPlace|None): indicate the executor run on which device.
+                                   Only CPUPlace supported
+
+    Note:
+        For debugging complicated network in parallel-GPUs, you can test it
+        on the executor. They has the exactly same arguments, and expected
+        the same results.
+
+    Note: Only running on CPUPlace supported.
+    """
+
+    def __init__(self, place=None):
+        if place is None:
+            place = core.CPUPlace()
+        if not isinstance(place, core.CPUPlace):
+            raise ValueError("AsyncExecutor only supports CPU device")
+
+        p = core.Place()
+        p.set_place(place)
+
+        scope = global_scope()
+        self.executor = core.AsyncExecutor(scope, p)
+
+    def run(self, program, data_feed, filelist, thread_num, fetch, debug=False):
+        """
+        Run program by this AsyncExecutor. Training dataset will be in filelist.
+        Users can also inspect certain variables by naming them in parameter
+        :code:`fetch`, like in fluid.Executor. Unlike fluid.Executor, however,
+        AsyncExecutor doesn't return fetched variables, instead, it will dump
+        the values of each fetched variable to stdandard output.
+
+        Running the dataset will be on multiple threads, within each a thread
+        local scope will be created, then all OPs also created in that scope.
+        Parameters are updated by all the OPs simultaneously.
+
+        Args:
+            program(Program): the program that need to run, if not provied,
+                              then default_main_program will be used.
+            data_feed(DataFeedDesc): A DataFeedDesc object
+            filelist(str): a file containing the training dataset file list
+            thread_num(int): number of concurrent training threads. See
+                             :code:`Note` for how to set this properly
+            fetch(str|list): the var name or a list of var names to inspect
+            debug(bool): When set to True, fetch vars will be printed to
+                         standard output after each minibatch
+
+        Note:
+            the executor will run all operators in the program but not only
+            the operators dependent by the fetch_list.
+
+        Note:
+            Running AsyncExecutor will be on multiple threads, each bound to a
+            CPU core. To achieve best performance, it's suggested to set thread
+            num to be equal or slightly less than that of CPU cores.
+        """
+        if program is None:
+            program = default_main_program()
+        program_desc = program.desc
+
+        if data_feed is None:
+            raise ValueError('ValueError: data_feed should be provided')
+
+        if filelist is None:
+            raise ValueError('ValueError: filelist should be provided')
+
+        if isinstance(filelist, str):
+            filelist = [filelist]
+
+        if not isinstance(thread_num, int):
+            raise TypeError('TypeError: thread_num should be a positive number')
+
+        if fetch is not None:
+            if isinstance(fetch, Variable):
+                fetch = [fetch]
+            fetch_var_names = [var.name for var in fetch]
+            for fetch_var in fetch:
+                shape = fetch_var.shape
+                if shape[len(shape) - 1] != 1:
+                    raise AssertionError(
+                        "%s: Fetch variable has wrong shape. Only varibles "
+                        "with the last dimension size 1 supported." %
+                        (fetch_var.name))
+
+        self.executor.run_from_files(program_desc,
+                                     data_feed.desc(), filelist, thread_num,
+                                     fetch_var_names, debug)
diff --git a/python/paddle/fluid/average.py b/python/paddle/fluid/average.py
index 42cd3b36420ef5a17a9a7d981978ba8869809936..40a734af311e2037c1816dce97db123ebedd2f4f 100644
--- a/python/paddle/fluid/average.py
+++ b/python/paddle/fluid/average.py
@@ -48,6 +48,7 @@ class WeightedAverage(object):
 
     Examples:
         .. code-block:: python
+
             avg = fluid.average.WeightedAverage()
             avg.add(value=2.0, weight=1)
             avg.add(value=4.0, weight=2)
diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py
index 1738afe93e99f1de28bec2fb23be8e1a309d9288..0f7dd531b3e5992caa558def6bbdf446a7d2ffaa 100644
--- a/python/paddle/fluid/clip.py
+++ b/python/paddle/fluid/clip.py
@@ -134,12 +134,12 @@ class GradientClipByValue(BaseGradientClipAttr):
     Examples:
         .. code-block:: python
 
-            w_param_attrs = ParamAttr(name=None,
-              initializer=UniformInitializer(low=-1.0, high=1.0, seed=0),
+            w_param_attrs = fluid.ParamAttr(name=None,
+              initializer=fluid.initializer.UniformInitializer(low=-1.0, high=1.0, seed=0),
               learning_rate=1.0,
-              regularizer=L1Decay(1.0),
+              regularizer=fluid.regularizer.L1Decay(1.0),
               trainable=True,
-              clip=GradientClipByValue(-1.0, 1.0))
+              clip=fluid.clip.GradientClipByValue(-1.0, 1.0))
             y_predict = fluid.layers.fc(input=x, size=1, param_attr=w_param_attrs)
     """
 
@@ -185,12 +185,12 @@ class GradientClipByNorm(BaseGradientClipAttr):
     Examples:
         .. code-block:: python
 
-            w_param_attrs = ParamAttr(name=None,
-              initializer=UniformInitializer(low=-1.0, high=1.0, seed=0),
+            w_param_attrs = flui.ParamAttr(name=None,
+              initializer=fluid.initializer.UniformInitializer(low=-1.0, high=1.0, seed=0),
               learning_rate=1.0,
-              regularizer=L1Decay(1.0),
+              regularizer=fluid.regularizer.L1Decay(1.0),
               trainable=True,
-              clip=GradientClipByNorm(clip_norm=2.0))
+              clip=fluid.clip.GradientClipByNorm(clip_norm=2.0))
             y_predict = fluid.layers.fc(input=x, size=1, param_attr=w_param_attrs)
 
     """
@@ -271,7 +271,12 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):
                     "All parameters' 'clip_norm' of a same group should be the same"
                 )
 
-        square = grad * grad
+        merge_grad = grad
+        if grad.type == core.VarDesc.VarType.SELECTED_ROWS:
+            merge_grad = layers.merge_selected_rows(grad)
+            merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
+
+        square = layers.square(merge_grad)
         local_norm_var = layers.reduce_sum(input=square)
         context[self.group_name].append(local_norm_var)
 
@@ -292,6 +297,7 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):
 
         new_grad = layers.elementwise_mul(
             x=grad, y=self.context[group_scale_name])
+
         return param, new_grad
 
 
diff --git a/python/paddle/fluid/data_feed_desc.py b/python/paddle/fluid/data_feed_desc.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2ec74d6cfdeb34c1f48c086a3aa30d5100c3efb
--- /dev/null
+++ b/python/paddle/fluid/data_feed_desc.py
@@ -0,0 +1,152 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.fluid.proto import data_feed_pb2
+from google.protobuf import text_format
+
+__all__ = ['DataFeedDesc']
+
+
+class DataFeedDesc(object):
+    """
+    Datafeed descriptor, describing input training data format. This class is
+    currently only used for AsyncExecutor (See comments for class AsyncExecutor
+    for a brief introduction)
+
+    DataFeedDesc shall be initialized from a valid protobuf message from disk:
+    >>> data_feed = fluid.DataFeedDesc('data.proto')
+
+    See :code:`paddle/fluid/framework/data_feed.proto` for message definition.
+    A typical message might look like:
+
+    >>> name: "MultiSlotDataFeed"
+    >>> batch_size: 2
+    >>> multi_slot_desc {
+    >>>     slots {
+    >>>         name: "words"
+    >>>         type: "uint64"
+    >>>         is_dense: false
+    >>>         is_used: true
+    >>>     }
+    >>>     slots {
+    >>>         name: "label"
+    >>>         type: "uint64"
+    >>>         is_dense: false
+    >>>         is_used: true
+    >>>     }
+    >>> }
+
+    However, users usually shouldn't care about the message format; instead,
+    they are encouragd to use :code:`Data Generator` as a tool to generate a
+    valid data description, in the process of converting their raw log files to
+    training files acceptable to AsyncExecutor.
+
+    DataFeedDesc can also be changed during runtime. Once you got familiar with
+    what each field mean, you can modify it to better suit your need. E.g.:
+    >>> data_feed.set_batch_size(128)
+    >>> data_feed.set_dense_slots('wd')  # The slot named 'wd' will be dense
+    >>> data_feed.set_use_slots('wd')    # The slot named 'wd' will be used
+
+    Finally, the content can be dumped out for debugging purpose:
+    >>> print(data_feed.desc())
+
+    Args:
+        proto_file(string): Disk file containing a data feed description.
+    
+    """
+
+    def __init__(self, proto_file):
+        self.proto_desc = data_feed_pb2.DataFeedDesc()
+        with open(proto_file, 'r') as f:
+            text_format.Parse(f.read(), self.proto_desc)
+        if self.proto_desc.name == "MultiSlotDataFeed":
+            self.__name_to_index = {
+                slot.name: i
+                for i, slot in enumerate(self.proto_desc.multi_slot_desc.slots)
+            }
+
+    def set_batch_size(self, batch_size):
+        """
+        Set batch size. Will be effective during training
+
+        Example:
+            >>> data_feed = fluid.DataFeedDesc('data.proto')
+            >>> data_feed.set_batch_size(128)
+
+        Args:
+            batch_size: batch size
+
+        """
+        self.proto_desc.batch_size = batch_size
+
+    def set_dense_slots(self, dense_slots_name):
+        """
+        Set if a specific slot will be dense. Will be effective during training.
+        features for a dense slot will be fed into a Tensor, while those for a
+        sparse slot will be fed into a LoDTensor
+
+        Example:
+            >>> data_feed = fluid.DataFeedDesc('data.proto')
+            >>> data_feed.set_dense_slots(['words'])
+
+        Args:
+            dense_slots_name: a list of slot names which will be set dense
+
+        Note:
+            Default is sparse for all slots
+        """
+        if self.proto_desc.name != "MultiSlotDataFeed":
+            raise ValueError(
+                "Only MultiSlotDataFeed need set_dense_slots, pls check your datafeed.proto"
+            )
+        for name in dense_slots_name:
+            self.proto_desc.multi_slot_desc.slots[self.__name_to_index[
+                name]].is_dense = True
+
+    def set_use_slots(self, use_slots_name):
+        """
+        Set if a specific slot will be used for training. A dataset shall
+        contain a lot of features, through this function one can select which
+        ones will be used for a specific model.
+
+        Example:
+            >>> data_feed = fluid.DataFeedDesc('data.proto')
+            >>> data_feed.set_use_slots(['words'])
+
+        Args:
+            use_slots_name: a list of slot names which will be used in training
+
+        Note:
+            Default is not used for all slots
+        """
+        if self.proto_desc.name != "MultiSlotDataFeed":
+            raise ValueError(
+                "Only MultiSlotDataFeed need set_use_slots, pls check your datafeed.proto"
+            )
+        for name in use_slots_name:
+            self.proto_desc.multi_slot_desc.slots[self.__name_to_index[
+                name]].is_used = True
+
+    def desc(self):
+        """
+        Returns a protobuf message for this DataFeedDesc
+
+        Example:
+            >>> data_feed = fluid.DataFeedDesc('data.proto')
+            >>> print(data_feed.desc())
+
+        Returns:
+            A string message
+        """
+        return text_format.MessageToString(self.proto_desc)
diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py
index 5102a558fd3fdfd89ad769cd3a10f5dc3ea78716..13d2893fd146b5a3d9100ee1ba6c2243cb9c411b 100644
--- a/python/paddle/fluid/data_feeder.py
+++ b/python/paddle/fluid/data_feeder.py
@@ -258,10 +258,13 @@ class DataFeeder(object):
         multiple mini-batches. Each mini-batch will be feed on each device.
 
         Args:
-            reader(fun): the input data.
-            multi_devices(bool): the number of places. Default None.
-            num_places(int): the number of places. Default None.
-            drop_last(bool): the number of places. Default None.
+            reader(function): the reader is the function which can generate data.
+            multi_devices(bool): whether to use multiple devices or not.
+            num_places(int): if the multi_devices is True, you can specify the number
+                of GPU to use, if 'num_places' is None, the function will use all the
+                GPU of the current machine. Default None.
+            drop_last(bool): whether to drop the last batch if the
+                size of the last batch is less than batch_size. Default True.
 
         Returns:
             dict: the result of conversion.
diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py
index 288951cd7cd32155f136125fb817c35dd2ec6444..f2886090d75f87654b33cf7aa6f98ebf6f2e27d1 100644
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -20,7 +20,7 @@ import six
 from .framework import Program, default_main_program, Variable
 from . import core
 
-__all__ = ['Executor', 'global_scope', 'scope_guard', '_switch_scope']
+__all__ = ['Executor', 'global_scope', 'scope_guard']
 
 g_scope = core.Scope()
 
@@ -278,6 +278,7 @@ class Executor(object):
         p = core.Place()
         p.set_place(place)
         self.executor = core.Executor(p)
+
         self.program_caches = dict()
         self._closed = False
 
@@ -406,16 +407,17 @@ class Executor(object):
 
         Examples:
 
-            >>> data = layers.data(name='X', shape=[1], dtype='float32')
-            >>> hidden = layers.fc(input=data, size=10)
-            >>> layers.assign(hidden, out)
-            >>> loss = layers.mean(out)
+            >>> data = fluid.layers.data(name='X', shape=[1], dtype='float32')
+            >>> out = fluid.layers.create_tensor(dtype='float32')
+            >>> hidden = fluid.layers.fc(input=data, size=10)
+            >>> fluid.layers.assign(hidden,out)
+            >>> loss = fluid.layers.mean(out)
             >>> adam = fluid.optimizer.Adam()
-            >>> adam.minimize(loss)
+						>>> adam.minimize(loss)
 
             >>> cpu = core.CPUPlace()
-            >>> exe = Executor(cpu)
-            >>> exe.run(default_startup_program())
+            >>> exe = fluid.Executor(cpu)
+            >>> exe.run(fluid.default_startup_program())
 
             >>> x = numpy.random.random(size=(10, 1)).astype('float32')
             >>> outs = exe.run(
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index b991187d424108db176ebd6996d7d161f11dcd3d..089792059465c60da43d02e8389f4e36900c2292 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -18,6 +18,7 @@ import collections
 import contextlib
 import re
 import six
+import sys
 
 import numpy as np
 
@@ -49,6 +50,16 @@ GRAD_VAR_SUFFIX = core.kGradVarSuffix()
 ZERO_VAR_SUFFIX = core.kZeroVarSuffix()
 CONTROL_DEP_VAR_PREFIX = core.kControlDepVarName()
 
+_imperative_tracer_ = None
+
+
+def _in_imperative_mode():
+    return _imperative_tracer_ is not None
+
+
+def _imperative_tracer():
+    return _imperative_tracer_
+
 
 class NameScope(object):
     def __init__(self, name="", parent=None):
@@ -89,12 +100,13 @@ def name_scope(prefix=None):
 
     Examples:
         .. code-block:: python
+
           with name_scope("encoder"):
              ...
           with name_scope("decoder"):
              ...
-             with name_scope("attention"):
-                ...
+          with name_scope("attention"):
+             ...
     """
     # TODO(panyx0718): Only [0-9a-z].
     assert prefix, "namescope prefix cannot be empty."
@@ -344,6 +356,21 @@ class Variable(object):
         self.op = None
         self.stop_gradient = stop_gradient
         self.is_data = is_data
+        if _in_imperative_mode():
+            self._ivar = core.VarBase()
+            self._ivar.desc = self.desc
+
+    def _numpy(self):
+        scope = _imperative_tracer().get_scope(self.block.desc)
+        tensor = core.get_variable_tensor(scope, self.desc.name())
+        return np.array(tensor)
+
+    def _backward(self):
+        scope = _imperative_tracer().get_scope(self.block.desc)
+        self._ivar._run_backward(scope)
+
+    def _gradient(self):
+        return np.array(self._ivar._grad())
 
     def __str__(self):
         return self.to_string(True)
@@ -654,6 +681,23 @@ class Operator(object):
         if self._has_kernel(type):
             self.desc.infer_var_type(self.block.desc)
             self.desc.infer_shape(self.block.desc)
+        if _in_imperative_mode():
+            self.iop = core.OpBase()
+            self.iop.desc = self.desc
+            self.inputs = []
+            if inputs is not None:
+                for inp in inputs.values():
+                    if isinstance(inp, Variable):
+                        self.inputs.append(inp)
+                    elif isinstance(inp, list) or isinstance(inp, tuple):
+                        self.inputs.extend(inp[:])
+            self.outputs = []
+            if outputs is not None:
+                for out in outputs.values():
+                    if isinstance(out, Variable):
+                        self.outputs.append(out)
+                    elif isinstance(out, list) or isinstance(out, tuple):
+                        self.outputs.extend(out[:])
 
     def _has_kernel(self, op_type):
         return op_type not in self.OP_WITHOUT_KERNEL_SET
@@ -1040,19 +1084,15 @@ class Block(object):
             raise ValueError("var %s not in this block" % name)
         return v
 
-    def _var_recursive(self, name):
+    def _find_var_recursive(self, name):
         """
         Get a Variable by name from this block recursively.
 
         Args:
             name(str): the Variable's name.
 
-        Raises:
-            ValueError: this block and this parent block doesn't
-                have a Variable with the giving name.
-
         Returns:
-            Variable: the Variable with the giving name.
+            Variable: the Variable with the giving name. Or None if not found.
         """
         frontier = list()
         visited = set()
@@ -1078,8 +1118,27 @@ class Block(object):
                 frontier.append(prog.block(cur.forward_block_idx))
 
             visited.add(id(cur))
+        return None
 
-        raise ValueError("Var {0} is not found recursively".format(name))
+    def _var_recursive(self, name):
+        """
+        Get a Variable by name from this block recursively.
+
+        Args:
+            name(str): the Variable's name.
+
+        Raises:
+            ValueError: this block and this parent block doesn't
+                have a Variable with the giving name.
+
+        Returns:
+            Variable: the Variable with the giving name.
+        """
+        var = self._find_var_recursive(name)
+        if var:
+            return var
+        else:
+            raise ValueError("Var {0} is not found recursively".format(name))
 
     def all_parameters(self):
         return list(self.iter_parameters())
@@ -1205,6 +1264,9 @@ class Block(object):
         """
         op_desc = self.desc.append_op()
         op = Operator(block=self, desc=op_desc, *args, **kwargs)
+        if _in_imperative_mode():
+            _imperative_tracer().trace(op.iop, [v._ivar for v in op.inputs],
+                                       [v._ivar for v in op.outputs], self.desc)
         self.ops.append(op)
         return op
 
@@ -1441,6 +1503,7 @@ class Program(object):
         self._is_chief = False
         self._slice_vars_and_attrs = []
         self._endpoints = []
+        self._trainers_endpoints = []
         self._distributed_lookup_table = None
 
     @property
@@ -2208,3 +2271,12 @@ def _get_var(name, program=None):
     assert isinstance(program, Program)
 
     return program.global_block().var(name)
+
+
+@contextlib.contextmanager
+def _imperative_guard(tracer):
+    global _imperative_tracer_
+    tmp_trace = _imperative_tracer_
+    _imperative_tracer_ = tracer
+    yield
+    _imperative_tracer_ = tmp_trace
diff --git a/python/paddle/fluid/imperative/__init__.py b/python/paddle/fluid/imperative/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..922308b6b18b335535d41f24d544cde04991b794
--- /dev/null
+++ b/python/paddle/fluid/imperative/__init__.py
@@ -0,0 +1,25 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+from . import base
+from .base import *
+
+from . import layers
+from .layers import *
+
+__all__ = []
+__all__ += layers.__all__
+__all__ += base.__all__
diff --git a/python/paddle/fluid/imperative/base.py b/python/paddle/fluid/imperative/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..15d38ddb56c71ef7de67f79cf52cd26070f470cb
--- /dev/null
+++ b/python/paddle/fluid/imperative/base.py
@@ -0,0 +1,56 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import contextlib
+import numpy as np
+
+from paddle.fluid import core
+from paddle.fluid import framework
+
+__all__ = ['enabled', 'guard', 'to_variable']
+
+
+def enabled():
+    return framework._in_imperative_mode()
+
+
+@contextlib.contextmanager
+def guard():
+    train = framework.Program()
+    startup = framework.Program()
+    tracer = core.Tracer(train.current_block().desc)
+    with framework.program_guard(train, startup):
+        with framework.unique_name.guard():
+            with framework._imperative_guard(tracer):
+                yield
+
+
+def to_variable(value, block=None):
+    if isinstance(value, np.ndarray):
+        if not block:
+            block = framework.default_main_program().current_block()
+        py_var = framework.Variable(
+            block,
+            type=core.VarDesc.VarType.LOD_TENSOR,
+            name=None,
+            shape=value.shape,
+            dtype=value.dtype)
+        scope = framework._imperative_tracer().get_scope(block.desc)
+        var = scope.var(py_var.name)
+        tensor = var.get_tensor()
+        tensor.set(value, core.CPUPlace())
+        return py_var
+    elif isinstance(value, framework.Variable):
+        return value
+    else:
+        raise ValueError("Unsupported type %s" % type(value))
diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a28f7f4ae35295394b560d79e3dc0cdd5f2beab
--- /dev/null
+++ b/python/paddle/fluid/imperative/layers.py
@@ -0,0 +1,44 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import contextlib
+import sys
+import numpy as np
+
+from paddle.fluid import core
+from paddle.fluid import framework
+from paddle.fluid.imperative import base
+
+__all__ = ['PyLayer']
+
+
+class PyLayer(core.Layer):
+    def __init__(self):
+        pass
+
+    def __call__(self, inputs):
+        # TODO(panyx0718): Support declarative mode as well.
+        assert base.enabled()
+        if not isinstance(inputs, list) and not isinstance(inputs, tuple):
+            inputs = [inputs]
+
+        var_inputs = []
+        for x in inputs:
+            py_var = base.to_variable(x)
+            var_inputs.append(py_var)
+        outputs = self.forward(var_inputs)
+        return outputs
+
+    def forward(self, inputs):
+        return []
diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py
index a26b8df5a240be8340597b9627866c323fa98a2d..b37ebbe5179ba6e36be70ff936cb8a3ca0d89d13 100644
--- a/python/paddle/fluid/initializer.py
+++ b/python/paddle/fluid/initializer.py
@@ -33,13 +33,15 @@ def force_init_on_cpu():
     """
     The flag of whether force to init variables on CPU.
 
-    Returns::
+    Returns:
+        bool: the state if we should force init on CPU.
 
     Examples:
+
         .. code-block:: python
 
             if force_init_on_cpu():
-                pass
+                create_op('force_cpu': force_init_on_cpu())
 
     """
     return _force_init_on_cpu_
diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index 0782933c6c4851b410ee3fdf14d4f9d9e83d49cc..e74a87fc68db0e126098f7188db4a712dff2612d 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -145,7 +145,7 @@ def save_vars(executor,
 
             prog = fluid.default_main_program()
             fluid.io.save_vars(executor=exe, dirname=path, main_program=prog,
-                               vars=None)
+                               vars=None, predicate = name_has_fc)
             # All variables in `main_program` whose name includes "fc" will be saved.
             # And variables are going to be saved separately.
 
@@ -369,7 +369,7 @@ def load_vars(executor,
 
             prog = fluid.default_main_program()
             fluid.io.load_vars(executor=exe, dirname=path, main_program=prog,
-                               vars=None)
+                               vars=None, predicate=name_has_fc)
             # All variables in `main_program` whose name includes "fc" will be loaded.
             # And all the variables are supposed to have been saved in differnet files.
 
diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py
index dc317de9abbd06f4021e64b87ea88ba6af8809c9..74b4a977db6b69d4d256e1f7b36eb53524269bb1 100644
--- a/python/paddle/fluid/layer_helper.py
+++ b/python/paddle/fluid/layer_helper.py
@@ -17,10 +17,13 @@ from __future__ import print_function
 import copy
 import itertools
 import six
+import sys
+import numpy as np
 
 from .framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating
 from . import unique_name
 from paddle.fluid.initializer import Constant, Xavier
+from paddle.fluid.imperative import base
 from .param_attr import ParamAttr, WeightNormParamAttr
 from . import core
 from six.moves import zip
@@ -46,23 +49,21 @@ class LayerHelper(object):
     def startup_program(self):
         return default_startup_program()
 
+    def to_variable(self, x):
+        return base.to_variable(x, self.main_program.current_block())
+
     def append_op(self, *args, **kwargs):
         return self.main_program.current_block().append_op(*args, **kwargs)
 
     def multiple_input(self, input_param_name='input'):
         inputs = self.kwargs.get(input_param_name, [])
-        type_error = TypeError(
-            "Input of {0} layer should be Variable or sequence of Variable".
-            format(self.layer_type))
-        if isinstance(inputs, Variable):
-            inputs = [inputs]
-        elif not isinstance(inputs, list) and not isinstance(inputs, tuple):
-            raise type_error
+        ret = []
+        if isinstance(inputs, list) or isinstance(inputs, tuple):
+            for inp in inputs:
+                ret.append(self.to_variable(inp))
         else:
-            for each in inputs:
-                if not isinstance(each, Variable):
-                    raise type_error
-        return inputs
+            ret.append(self.to_variable(inputs))
+        return ret
 
     def input(self, input_param_name='input'):
         inputs = self.multiple_input(input_param_name)
diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index 05138bf94598f649ef7fdbaa94896b6ba0884416..b7e39685691809d04ecddc21d2d04a7a85e478d5 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -717,8 +717,9 @@ class While(object):
 
         out_vars = []
         for inner_out_name in inner_outputs:
-            if inner_out_name in parent_block.vars:
-                out_vars.append(parent_block.var(inner_out_name))
+            inner_var = parent_block._find_var_recursive(inner_out_name)
+            if inner_var:
+                out_vars.append(inner_var)
 
         step_scope = parent_block.create_var(
             type=core.VarDesc.VarType.STEP_SCOPES)
@@ -1264,10 +1265,11 @@ class ConditionalBlock(object):
             if each_name not in input_set
         ]
 
-        out_list = [
-            parent_block.var(var_name) for var_name in parent_block.vars
-            if var_name in intermediate
-        ]
+        out_list = []
+        for inner_out_name in intermediate:
+            inner_var = parent_block._find_var_recursive(inner_out_name)
+            if inner_var:
+                out_list.append(inner_var)
 
         step_scope = parent_block.create_var(
             type=core.VarDesc.VarType.STEP_SCOPES)
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index 4843af8340310e0f47964d41708b13216fcd2161..ce731f39ea099a4d8948812989ad19b3cce119ff 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 from .layer_function_generator import generate_layer_fn
 from .layer_function_generator import autodoc, templatedoc
 from ..layer_helper import LayerHelper
+from ..framework import Variable
 from . import tensor
 from . import nn
 from . import ops
@@ -46,6 +47,7 @@ __all__ = [
     'iou_similarity',
     'box_coder',
     'polygon_box_transform',
+    'yolov3_loss',
 ]
 
 
@@ -401,6 +403,113 @@ def polygon_box_transform(input, name=None):
     return output
 
 
+@templatedoc(op_type="yolov3_loss")
+def yolov3_loss(x,
+                gtbox,
+                gtlabel,
+                anchors,
+                class_num,
+                ignore_thresh,
+                loss_weight_xy=None,
+                loss_weight_wh=None,
+                loss_weight_conf_target=None,
+                loss_weight_conf_notarget=None,
+                loss_weight_class=None,
+                name=None):
+    """
+    ${comment}
+
+    Args:
+        x (Variable): ${x_comment}
+        gtbox (Variable): groud truth boxes, should be in shape of [N, B, 4],
+                          in the third dimenstion, x, y, w, h should be stored 
+                          and x, y, w, h should be relative value of input image.
+                          N is the batch number and B is the max box number in 
+                          an image.
+        gtlabel (Variable): class id of ground truth boxes, shoud be ins shape
+                            of [N, B].
+        anchors (list|tuple): ${anchors_comment}
+        class_num (int): ${class_num_comment}
+        ignore_thresh (float): ${ignore_thresh_comment}
+        loss_weight_xy (float|None): ${loss_weight_xy_comment}
+        loss_weight_wh (float|None): ${loss_weight_wh_comment}
+        loss_weight_conf_target (float|None): ${loss_weight_conf_target_comment}
+        loss_weight_conf_notarget (float|None): ${loss_weight_conf_notarget_comment}
+        loss_weight_class (float|None): ${loss_weight_class_comment}
+        name (string): the name of yolov3 loss
+
+    Returns:
+        Variable: A 1-D tensor with shape [1], the value of yolov3 loss
+
+    Raises:
+        TypeError: Input x of yolov3_loss must be Variable
+        TypeError: Input gtbox of yolov3_loss must be Variable"
+        TypeError: Input gtlabel of yolov3_loss must be Variable"
+        TypeError: Attr anchors of yolov3_loss must be list or tuple
+        TypeError: Attr class_num of yolov3_loss must be an integer
+        TypeError: Attr ignore_thresh of yolov3_loss must be a float number
+
+    Examples:
+    .. code-block:: python
+
+        x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32')
+        gtbox = fluid.layers.data(name='gtbox', shape=[6, 5], dtype='float32')
+        gtlabel = fluid.layers.data(name='gtlabel', shape=[6, 1], dtype='int32')
+        anchors = [10, 13, 16, 30, 33, 23]
+        loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, class_num=80
+                                        anchors=anchors, ignore_thresh=0.5)
+    """
+    helper = LayerHelper('yolov3_loss', **locals())
+
+    if not isinstance(x, Variable):
+        raise TypeError("Input x of yolov3_loss must be Variable")
+    if not isinstance(gtbox, Variable):
+        raise TypeError("Input gtbox of yolov3_loss must be Variable")
+    if not isinstance(gtlabel, Variable):
+        raise TypeError("Input gtlabel of yolov3_loss must be Variable")
+    if not isinstance(anchors, list) and not isinstance(anchors, tuple):
+        raise TypeError("Attr anchors of yolov3_loss must be list or tuple")
+    if not isinstance(class_num, int):
+        raise TypeError("Attr class_num of yolov3_loss must be an integer")
+    if not isinstance(ignore_thresh, float):
+        raise TypeError(
+            "Attr ignore_thresh of yolov3_loss must be a float number")
+
+    if name is None:
+        loss = helper.create_variable_for_type_inference(dtype=x.dtype)
+    else:
+        loss = helper.create_variable(
+            name=name, dtype=x.dtype, persistable=False)
+
+    attrs = {
+        "anchors": anchors,
+        "class_num": class_num,
+        "ignore_thresh": ignore_thresh,
+    }
+
+    if loss_weight_xy is not None and isinstance(loss_weight_xy, float):
+        self.attrs['loss_weight_xy'] = loss_weight_xy
+    if loss_weight_wh is not None and isinstance(loss_weight_wh, float):
+        self.attrs['loss_weight_wh'] = loss_weight_wh
+    if loss_weight_conf_target is not None and isinstance(
+            loss_weight_conf_target, float):
+        self.attrs['loss_weight_conf_target'] = loss_weight_conf_target
+    if loss_weight_conf_notarget is not None and isinstance(
+            loss_weight_conf_notarget, float):
+        self.attrs['loss_weight_conf_notarget'] = loss_weight_conf_notarget
+    if loss_weight_class is not None and isinstance(loss_weight_class, float):
+        self.attrs['loss_weight_class'] = loss_weight_class
+
+    helper.append_op(
+        type='yolov3_loss',
+        inputs={"X": x,
+                "GTBox": gtbox,
+                "GTLabel": gtlabel},
+        outputs={'Loss': loss},
+        attrs=attrs)
+    return loss
+
+
 @templatedoc()
 def detection_map(detect_res,
                   label,
diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py
index 3f47053961bcc41b82f1b6776e9365166e78ddbf..42f4959a83fe113d6cbbe0db355249a9c203d602 100644
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -943,7 +943,18 @@ def __create_unshared_decorated_reader__(op_type, reader, attrs, name=None):
 
 def shuffle(reader, buffer_size):
     """
-    Shuffle the reader.
+    Creates a data reader whose data output is shuffled.
+    Output from the iterator that created by original reader will be
+    buffered into shuffle buffer, and then shuffled. The size of shuffle buffer
+    is determined by argument buf_size.
+
+    Args:
+        param reader: the original reader whose output will be shuffled.
+        type reader: callable
+        param buf_size: shuffle buffer size.
+        type buf_size: int
+        return: the new reader whose output is shuffled.
+        rtype: callable
     """
     return __create_unshared_decorated_reader__(
         'create_shuffle_reader', reader, {'buffer_size': int(buffer_size)})
diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py
index eea0a362a0c31083f304a2167d0fdadfb30fb640..09b1b30216b03e71253ca8da1d462db897e1a607 100644
--- a/python/paddle/fluid/layers/layer_function_generator.py
+++ b/python/paddle/fluid/layers/layer_function_generator.py
@@ -20,7 +20,7 @@ import string
 
 from six.moves import cStringIO
 from ..proto import framework_pb2
-from ..framework import OpProtoHolder, Variable
+from ..framework import OpProtoHolder, Variable, core, convert_np_dtype_to_dtype_
 from ..layer_helper import LayerHelper
 
 __all__ = [
@@ -178,6 +178,15 @@ def generate_layer_fn(op_type):
                         "operator {0} must input same dtype. {1} vs {2}".format(
                             op_type, dtype, each.dtype))
 
+        if dtype is None:
+            arg_dtype = kwargs.get("dtype")
+            if arg_dtype:
+                if not isinstance(arg_dtype, core.VarDesc.VarType):
+                    dtype = convert_np_dtype_to_dtype_(arg_dtype)
+                else:
+                    dtype = arg_dtype
+            else:
+                dtype = core.VarDesc.VarType.FP32
         return dtype
 
     def func(*args, **kwargs):
diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py
index 149224bb68ac869dec14ac9f953f0072bd24c7e2..dde05189722fef77e03a1c2d8f3cbae44a3e8245 100644
--- a/python/paddle/fluid/layers/learning_rate_scheduler.py
+++ b/python/paddle/fluid/layers/learning_rate_scheduler.py
@@ -308,13 +308,9 @@ def piecewise_decay(boundaries, values):
 
 
 def append_LARS(params_grads, learning_rate, weight_decay):
-    """Applies LARS (LAYER-WISE ADAPTIVE RATE SCALING) to learning rate for
-       each layer.
-
-    ```python
-        learning_rate *= local_gw_ratio * sqrt(sumsq(param))
-                        / (sqrt(sumsq(gradient))+ weight_decay * sqrt(sumsq(param)))
-    ```
+    """
+    Applies LARS (LAYER-WISE ADAPTIVE RATE SCALING) to learning rate for
+    each layer.
 
     Args:
         learning_rate: A learning rate Variable. This
@@ -323,6 +319,11 @@ def append_LARS(params_grads, learning_rate, weight_decay):
 
     Returns:
         The decayed learning rate
+    Examples:
+        .. code-block:: python
+        
+            learning_rate *= local_gw_ratio * sqrt(sumsq(param))
+                        / (sqrt(sumsq(gradient))+ weight_decay * sqrt(sumsq(param)))
     """
 
     def _balanced_weight(param_norm, grad_norm):
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index fb1ae7b753d69447b00635c0a6c0ae8f040f5ad9..bc8e3e8a3c55b89b8162f766f91fde13729fd23d 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -41,6 +41,7 @@ __all__ = [
     'crf_decoding',
     'cos_sim',
     'cross_entropy',
+    'bpr_loss',
     'square_error_cost',
     'chunk_eval',
     'sequence_conv',
@@ -169,9 +170,15 @@ __all__ = [
     'log_loss',
     'add_position_encoding',
     'bilinear_tensor_product',
+    'merge_selected_rows',
+    'get_tensor_from_selected_rows',
+    'lstm',
+    'psroi_pool',
     'huber_regression_loss',
 ]
 
+kIgnoreIndex = -100
+
 
 def fc(input,
        size,
@@ -327,6 +334,11 @@ def embedding(input,
     """
 
     helper = LayerHelper('embedding', **locals())
+    remote_prefetch = False
+    if os.environ.get('PADDLE_ENABLE_REMOTE_PREFETCH'):
+        remote_prefetch = True
+    if remote_prefetch:
+        assert is_sparse is True and is_distributed is False
     w = helper.create_parameter(
         attr=helper.param_attr, shape=size, dtype=dtype, is_bias=False)
     tmp = helper.create_variable_for_type_inference(dtype)
@@ -340,6 +352,7 @@ def embedding(input,
         attrs={
             'is_sparse': is_sparse,
             'is_distributed': is_distributed,
+            'remote_prefetch': remote_prefetch,
             'padding_idx': padding_idx
         })
     return tmp
@@ -467,6 +480,168 @@ def dynamic_lstm(input,
     return hidden, cell
 
 
+def lstm(input,
+         init_h,
+         init_c,
+         max_len,
+         hidden_size,
+         num_layers,
+         dropout_prob=0.0,
+         is_bidirec=False,
+         is_test=False,
+         name=None,
+         default_initializer=None,
+         seed=-1):
+    """
+    If Device is GPU, This op will use cudnn LSTM implementation
+
+    A four-gate Long Short-Term Memory network with no peephole connections.
+    In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1,
+    the cell input ct-1 and the previous layer input xt given matrices W, R and biases bW, bR from the following equations:
+
+    $$ i_t = \\sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i) $$
+
+    $$ f_t = \\sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f) $$
+
+    $$ o_t = \\sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o) $$
+
+    $$ \\tilde{c_t} = tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c) $$
+
+    $$ c_t = f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t} $$
+
+    $$ h_t = o_t \\odot tanh(c_t) $$
+
+    - W terms denote weight matrices (e.g. $W_{ix}$ is the matrix
+      of weights from the input gate to the input)
+    - The b terms denote bias vectors ($bx_i$ and $bh_i$ are the input gate bias vector).
+    - sigmoid is the logistic sigmoid function.
+    - $i, f, o$ and $c$ are the input gate, forget gate, output gate,
+      and cell activation vectors, respectively, all of which have the same size as
+      the cell output activation vector $h$.
+    - The $\odot$ is the element-wise product of the vectors.
+    - `tanh` is the activation functions.
+    - $\tilde{c_t}$ is also called candidate hidden state,
+      which is computed based on the current input and the previous hidden state.
+
+    Where sigmoid is the sigmoid operator: sigmoid(x) = 1 / (1 + e^-x), * represents a point-wise multiplication,
+    X represensts a matrix multiplication
+
+
+    Args:
+        input (Variable): LSTM input tensor, shape MUST be ( seq_len x batch_size x input_size )
+        init_h(Variable): The initial hidden state of the LSTM
+                       This is a tensor with shape ( num_layers x batch_size x hidden_size)
+                       if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size)
+        init_c(Variable): The initial cell state of the LSTM.
+                       This is a tensor with shape ( num_layers x batch_size x hidden_size )
+                       if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size)
+        max_len (int): max length of LSTM. the first dim of input tensor CAN NOT greater than max_len
+        hidden_size (int): hidden size of the LSTM
+        num_layers (int): total layers number of the LSTM
+        dropout_prob(float|0.0): dropout prob, dropout ONLY work between rnn layers, NOT between time steps
+                             There is NO dropout work on rnn output of the last RNN layers
+        is_bidirec (bool): If it is bidirectional
+        is_test (bool): If it is in test phrase
+        name (str|None): A name for this layer(optional). If set None, the layer
+                         will be named automatically.
+        default_initializer(Initialize|None): Where use initializer to initialize the Weight
+                         If set None, defaule initializer will be used
+        seed(int): Seed for dropout in LSTM, If it's -1, dropout will use random seed
+
+
+    Returns:
+        rnn_out(Tensor): result of LSTM hidden, shape is (seq_len x batch_size x hidden_size)
+                         if is_bidirec set to True, shape will be ( seq_len x batch_sze x hidden_size*2)
+        last_h(Tensor): the hidden state of the last step of LSTM
+                        shape is ( num_layers x batch_size x hidden_size )
+                        if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size)
+        last_c(Tensor): the cell state of the last step of LSTM
+                        shape is ( num_layers x batch_size x hidden_size )
+                        if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size)
+
+
+    Examples:
+        .. code-block:: python
+
+            input = embedding
+            batch_size = 20
+            max_len = 100
+            dropout_prob = 0.2
+            input_size = 100
+            hidden_size = 150
+            num_layers = 1
+            init_hidden1 = layers.fill_constant( [num_layers, batch_size, hidden_size], 'float32', 0.0, stop_grad=False)
+            init_cell1 = layers.fill_constant( [num_layers, batch_size, hidden_size], 'float32', 0.0, stop_grad=False)
+
+            rnn_out, last_h, last_c = layers.lstm( input, init_h, init_c, \
+                    max_len, dropout_prob, input_size, hidden_size, \
+                    num_layers)
+    """
+
+    helper = LayerHelper('cudnn_lstm', **locals())
+
+    dtype = input.dtype
+    input_shape = list(input.shape)
+    input_size = input_shape[-1]
+    weight_size = 0
+    for i in range(num_layers):
+        if i == 0:
+            input_weight_size = (input_size * hidden_size) * 4
+        else:
+            if is_bidirec:
+                input_weight_size = (hidden_size * 2 * hidden_size) * 4
+            else:
+                input_weight_size = (hidden_size * hidden_size) * 4
+
+        hidden_weight_size = (hidden_size * hidden_size) * 4
+
+        if is_bidirec:
+            weight_size += (input_weight_size + hidden_weight_size) * 2
+            weight_size += hidden_size * 8 * 2
+        else:
+            weight_size += input_weight_size + hidden_weight_size
+            weight_size += hidden_size * 8
+
+    weight = helper.create_parameter(
+        attr=helper.param_attr,
+        shape=[weight_size],
+        dtype=dtype,
+        default_initializer=default_initializer)
+
+    out = helper.create_variable_for_type_inference(dtype)
+    last_h = helper.create_variable_for_type_inference(dtype)
+    last_c = helper.create_variable_for_type_inference(dtype)
+
+    cache = helper.create_variable(
+        persistable=True, type=core.VarDesc.VarType.RAW, stop_gradient=True)
+
+    helper.append_op(
+        type='cudnn_lstm',
+        inputs={
+            'Input': input,
+            'InitH': init_h,
+            'InitC': init_c,
+            'W': weight,
+            'Cache': cache,
+        },
+        outputs={
+            'Out': out,
+            'last_h': last_h,
+            'last_c': last_c,
+        },
+        attrs={
+            'max_len': max_len,
+            'is_bidirec': is_bidirec,
+            'input_size': input_size,
+            'hidden_size': hidden_size,
+            'num_layers': num_layers,
+            'is_test': is_test,
+            'dropout_prob': dropout_prob,
+            'seed': seed,
+        })
+    return out, last_h, last_c
+
+
 def dynamic_lstmp(input,
                   size,
                   proj_size,
@@ -758,7 +933,7 @@ def dynamic_gru(input,
             emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim])
             hidden_dim = 512
             x = fluid.layers.fc(input=emb, size=hidden_dim * 3)
-            hidden = fluid.layers.dynamic_gru(input=x, dim=hidden_dim)
+            hidden = fluid.layers.dynamic_gru(input=x, size=hidden_dim)
     """
 
     helper = LayerHelper('gru', **locals())
@@ -1099,7 +1274,7 @@ def dropout(x,
     return out
 
 
-def cross_entropy(input, label, soft_label=False, ignore_index=-100):
+def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex):
     """
     **Cross Entropy Layer**
 
@@ -1146,7 +1321,7 @@ def cross_entropy(input, label, soft_label=False, ignore_index=-100):
                                            labels. Default: `False`.
         ignore_index (int): Specifies a target value that is ignored and does
                             not contribute to the input gradient. Only valid
-                            if soft_label is set to False. Default: -100
+                            if soft_label is set to False. Default: kIgnoreIndex
 
     Returns:
          A 2-D tensor with shape [N x 1], the cross entropy loss.
@@ -1176,6 +1351,44 @@ def cross_entropy(input, label, soft_label=False, ignore_index=-100):
     return out
 
 
+def bpr_loss(input, label, name=None):
+    """
+    Bayesian Personalized Ranking Loss Operator.
+
+    This operator belongs to pairwise ranking loss. Label is the desired item.
+    The loss at a given point in one session is defined as:
+    $Y[i] = -\frac{1}{N_{i}-1} * \sum_{0\le j<N_{i},~ j\neq Label[i]}\log(\sigma(X[i, Label[i]]-X[i, j]))$
+
+    Learn more details by reading paper <session-based recommendations with recurrent
+    neural networks>(https://arxiv.org/abs/1511.06939)
+
+    Args:
+        input (Variable|list):  a 2-D tensor with shape [N x D], where N is the
+                                batch size and D is the number of classes.
+                                This input is not probability but logits.
+        label (Variable|list):  the ground truth which is a 2-D tensor.  `label`
+                                is a tensor<int64> with shape [N x 1].
+        name (str|None):        A name for this layer(optional). If set None, the
+                                layer will be named automatically. Default: None.
+    Returns:
+        A 2-D tensor with shape [N x 1], the bpr loss.
+
+    Examples:
+        .. code-block:: python
+
+          cost = fluid.layers.bpr_loss(input=predict, label=label)
+    """
+
+    helper = LayerHelper('bpr_loss', **locals())
+    out = helper.create_variable_for_type_inference(dtype=input.dtype)
+    helper.append_op(
+        type='bpr_loss',
+        inputs={'X': [input],
+                'Label': [label]},
+        outputs={'Y': [out]})
+    return out
+
+
 def square_error_cost(input, label):
     """
     **Square error cost layer**
@@ -2301,7 +2514,8 @@ def batch_norm(input,
                moving_mean_name=None,
                moving_variance_name=None,
                do_model_average_for_mean_and_var=False,
-               fuse_with_relu=False):
+               fuse_with_relu=False,
+               use_global_stats=False):
     """
     **Batch Normalization Layer**
 
@@ -2328,6 +2542,19 @@ def batch_norm(input,
         \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\
         y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
 
+
+    When use_global_stats = True, the :math:`\\mu_{\\beta}`
+    and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch.
+    They are global (or running) statistics. (It usually got from the
+    pre-trained model.)
+    The training and testing (or inference) have the same behavior:
+
+    ..  math::
+
+        \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
+        \\sigma_{\\beta}^{2} + \\epsilon}}  \\\\
+        y_i &\\gets \\gamma \\hat{x_i} + \\beta
+
     Args:
         input(variable): The input variable which is a LoDTensor.
         act(string, Default None): Activation type, linear|relu|prelu|...
@@ -2350,6 +2577,11 @@ def batch_norm(input,
         moving_variance_name(string, Default None): The name of the moving_variance which store the global Variance.
         do_model_average_for_mean_and_var(bool, Default False): Do model average for mean and variance or not.
         fuse_with_relu (bool): if True, this OP performs relu after batch norm.
+        use_global_stats(bool, Default False): Whether to use global mean and
+            variance. In inference or test mode, set use_global_stats to true
+            or is_test to true, and the behavior is equivalent.
+            In train mode, when setting use_global_stats True, the global mean
+            and variance are also used during train period.
 
     Returns:
         Variable: A tensor variable which is the result after applying batch normalization on the input.
@@ -2382,9 +2614,15 @@ def batch_norm(input,
         shape=param_shape,
         dtype=dtype,
         default_initializer=Constant(1.0))
+    # setting stop_gradient=True to reduce computation
+    if use_global_stats and helper.param_attr.learning_rate == 0.:
+        scale.stop_gradient = True
 
     bias = helper.create_parameter(
         attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True)
+    # setting stop_gradient=True to reduce computation
+    if use_global_stats and helper.bias_attr.learning_rate == 0.:
+        scale.stop_gradient = True
 
     mean = helper.create_parameter(
         attr=ParamAttr(
@@ -2440,7 +2678,8 @@ def batch_norm(input,
             "epsilon": epsilon,
             "is_test": is_test,
             "use_mkldnn": False,
-            "fuse_with_relu": fuse_with_relu
+            "fuse_with_relu": fuse_with_relu,
+            "use_global_stats": use_global_stats
         })
 
     return helper.append_activation(batch_norm_out)
@@ -3390,6 +3629,7 @@ def beam_search_decode(ids, scores, beam_size, end_id, name=None):
 
     Examples:
         .. code-block:: python
+
             # Suppose `ids` and `scores` are LodTensorArray variables reserving
             # the selected ids and scores of all steps
             finished_ids, finished_scores = layers.beam_search_decode(
@@ -4220,7 +4460,14 @@ def ctc_greedy_decoder(input, blank, name=None):
 
         input.lod = [[4, 4]]
 
-        Then:
+        Computation:
+
+        step1: Apply argmax to first input sequence which is input.data[0:4]. Then we get:
+               [[0], [2], [1], [0]]
+        step2: merge repeated tokens and remove blank which is 0. Then we get first output sequence:
+               [[2], [1]]
+
+        Finally:
 
         output.data = [[2],
                        [1],
@@ -4228,6 +4475,7 @@ def ctc_greedy_decoder(input, blank, name=None):
 
         output.lod = [[2, 1]]
 
+
     Args:
 
         input(Variable): (LoDTensor<float>), the probabilities of
@@ -4242,8 +4490,10 @@ def ctc_greedy_decoder(input, blank, name=None):
         name (str): The name of this layer. It is optional.
 
     Returns:
-        Variable: CTC greedy decode result. If all the sequences in result were
-        empty, the result LoDTensor will be [-1] with LoD [[]] and dims [1, 1].
+        Variable: CTC greedy decode result which is a 2-D tensor with shape [Lp, 1].
+                  'Lp' is the sum if all output sequences' length. If all the sequences
+                  in result were empty, the result LoDTensor will be [-1] with
+                  LoD [[]] and dims [1, 1].
 
     Examples:
         .. code-block:: python
@@ -4877,7 +5127,7 @@ def im2sequence(input,
 
             output.lod = [[4, 4]]
 
-     Examples:
+    Examples:
 
         .. code-block:: python
 
@@ -4981,7 +5231,7 @@ def multiplex(inputs, index):
 def softmax_with_cross_entropy(logits,
                                label,
                                soft_label=False,
-                               ignore_index=-100,
+                               ignore_index=kIgnoreIndex,
                                numeric_stable_mode=False,
                                return_softmax=False):
     """
@@ -5039,7 +5289,7 @@ def softmax_with_cross_entropy(logits,
             labels as soft labels. By default, `soft_label` is set to False.
         ignore_index (int): Specifies a target value that is ignored and does
                             not contribute to the input gradient. Only valid
-                            if soft_label is set to False. Default: -100
+                            if soft_label is set to False. Default: kIgnoreIndex
         numeric_stable_mode (bool): A flag to indicate whether to use a more
                                     numerically stable algorithm. Only valid
                                     when soft_label is False and GPU is used.
@@ -5664,24 +5914,23 @@ def pad_constant_like(x, y, pad_value=0., name=None):
                   [[38, 39, 40]],
                   [[41, 42, 43]]]]
             Y.shape = (1, 3, 1, 3)
+		And
+            pad_value = -1,
 
-    And
-        pad_value = -1,
-
-    Return:
-        Out = [[[[35, 36, 37],
-                  [-1, -1, -1]],
-                [[38, 39, 40],
-                  [-1, -1, -1]],
-                 [[41, 42, 43],
-                  [-1, -1, -1]]],
-                [[[-1, -1, -1],
-                  [-1, -1, -1]],
-                 [[-1, -1, -1],
-                  [-1, -1, -1]],
-                 [[-1, -1, -1],
-                  [-1, -1, -1]]]]
-        Out.shape = (2, 3, 2, 3)
+        Return:
+            Out = [[[[35, 36, 37],
+                     [-1, -1, -1]],
+                    [[38, 39, 40],
+                     [-1, -1, -1]],
+                    [[41, 42, 43],
+                     [-1, -1, -1]]],
+                  [[[-1, -1, -1],
+                    [-1, -1, -1]],
+                   [[-1, -1, -1],
+                    [-1, -1, -1]],
+                   [[-1, -1, -1],
+                    [-1, -1, -1]]]]
+            Out.shape = (2, 3, 2, 3)
 
     Args:
         x (Variable): The input tensor variable.
@@ -5920,6 +6169,7 @@ def image_resize(input,
     Supporting resample methods:
 
         'BILINEAR' : Bilinear interpolation
+
         'NEAREST' : Nearest neighbor interpolation
 
     Args:
@@ -6414,7 +6664,8 @@ def relu(x, name=None):
     helper = LayerHelper('relu', **locals())
     dtype = helper.input_dtype(input_param_name='x')
     out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(type="relu", inputs={"X": x}, outputs={"Out": out})
+    helper.append_op(
+        type="relu", inputs={"X": helper.input('x')}, outputs={"Out": out})
     return out
 
 
@@ -6575,7 +6826,7 @@ def crop(x, shape=None, offsets=None, name=None):
 
             # or
             z = fluid.layers.data(name="z", shape=[3, 5], dtype="float32")
-            crop = fluid.layers.crop(z, shape=[2, 3])
+            crop = fluid.layers.crop(z, shape=[-1, 2, 3])
 
     """
     helper = LayerHelper('crop', **locals())
@@ -6856,44 +7107,45 @@ def pad2d(input,
     than height-1. And the width dimension has the same condition.
 
     Example:
+        .. code-block:: text
 
-      Given that X is a channel of image from input:
+	      Given that X is a channel of image from input:
 
-      X = [[1, 2, 3],
-           [4, 5, 6]]
+	      X = [[1, 2, 3],
+		   [4, 5, 6]]
 
-      Case 0:
+	      Case 0:
 
-        paddings = [0, 1, 2, 3],
-        mode = 'constant'
-        pad_value = 0
+		paddings = [0, 1, 2, 3],
+		mode = 'constant'
+		pad_value = 0
 
-        Out = [[0, 0, 1, 2, 3, 0, 0, 0]
-               [0, 0, 4, 5, 6, 0, 0, 0]
-               [0, 0, 0, 0, 0, 0, 0, 0]]
+		Out = [[0, 0, 1, 2, 3, 0, 0, 0]
+		       [0, 0, 4, 5, 6, 0, 0, 0]
+		       [0, 0, 0, 0, 0, 0, 0, 0]]
 
-      Case 1:
+	      Case 1:
 
-        paddings = [0, 1, 2, 1],
-        mode = 'reflect'
+		paddings = [0, 1, 2, 1],
+		mode = 'reflect'
 
-        Out = [[3, 2, 1, 2, 3, 2]
-               [6, 5, 4, 5, 6, 5]
-               [3, 2, 1, 2, 3, 2]]
+		Out = [[3, 2, 1, 2, 3, 2]
+		       [6, 5, 4, 5, 6, 5]
+		       [3, 2, 1, 2, 3, 2]]
 
-      Case 2:
+	      Case 2:
 
-        paddings = [0, 1, 2, 1],
-        mode = 'edge'
+		paddings = [0, 1, 2, 1],
+		mode = 'edge'
 
-        Out = [[1, 1, 1, 2, 3, 3]
-               [4, 4, 4, 5, 6, 6]
-               [4, 4, 4, 5, 6, 6]]
+		Out = [[1, 1, 1, 2, 3, 3]
+		       [4, 4, 4, 5, 6, 6]
+		       [4, 4, 4, 5, 6, 6]]
 
 
     Args:
         input (Variable): The input image with [N, C, H, W] format or [N, H, W, C] format.
-        paddings (tuple|list): The padding size. If padding is a tuple, it must
+        paddings (tuple|list|Variable): The padding size. If padding is a tuple, it must
             contain four integers, (padding_top, padding_bottom, padding_left, padding_right).
             Default: padding = [0, 0, 0, 0].
         mode (str): Three modes: constant(default), reflect, edge. Default: constant
@@ -6918,16 +7170,17 @@ def pad2d(input,
     helper = LayerHelper('pad2d', **locals())
     dtype = helper.input_dtype(input_param_name='input')
     out = helper.create_variable_for_type_inference(dtype)
+    inputs = {'X': input}
+    attrs = {'mode': mode, 'pad_value': pad_value, 'data_format': data_format}
+
+    if isinstance(paddings, Variable):
+        inputs['Paddings'] = paddings
+        attrs['paddings'] = []
+    else:
+        attrs['paddings'] = paddings
+
     helper.append_op(
-        type='pad2d',
-        inputs={'X': input},
-        outputs={"Out": out},
-        attrs={
-            'paddings': paddings,
-            'mode': mode,
-            'pad_value': pad_value,
-            'data_frmat': data_format
-        })
+        type='pad2d', inputs=inputs, outputs={"Out": out}, attrs=attrs)
 
     return out
 
@@ -7125,13 +7378,13 @@ def prelu(x, mode, param_attr=None, name=None):
     Args:
         x (Variable): The input tensor.
         param_attr(ParamAttr|None): The parameter attribute for the learnable
-                       weight (alpha).
+          weight (alpha).
         mode (string): The mode for weight sharing. It supports all, channel
-                       and element. all: all elements share same weight
-                       channel:elements in a channel share same weight
-                       element:each element has a weight
+          and element. all: all elements share same weight
+          channel:elements in a channel share same weight
+          element:each element has a weight
         name(str|None): A name for this layer(optional). If set None, the layer
-                       will be named automatically.
+          will be named automatically.
 
     Returns:
         Variable: The output tensor with the same shape as input.
@@ -7575,6 +7828,11 @@ def uniform_random_batch_size_like(input,
     Returns:
         out (Variable): ${out_comment}
 
+    Examples:
+        .. code-block:: python
+
+            input = layers.data(name="input", shape=[13, 11], dtype='float32')
+            out = layers.uniform_random_batch_size_like(input, [-1, 11])
     """
 
     helper = LayerHelper('uniform_random_batch_size_like', **locals())
@@ -7612,6 +7870,10 @@ def gaussian_random(shape, mean=0.0, std=1.0, seed=0, dtype='float32'):
     Returns:
         out (Variable): ${out_comment}
 
+    Examples:
+        .. code-block:: python
+
+            out = layers.gaussian_random(shape=[20, 30])
     """
 
     helper = LayerHelper('gaussian_random', **locals())
@@ -7647,6 +7909,16 @@ def sampling_id(x, min=0.0, max=1.0, seed=0, dtype='float32'):
     Returns:
         out (Variable): ${out_comment}
 
+    Examples:
+        .. code-block:: python
+
+            x = layers.data(
+                name="X",
+                shape=[13, 11],
+                dtype='float32',
+                append_batch_size=False)
+
+            out = layers.sampling_id(x)
     """
 
     helper = LayerHelper('sampling_id', **locals())
@@ -7686,6 +7958,14 @@ def gaussian_random_batch_size_like(input,
 
     Returns:
         out (Variable): ${out_comment}
+
+    Examples:
+        .. code-block:: python
+
+            input = layers.data(name="input", shape=[13, 11], dtype='float32')
+
+            out = layers.gaussian_random_batch_size_like(
+                input, shape=[-1, 11], mean=1.0, std=2.0)
     """
 
     helper = LayerHelper('gaussian_random_batch_size_like', **locals())
@@ -7718,6 +7998,12 @@ def sum(x):
 
     Returns:
         out (Variable): ${out_comment}
+
+    Examples:
+        .. code-block:: python
+
+            input = layers.data(name="input", shape=[13, 11], dtype='float32')
+            out = layers.sum(input)
     """
 
     helper = LayerHelper('sum', **locals())
@@ -7746,6 +8032,17 @@ def slice(input, axes, starts, ends):
     Returns:
         out (Variable): ${out_comment}
 
+    Examples:
+        .. code-block:: python
+
+            starts = [1, 0, 2]
+            ends = [3, 3, 4]
+            axes = [0, 1, 2]
+
+            input = layers.data(
+                name="input", shape=[3, 4, 5, 6], dtype='float32')
+
+            out = layers.slice(input, axes=axes, starts=starts, ends=ends)
     """
 
     helper = LayerHelper('slice', **locals())
@@ -7773,6 +8070,12 @@ def shape(input):
     Returns:
         out (Variable): ${out_comment}
 
+    Examples:
+        .. code-block:: python
+
+            input = layers.data(
+                name="input", shape=[3, 100, 100], dtype="float32")
+            out = layers.shape(input)
     """
 
     helper = LayerHelper('shape', **locals())
@@ -8123,6 +8426,29 @@ def mean(x, name=None):
     return out
 
 
+@templatedoc()
+def merge_selected_rows(x, name=None):
+    """
+    ${comment}
+
+    Args:
+        x(${x_type}): ${x_comment}
+        name(basestring|None): Name of the output.
+
+    Returns:
+        out(${out_type}): ${out_comment}
+    """
+
+    helper = LayerHelper("merge_selected_rows", **locals())
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
+    helper.append_op(
+        type="merge_selected_rows",
+        inputs={"X": x},
+        attrs={},
+        outputs={"Out": out})
+    return out
+
+
 @templatedoc()
 def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None):
     """
@@ -8160,13 +8486,17 @@ def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None):
 
 
 @templatedoc()
-def sigmoid_cross_entropy_with_logits(x, label, name=None):
+def sigmoid_cross_entropy_with_logits(x,
+                                      label,
+                                      ignore_index=kIgnoreIndex,
+                                      name=None):
     """
     ${comment}
 
     Args:
         x(${x_type}): ${x_comment}
         label(${label_type}): ${label_comment}
+        ignore_index(&{ignore_index}): ${ignore_index_comment}
         name(basestring|None): Name of the output.
 
     Returns:
@@ -8185,7 +8515,7 @@ def sigmoid_cross_entropy_with_logits(x, label, name=None):
         type="sigmoid_cross_entropy_with_logits",
         inputs={"X": x,
                 "Label": label},
-        attrs={},
+        attrs={"ignore_index": ignore_index},
         outputs={"Out": out})
     return out
 
@@ -8773,6 +9103,82 @@ def bilinear_tensor_product(x,
     return helper.append_activation(out)
 
 
+@templatedoc()
+def get_tensor_from_selected_rows(x, name=None):
+    """
+    ${comment}
+
+    Args:
+        x(${x_type}): ${x_comment}
+        name(basestring|None): Name of the output.
+
+    Returns:
+        out(${out_type}): ${out_comment}
+    """
+
+    helper = LayerHelper('get_tensor_from_selected_rows', **locals())
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
+    helper.append_op(
+        type='get_tensor_from_selected_rows',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs={})
+    return out
+
+
+@templatedoc()
+def psroi_pool(input,
+               rois,
+               output_channels,
+               spatial_scale,
+               pooled_height,
+               pooled_width,
+               name=None):
+    """
+    ${comment}
+
+    Args:
+        input (Variable): ${x_comment}
+        rois (Variable): ROIs (Regions of Interest) to pool over.
+        output_channels (integer): ${output_channels_comment}
+        spatial_scale (float): ${spatial_scale_comment} Default: 1.0
+        pooled_height (integer): ${pooled_height_comment} Default: 1
+        pooled_width (integer): ${pooled_width_comment} Default: 1
+        name (str, default None): The name of this layer.
+
+    Returns:
+        Variable: ${out_comment}.
+
+    Examples:
+        .. code-block:: python
+
+            pool_out = fluid.layers.psroi_pool(input=x, rois=rois, 490, 1.0, 7, 7)
+    """
+    helper = LayerHelper('psroi_pool', **locals())
+    # check attrs
+    if not isinstance(output_channels, int):
+        raise TypeError("output_channels must be int type")
+    if not isinstance(spatial_scale, float):
+        raise TypeError("spatial_scale must be float type")
+    if not isinstance(pooled_height, int):
+        raise TypeError("pooled_height must be int type")
+    if not isinstance(pooled_width, int):
+        raise TypeError("pooled_width must be int type")
+    dtype = helper.input_dtype()
+    out = helper.create_variable_for_type_inference(dtype)
+    helper.append_op(
+        type='psroi_pool',
+        inputs={'X': input,
+                'ROIs': rois},
+        outputs={'Out': out},
+        attrs={
+            'output_channels': output_channels,
+            'spatial_scale': spatial_scale,
+            'pooled_height': pooled_height,
+            'pooled_width': pooled_width
+        })
+    return out
+
 def huber_regression_loss(input, label, delta):
     """
     Huber regression loss is a loss function used in robust regression.
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index ff32c00104171bf42c00be33f05758a4387228e1..49a486cf0c3d11b18417e8838aead07d748f3e02 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -622,7 +622,7 @@ def reverse(x, axis):
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(
         type='reverse',
-        inputs={'Input': x},
+        inputs={'X': x},
         outputs={'Out': [out]},
         attrs={'axis': axis})
     return out
diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py
index 829154f1b23d6e49bf963762be6b6488c98ec94a..85af8fea13d5b9a1e22014fbd727e1baed3247be 100644
--- a/python/paddle/fluid/metrics.py
+++ b/python/paddle/fluid/metrics.py
@@ -222,13 +222,13 @@ class Precision(MetricBase):
     Examples:
         .. code-block:: python
 
-        metric = fluid.metrics.Precision()
-        for pass in range(PASSES):
-            metric.reset()
-            for data in train_reader():
-                loss, preds, labels = exe.run(fetch_list=[cost, preds, labels])
-            metric.update(preds=preds, labels=labels)
-            numpy_precision = metric.eval()
+            metric = fluid.metrics.Precision()
+            for pass in range(PASSES):
+                metric.reset()
+                for data in train_reader():
+                    loss, preds, labels = exe.run(fetch_list=[cost, preds, labels])
+                metric.update(preds=preds, labels=labels)
+                numpy_precision = metric.eval()
     """
 
     def __init__(self, name=None):
@@ -267,13 +267,13 @@ class Recall(MetricBase):
     Examples:
         .. code-block:: python
 
-        metric = fluid.metrics.Recall()
-        for pass in range(PASSES):
-            metric.reset()
-            for data in train_reader():
-                loss, preds, labels = exe.run(fetch_list=[cost, preds, labels])
-            metric.update(preds=preds, labels=labels)
-            numpy_recall = metric.eval()
+            metric = fluid.metrics.Recall()
+            for pass in range(PASSES):
+                metric.reset()
+                for data in train_reader():
+                    loss, preds, labels = exe.run(fetch_list=[cost, preds, labels])
+                metric.update(preds=preds, labels=labels)
+                numpy_recall = metric.eval()
     """
 
     def __init__(self, name=None):
@@ -449,8 +449,9 @@ class EditDistance(MetricBase):
                 distance_evaluator.update(distances, seq_num)
                 distance, instance_error = distance_evaluator.eval()
 
-        In the above example:
+    In the above example:
         'distance' is the average of the edit distance in a pass.
+
         'instance_error' is the instance error rate in a pass.
 
     """
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index bdcd045341212d6cf9dbfbc3cebc72f320e37e9d..c54c3963a152851f5396c2ba71c28cc09c1cd523 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -95,7 +95,14 @@ class ParallelExecutor(object):
         self._places = []
         self._act_places = []
         if use_cuda:
-            for i in six.moves.range(core.get_cuda_device_count()):
+            gpus = []
+            gpus_env = os.getenv("FLAGS_selected_gpus")
+            if gpus_env:
+                gpus = [int(s) for s in gpus_env.split(",")]
+            else:
+                for i in six.moves.range(core.get_cuda_device_count()):
+                    gpus.append(i)
+            for i in gpus:
                 p = core.Place()
                 self._act_places.append(core.CUDAPlace(i))
                 p.set_place(self._act_places[-1])
@@ -128,9 +135,17 @@ class ParallelExecutor(object):
             build_strategy = BuildStrategy()
 
         build_strategy.num_trainers = num_trainers
+        build_strategy.trainer_id = trainer_id
 
         main = main_program
         main = main if main else framework.default_main_program()
+
+        trainers_endpoints = main._trainers_endpoints
+        if num_trainers > 1 and trainers_endpoints:
+            assert num_trainers == len(
+                trainers_endpoints), "num_trainers == len(end_points)"
+            build_strategy.trainers_endpoints = trainers_endpoints
+
         if scope == None:
             scope = executor.global_scope()
 
diff --git a/python/paddle/fluid/param_attr.py b/python/paddle/fluid/param_attr.py
index a51607bfdb1dde3d25f490770cc2ba368ceb27ff..38ddf93198d7c58382e36a5b7af488f56e6f9878 100644
--- a/python/paddle/fluid/param_attr.py
+++ b/python/paddle/fluid/param_attr.py
@@ -50,8 +50,9 @@ class ParamAttr(object):
 
             w_param_attrs = fluid.ParamAttr(name="fc_weight",
                                             learning_rate=0.5,
-                                            regularizer=fluid.L2Decay(1.0),
+                                            regularizer=fluid.regularizer.L2Decay(1.0),
                                             trainable=True)
+	    x = fluid.layers.data(name='X', shape=[1], dtype='float32')
             y_predict = fluid.layers.fc(input=x, size=10, param_attr=w_param_attrs)
     """
 
diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/CMakeLists.txt
index ad056aaa7b30b06d950486fd059c5b6a15770551..f9c6d60540fcb6f8a73fdc4e68471448e16cbdc2 100644
--- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/CMakeLists.txt
+++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/CMakeLists.txt
@@ -10,6 +10,8 @@ else()
     foreach(src ${TEST_OPS})
         if(${src} STREQUAL "test_recognize_digits_conv")
             message(WARNING "These tests has been disabled in OSX for random fail: \n" ${src})
+        elseif(${src} STREQUAL "test_recognize_digits_mlp")
+            message(WARNING "These tests has been disabled in OSX for random fail: \n" ${src})
         else()
             py_test(${src} SRCS ${src}.py)
         endif()
diff --git a/python/paddle/fluid/tests/demo/async_executor.py b/python/paddle/fluid/tests/demo/async_executor.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe8da0aab74bd5fc6219666236a04423a6d60489
--- /dev/null
+++ b/python/paddle/fluid/tests/demo/async_executor.py
@@ -0,0 +1,100 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import tarfile
+import paddle.fluid as fluid
+import paddle
+from paddle.fluid import core
+
+URL = 'http://paddle-unittest-data.gz.bcebos.com/python_paddle_fluid_tests_demo_async-executor/train_data.tar.gz'
+MD5 = '2a405a31508969b3ab823f42c0f522ca'
+
+
+def bow_net(data,
+            label,
+            dict_dim=89528,
+            emb_dim=128,
+            hid_dim=128,
+            hid_dim2=96,
+            class_dim=2):
+    """
+    BOW net
+    This model is from https://github.com/PaddlePaddle/models:
+    models/fluid/PaddleNLP/text_classification/nets.py
+    """
+    # embedding
+    emb = fluid.layers.embedding(
+        input=data, size=[dict_dim, emb_dim], is_sparse=True)
+    bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
+    bowh = fluid.layers.tanh(bow)
+    # fc layer after conv
+    fc_1 = fluid.layers.fc(input=bowh, size=hid_dim, act="tanh")
+    fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh")
+    # probability of each class
+    prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax")
+    # cross entropy loss
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    # mean loss
+    avg_cost = fluid.layers.mean(x=cost)
+    acc = fluid.layers.accuracy(input=prediction, label=label)
+    return avg_cost, acc, prediction
+
+
+def train():
+    # Download data
+    with tarfile.open(paddle.dataset.common.download(URL, "imdb", MD5)) as tarf:
+        tarf.extractall(path='./')
+        tarf.close()
+
+    # Initialize dataset description
+    dataset = fluid.DataFeedDesc('train_data/data.prototxt')
+    dataset.set_batch_size(128)  # See API doc for how to change other fields
+    print dataset.desc()  # Debug purpose: see what we get
+
+    # define network
+    # input text data
+    data = fluid.layers.data(
+        name="words", shape=[1], dtype="int64", lod_level=1)
+    # label data
+    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+
+    avg_cost, acc, prediction = bow_net(data, label)
+    sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=0.002)
+    opt_ops, weight_and_grad = sgd_optimizer.minimize(avg_cost)
+
+    # Run startup program
+    startup_program = fluid.default_startup_program()
+    place = fluid.CPUPlace()
+    executor = fluid.Executor(place)
+    executor.run(startup_program)
+
+    async_executor = fluid.AsyncExecutor(place)
+    main_program = fluid.default_main_program()
+    epochs = 10
+    filelist = ["train_data/part-%d" % i for i in range(12)]
+    for i in range(epochs):
+        thread_num = 4
+        async_executor.run(
+            main_program,  # This can be changed during iteration
+            dataset,  # This can be changed during iteration
+            filelist,  # This can be changed during iteration
+            thread_num,  # This can be changed during iteration
+            [data, acc],  # Multiple fetch targets can be specified
+            debug=False)
+        fluid.io.save_inference_model('imdb/epoch%d.model' % i,
+                                      [data.name, label.name], [acc], executor)
+
+
+if __name__ == "__main__":
+    train()
diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py
index a2eca5541a152ca99804a7f87c9b0bc3d12d4eee..d99eaa0634f93dcd16dd80ae172f11e8090a2623 100644
--- a/python/paddle/fluid/tests/test_detection.py
+++ b/python/paddle/fluid/tests/test_detection.py
@@ -388,5 +388,18 @@ class TestGenerateProposals(unittest.TestCase):
         print(rpn_rois.shape)
 
 
+class TestYoloDetection(unittest.TestCase):
+    def test_yolov3_loss(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name='x', shape=[30, 7, 7], dtype='float32')
+            gtbox = layers.data(name='gtbox', shape=[10, 4], dtype='float32')
+            gtlabel = layers.data(name='gtlabel', shape=[10], dtype='int32')
+            loss = layers.yolov3_loss(x, gtbox, gtlabel, [10, 13, 30, 13], 10,
+                                      0.5)
+
+            self.assertIsNotNone(loss)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/test_gradient_clip.py b/python/paddle/fluid/tests/test_gradient_clip.py
deleted file mode 100644
index 266687fcd092dfdeec9343e2592f4c22b683d588..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/test_gradient_clip.py
+++ /dev/null
@@ -1,84 +0,0 @@
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import numpy as np
-import paddle
-import paddle.fluid as fluid
-
-BATCH_SIZE = 128
-CLIP = 1
-
-prog = fluid.framework.Program()
-with fluid.program_guard(main_program=prog):
-    image = fluid.layers.data(name='x', shape=[784], dtype='float32')
-
-    hidden1 = fluid.layers.fc(input=image, size=128, act='relu')
-    hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu')
-    predict = fluid.layers.fc(input=hidden2, size=10, act='softmax')
-
-    label = fluid.layers.data(name='y', shape=[1], dtype='int64')
-
-    cost = fluid.layers.cross_entropy(input=predict, label=label)
-    avg_cost = fluid.layers.mean(cost)
-
-prog_clip = prog.clone()
-
-avg_cost_clip = prog_clip.block(0).var(avg_cost.name)
-
-p_g = fluid.backward.append_backward(loss=avg_cost)
-p_g_clip = fluid.backward.append_backward(loss=avg_cost_clip)
-
-with fluid.program_guard(main_program=prog_clip):
-    fluid.clip.set_gradient_clip(
-        fluid.clip.GradientClipByGlobalNorm(clip_norm=CLIP))
-    p_g_clip = fluid.clip.append_gradient_clip_ops(p_g_clip)
-
-grad_list = [elem[1] for elem in p_g]
-grad_clip_list = [elem[1] for elem in p_g_clip]
-
-train_reader = paddle.batch(
-    paddle.reader.shuffle(
-        paddle.dataset.mnist.train(), buf_size=8192),
-    batch_size=BATCH_SIZE)
-
-place = fluid.CPUPlace()
-exe = fluid.Executor(place)
-feeder = fluid.DataFeeder(feed_list=[image, label], place=place)
-exe.run(fluid.default_startup_program())
-
-count = 0
-for data in train_reader():
-    count += 1
-    if count > 5:
-        break
-    out = exe.run(prog, feed=feeder.feed(data), fetch_list=grad_list)
-    out_clip = exe.run(prog_clip,
-                       feed=feeder.feed(data),
-                       fetch_list=grad_clip_list)
-    global_norm = 0
-    for v in out[1:]:
-        global_norm += np.sum(np.power(v, 2))
-    global_norm = np.sqrt(global_norm)
-
-    global_norm_clip = 0
-    for v in out_clip[1:]:
-        global_norm_clip += np.sum(np.power(v, 2))
-    global_norm_clip = np.sqrt(global_norm_clip)
-
-    if not np.isclose(
-            a=global_norm_clip, b=np.minimum(global_norm, CLIP), rtol=5e-3):
-        exit(1)
-exit(0)
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 26035f303e72a87b81fdb120fbb92894d78e996b..a4089ba3ca08bed5702a66ed370da52ecd9b58c6 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -43,7 +43,7 @@ if(APPLE)
         list(REMOVE_ITEM TEST_OPS test_desc_clone)
         list(REMOVE_ITEM TEST_OPS test_program_code)
     endif(NOT WITH_DISTRIBUTE)
-    message(WARNING "These tests has been disabled in OSX before being fixed: \n test_fuse_elewise_add_act_pass \n test_detection_map_op \n test_dist_se_resnext")
+    message(WARNING "These tests has been disabled in OSX before being fixed:\n test_fuse_elewise_add_act_pass \n test_detection_map_op \n test_dist_se_resnext")
     # this op is not support on mac
     list(REMOVE_ITEM TEST_OPS test_fusion_seqexpand_concat_fc_op)
     # TODO: add the unitest back when it fixed
@@ -95,13 +95,12 @@ if(WITH_DISTRIBUTE)
     if(NOT APPLE)
         set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 200)
         set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 200)
+	py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext)
+	set_tests_properties(test_dist_se_resnext PROPERTIES TIMEOUT 1000)
         # FIXME(typhoonzero): add these tests back
-	# py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext)
-	# set_tests_properties(test_dist_se_resnext PROPERTIES TIMEOUT 1000)
 	# py_test_modules(test_dist_transformer MODULES test_dist_transformer)
 	# set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000)
-        # TODO(typhoonzero): make dist test parallel when fix port management issue
-        set_tests_properties(test_dist_mnist test_dist_word2vec test_dist_ctr test_dist_simnet_bow test_dist_save_load test_dist_text_classification test_dist_mnist_batch_merge PROPERTIES RUN_SERIAL TRUE)
+        set_tests_properties(test_dist_ctr test_dist_mnist test_dist_mnist_batch_merge test_dist_save_load test_dist_se_resnext test_dist_simnet_bow test_dist_text_classification test_dist_train test_dist_word2vec PROPERTIES RUN_SERIAL TRUE)
     endif(NOT APPLE)
     py_test_modules(test_dist_transpiler MODULES test_dist_transpiler)
 endif()
diff --git a/python/paddle/fluid/tests/unittests/dist_ctr.py b/python/paddle/fluid/tests/unittests/dist_ctr.py
index 902dc6544ed6858c4cd8d64b14d6af2367059091..65969824338a5c354415cac8a34bd3863716bef4 100644
--- a/python/paddle/fluid/tests/unittests/dist_ctr.py
+++ b/python/paddle/fluid/tests/unittests/dist_ctr.py
@@ -16,11 +16,13 @@ from __future__ import print_function
 
 import paddle
 import paddle.fluid as fluid
+import os
 
 import dist_ctr_reader
 from test_dist_base import TestDistRunnerBase, runtime_main
 
 IS_SPARSE = True
+os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1"
 
 # Fix seed for test
 fluid.default_startup_program().random_seed = 1
diff --git a/python/paddle/fluid/tests/unittests/dist_mnist.py b/python/paddle/fluid/tests/unittests/dist_mnist.py
index 1cda2711f765622b0bda6f4c688f69352bbd2a6f..1c45a10a9ddde743dce9b343e4d18f568bb05e72 100644
--- a/python/paddle/fluid/tests/unittests/dist_mnist.py
+++ b/python/paddle/fluid/tests/unittests/dist_mnist.py
@@ -93,7 +93,7 @@ class TestDistMnist2x2(TestDistRunnerBase):
         # TODO(typhoonzero): fix distributed adam optimizer
         # opt = fluid.optimizer.AdamOptimizer(
         #     learning_rate=0.001, beta1=0.9, beta2=0.999)
-        opt = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
+        opt = fluid.optimizer.Momentum(learning_rate=self.lr, momentum=0.9)
 
         # Reader
         train_reader = paddle.batch(
diff --git a/python/paddle/fluid/tests/unittests/dist_save_load.py b/python/paddle/fluid/tests/unittests/dist_save_load.py
index cf62817956c12cd4487eba88bf49ed43331dff03..faec5350424668fca6416e91c3e58174bd4ec877 100644
--- a/python/paddle/fluid/tests/unittests/dist_save_load.py
+++ b/python/paddle/fluid/tests/unittests/dist_save_load.py
@@ -102,7 +102,7 @@ class TestDistSaveLoad2x2(TestDistSimnetBow2x2):
 
         if args.mem_opt:
             fluid.memory_optimize(fluid.default_main_program(), skip_grads=True)
-        if args.is_dist:
+        if args.update_method == "pserver":
             t = self.get_transpiler(args.trainer_id,
                                     fluid.default_main_program(),
                                     args.endpoints, args.trainers,
@@ -147,7 +147,7 @@ class TestDistSaveLoad2x2(TestDistSimnetBow2x2):
 
         def get_data():
             origin_batch = next(reader_generator)
-            if args.is_dist and args.use_reader_alloc:
+            if args.update_method == "pserver" and args.use_reader_alloc:
                 new_batch = []
                 for offset, item in enumerate(origin_batch):
                     if offset % 2 == args.trainer_id:
diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py
index 271b9c740fd99554e9a7aa8d476a52cf6385b1d9..76a707efdc0804be0316ab12c347ffed6199529a 100644
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -216,6 +216,15 @@ class OpTest(unittest.TestCase):
                                      self.dtype)
         outputs = append_input_output(block, op_proto, self.outputs, False,
                                       self.dtype)
+
+        if hasattr(self, "cache_name_list"):
+            for name in self.cache_name_list:
+                inputs[name] = block.create_var(
+                    name=name,
+                    persistable=True,
+                    type=core.VarDesc.VarType.RAW,
+                    stop_gradient=True)
+
         op = block.append_op(
             type=self.op_type,
             inputs=inputs,
@@ -428,8 +437,17 @@ class OpTest(unittest.TestCase):
         op_inputs = self.inputs if hasattr(self, "inputs") else dict()
         op_outputs = self.outputs if hasattr(self, "outputs") else dict()
         op_attrs = self.attrs if hasattr(self, "attrs") else dict()
-        self.op = create_op(self.scope, self.op_type, op_inputs, op_outputs,
-                            op_attrs)
+
+        cache_list = None
+        if hasattr(self, "cache_name_list"):
+            cache_list = self.cache_name_list
+        self.op = create_op(
+            self.scope,
+            self.op_type,
+            op_inputs,
+            op_outputs,
+            op_attrs,
+            cache_list=cache_list)
 
         if no_grad_set is None:
             no_grad_set = set()
diff --git a/python/paddle/fluid/tests/unittests/test_async_executor.py b/python/paddle/fluid/tests/unittests/test_async_executor.py
new file mode 100644
index 0000000000000000000000000000000000000000..43855b95f9e3096d58ca3e8acfdb25f034bab175
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_async_executor.py
@@ -0,0 +1,142 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid as fluid
+import paddle
+import unittest
+import tarfile
+import os
+import shutil
+
+proto_str = ('name: "MultiSlotDataFeed"\n'
+             'batch_size: 2\n'
+             'multi_slot_desc {\n'
+             '   slots {\n'
+             '       name: "words"\n'
+             '       type: "uint64"\n'
+             '       is_dense: false\n'
+             '       is_used: true\n'
+             '   }\n'
+             '   slots {\n'
+             '       name: "label"\n'
+             '       type: "uint64"\n'
+             '       is_dense: false\n'
+             '       is_used: true\n'
+             '   }\n'
+             '}')
+
+URL = 'http://paddle-unittest-data.gz.bcebos.com/python_paddle_fluid_tests_demo_async-executor/train_data.tar.gz'
+MD5 = '2a405a31508969b3ab823f42c0f522ca'
+
+
+def bow_net(data,
+            label,
+            dict_dim=89528,
+            emb_dim=128,
+            hid_dim=128,
+            hid_dim2=96,
+            class_dim=2):
+    """
+    BOW net
+    This model is from https://github.com/PaddlePaddle/models:
+    models/fluid/PaddleNLP/text_classification/nets.py
+    """
+    # embedding
+    emb = fluid.layers.embedding(
+        input=data, size=[dict_dim, emb_dim], is_sparse=True)
+    bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
+    bowh = fluid.layers.tanh(bow)
+    # fc layer after conv
+    fc_1 = fluid.layers.fc(input=bowh, size=hid_dim, act="tanh")
+    fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh")
+    # probability of each class
+    prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax")
+    # cross entropy loss
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    # mean loss
+    avg_cost = fluid.layers.mean(x=cost)
+    acc = fluid.layers.accuracy(input=prediction, label=label)
+    return avg_cost, acc, prediction
+
+
+class TestAsyncExecutor(unittest.TestCase):
+    def setUp(self):
+        with open('./data.prototxt', 'w+') as f:
+            f.write(proto_str)
+            f.close()
+
+        with tarfile.open(paddle.dataset.common.download(URL, "imdb",
+                                                         MD5)) as tarf:
+            tarf.extractall(path='./')
+            tarf.close()
+
+    def test_data_feed_desc(self):
+        data_feed = fluid.DataFeedDesc('./data.prototxt')
+        # assertEqueal(data_feed.proto_desc.batch, 2)
+        # assertEqual(len(data_feed.proto_desc.multi_slot_desc), 2)
+        self.assertEqual(" ".join(data_feed.desc().split()),
+                         " ".join(proto_str.split()))
+
+    def test_run(self):
+        # Initialize dataset description
+        data_feed = fluid.DataFeedDesc('train_data/data.prototxt')
+        data_feed.set_batch_size(
+            128)  # See API doc for how to change other fields
+
+        # define network
+        # input text data
+        data = fluid.layers.data(
+            name="words", shape=[1], dtype="int64", lod_level=1)
+        # label data
+        label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+
+        avg_cost, acc, prediction = bow_net(data, label)
+        sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=0.002)
+        opt_ops, weight_and_grad = sgd_optimizer.minimize(avg_cost)
+
+        # Run startup program
+        startup_program = fluid.default_startup_program()
+        place = fluid.CPUPlace()
+        executor = fluid.Executor(place)
+        executor.run(startup_program)
+
+        main_program = fluid.default_main_program()
+        async_executor = fluid.AsyncExecutor(place)
+
+        self.assertRaises(TypeError, async_executor.run)
+        self.assertRaises(TypeError, async_executor.run, main_program)
+        self.assertRaises(TypeError, async_executor.run, main_program,
+                          data_feed)
+
+        filelist = ['train_data/part-%d' % i for i in range(10)]
+        self.assertRaises(TypeError, async_executor.run, main_program,
+                          data_feed, filelist)
+
+        thread_num = 4
+        self.assertRaises(TypeError, async_executor.run, main_program,
+                          data_feed, filelist, thread_num)
+
+        async_executor.run(main_program, data_feed, filelist, thread_num, [acc])
+        fluid.io.save_inference_model("imdb.model", [data.name, label.name],
+                                      [acc], executor)
+        statinfo = os.stat('imdb.model/__model__')
+        self.assertGreater(statinfo.st_size, 0)
+
+        os.remove('./data.prototxt')
+        shutil.rmtree('./train_data')
+        shutil.rmtree('./imdb.model')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
index 80261eff4e747f87658bc7c9114c21bee511df09..2869a6ba53bfb9120ae68d67d10eb5080be5f07b 100644
--- a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
@@ -54,6 +54,19 @@ def _reference_testing(x, scale, offset, mean, var, epsilon, data_format):
     return y
 
 
+def _cal_mean_variance(x, epsilon, data_format):
+    assert data_format in ['NCHW', 'NHWC']
+    x_square = x * x
+    axis = (0, 2, 3) if data_format == 'NCHW' else (0, 1, 2)
+    C = x.shape[1] if data_format == 'NCHW' else x.shape[-1]
+    x_square_sum = np.sum(x_square, axis)
+    x_sum = np.sum(x, axis=axis)
+    element_count = np.size(x) / C
+    mean = x_sum / element_count
+    var = x_square_sum / element_count - mean * mean
+    return mean, var
+
+
 def _reference_training(x, scale, offset, epsilon, data_format):
     x_shape = x.shape
 
@@ -294,7 +307,18 @@ class TestBatchNormOpTraining(unittest.TestCase):
         self.use_mkldnn = False
         self.fuse_with_relu = False
         self.data_formats = ["NCHW", "NHWC"]
+        self.momentum = 0.9
+        self.epsilon = 0.00001
         self.init_kernel_type()
+        self.init_test_case()
+
+    def init_test_case(self):
+        self.use_global_stats = False
+        self.no_grad_set = set()
+        self.fetch_list = [
+            'y', 'mean', 'variance', 'saved_mean', 'saved_variance', 'x@GRAD',
+            'scale@GRAD', 'bias@GRAD'
+        ]
 
     def __assert_close(self, tensor, np_array, msg, atol=1e-4):
         np.allclose(np.array(tensor), np_array, atol=atol)
@@ -313,11 +337,22 @@ class TestBatchNormOpTraining(unittest.TestCase):
 
         return y, mean_out, variance_out, saved_mean, saved_variance, x_grad, scale_grad, bias_grad
 
+    def set_mean_variance(self, scale_shape, x, data_layout):
+        mean = np.zeros(scale_shape).astype(np.float32)
+        variance = np.ones(scale_shape).astype(np.float32)
+        # computing global mean/variance for one step
+        if self.use_global_stats:
+            mom = self.momentum
+            x_mean, x_var = _cal_mean_variance(x, self.epsilon, data_layout)
+            mean = x_mean * (1. - mom) + mom * mean
+            variance = x_var * (1. - mom) + mom * variance
+        return mean, variance
+
     def test_forward_backward(self):
         def test_with_place(place, data_layout, shape):
             # attr
-            epsilon = 0.00001
-            momentum = 0.9
+            epsilon = self.epsilon
+            momentum = self.momentum
             if data_layout == "NCHW":
                 n, c, h, w = shape[0], shape[1], shape[2], shape[3]
             else:
@@ -328,9 +363,7 @@ class TestBatchNormOpTraining(unittest.TestCase):
             x = np.random.random_sample(shape).astype(np.float32)
             scale = np.random.random_sample(scale_shape).astype(np.float32)
             bias = np.random.random_sample(scale_shape).astype(np.float32)
-            mean = np.zeros(scale_shape).astype(np.float32)
-            variance = np.ones(scale_shape).astype(np.float32)
-
+            mean, variance = self.set_mean_variance(scale_shape, x, data_layout)
             y_grad = np.random.random_sample(shape).astype(np.float32)
 
             y, mean_out, variance_out, saved_mean, saved_variance, x_grad, scale_grad, bias_grad = self.ref_forward_backward(
@@ -339,6 +372,9 @@ class TestBatchNormOpTraining(unittest.TestCase):
 
             var_dict = locals()
             var_dict['y@GRAD'] = y_grad
+            var_dict['x@GRAD'] = x_grad
+            var_dict['scale@GRAD'] = scale_grad
+            var_dict['bias@GRAD'] = bias_grad
 
             var_names = [
                 'x', 'scale', 'bias', 'mean', 'variance', 'y', 'saved_mean',
@@ -365,9 +401,8 @@ class TestBatchNormOpTraining(unittest.TestCase):
                     },
                     outputs={
                         "Y": block.var('y'),
-                        "MeanOut": block.var('mean'),  # share the same memory
-                        "VarianceOut":
-                        block.var('variance'),  # share the same memory
+                        "MeanOut": block.var('mean'),  # share memory
+                        "VarianceOut": block.var('variance'),  # share memory
                         "SavedMean": block.var('saved_mean'),
                         "SavedVariance": block.var('saved_variance')
                     },
@@ -377,13 +412,14 @@ class TestBatchNormOpTraining(unittest.TestCase):
                         "is_test": False,
                         "data_layout": data_layout,
                         "use_mkldnn": self.use_mkldnn,
-                        "fuse_with_relu": self.fuse_with_relu
+                        "fuse_with_relu": self.fuse_with_relu,
+                        "use_global_stats": self.use_global_stats
                     })
                 block.create_var(name='y@GRAD', dtype='float32', shape=y.shape)
 
                 # generate backward op_desc
                 grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
-                    bn_op.desc, set(), [])
+                    bn_op.desc, self.no_grad_set, [])
                 grad_op_desc = grad_op_desc_list[0]
                 new_op_desc = block.desc.append_op()
                 new_op_desc.copy_from(grad_op_desc)
@@ -403,20 +439,10 @@ class TestBatchNormOpTraining(unittest.TestCase):
                         for name in
                         ['x', 'scale', 'bias', 'mean', 'variance', 'y@GRAD']
                     },
-                    fetch_list=[
-                        'y', 'mean', 'variance', 'saved_mean', 'saved_variance',
-                        'x@GRAD', 'scale@GRAD', 'bias@GRAD'
-                    ])
-
-            self.__assert_close(y, out[0], "y")
-            self.__assert_close(mean_out, out[1], "mean")
-            self.__assert_close(variance_out, out[2], "variance", 1e-3)
-            self.__assert_close(saved_mean, out[3], "saved_mean")
-            self.__assert_close(saved_variance, out[4], "saved_variance", 1e-3)
-            self.__assert_close(x_grad, out[5], "x_grad")
-            self.__assert_close(scale_grad, out[6], "scale_grad")
-            self.__assert_close(bias_grad, out[7], "bias_grad")
+                    fetch_list=self.fetch_list)
 
+            for id, name in enumerate(self.fetch_list):
+                self.__assert_close(var_dict[name], out[id], name)
             print("op test forward passed: ", str(place), data_layout)
 
         places = [core.CPUPlace()]
@@ -432,5 +458,66 @@ class TestBatchNormOpTraining(unittest.TestCase):
         pass
 
 
+class TestBatchNormOpFreezeStatsTraining(TestBatchNormOpTraining):
+    def init_test_case(self):
+        self.use_global_stats = True
+        self.no_grad_set = set()
+        self.fetch_list = [
+            'y', 'mean', 'variance', 'x@GRAD', 'scale@GRAD', 'bias@GRAD'
+        ]
+
+    def reference_grad(self, x, y_grad, scale, mean, var, epsilon, data_format):
+        if data_format == "NCHW":
+            x = np.transpose(x, (0, 2, 3, 1))
+            y_grad = np.transpose(y_grad, (0, 2, 3, 1))
+
+        x_grad = scale * y_grad / np.sqrt(var + epsilon)
+        grad_scale = np.sum(y_grad * (x - mean) / np.sqrt(var + epsilon),
+                            axis=(0, 1, 2))
+        grad_offset = np.sum(y_grad, axis=(0, 1, 2))
+
+        # transfer back to N, C, H, W
+        if data_format == "NCHW":
+            x_grad = np.transpose(x_grad, (0, 3, 1, 2))
+            x = np.transpose(x, (0, 3, 1, 2))
+            y_grad = np.transpose(y_grad, (0, 3, 1, 2))
+
+        return x_grad, grad_scale, grad_offset
+
+    def ref_forward_backward(self, x, y_grad, scale, bias, mean, variance,
+                             epsilon, momentum, shape, data_layout):
+        if data_layout != "NCHW" and data_layout != "NHWC":
+            raise ValueError("Unknown data order.")
+
+        if data_layout == "NCHW":
+            x = np.transpose(x, (0, 2, 3, 1))
+
+        # run normalizaton
+        normalized = (x - mean) / np.sqrt(variance + epsilon)
+        y = normalized * scale + bias
+
+        # transfer back to N, C, H, W
+        if data_layout == "NCHW":
+            x = np.transpose(x, (0, 3, 1, 2))
+            y = np.transpose(y, (0, 3, 1, 2))
+
+        mean_out = mean
+        variance_out = variance
+        saved_variance = 1. / np.sqrt(variance + epsilon)
+        # run backward
+        x_grad, scale_grad, bias_grad = self.reference_grad(
+            x, y_grad, scale, mean, variance, epsilon, data_layout)
+
+        return y, mean_out, variance_out, mean, saved_variance, x_grad, scale_grad, bias_grad
+
+
+class TestBatchNormOpFreezeStatsAndScaleBiasTraining(
+        TestBatchNormOpFreezeStatsTraining):
+    def init_test_case(self):
+        self.use_global_stats = True
+        self.no_grad_set = set(['scale@GRAD', 'bias@GRAD'])
+        self.fetch_list = ['y', 'mean', 'variance', 'x@GRAD']
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_bpr_loss_op.py b/python/paddle/fluid/tests/unittests/test_bpr_loss_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..c8dc5fbd237d17f2d4e45b06e5806fff5cbf58fe
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_bpr_loss_op.py
@@ -0,0 +1,52 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest, randomize_probability
+
+
+class TestBprLossOp1(OpTest):
+    """Test BprLoss with discrete one-hot labels.
+    """
+
+    def setUp(self):
+        self.op_type = "bpr_loss"
+        batch_size = 40
+        class_num = 5
+        X = randomize_probability(batch_size, class_num, dtype='float64')
+        label = np.random.randint(0, class_num, (batch_size, 1), dtype="int64")
+        bpr_loss_result = []
+        for i in range(batch_size):
+            sum = 0.0
+            for j in range(class_num):
+                if j == label[i][0]:
+                    continue
+                sum += (-np.log(1.0 + np.exp(X[i][j] - X[i][label[i][0]])))
+            bpr_loss_result.append(-sum / (class_num - 1))
+        bpr_loss = np.asmatrix([[x] for x in bpr_loss_result], dtype="float64")
+        self.inputs = {"X": X, "Label": label}
+        self.outputs = {"Y": bpr_loss}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Y", numeric_grad_delta=0.001)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_concat_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_concat_mkldnn_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f2130f9049c7ee294444282e59c654551f76603
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_concat_mkldnn_op.py
@@ -0,0 +1,61 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+from test_concat_op import TestConcatOp, TestConcatOp2, TestConcatOp3
+
+
+class TestMKLDNNConcatOp(TestConcatOp):
+    def setUp(self):
+        super(TestMKLDNNConcatOp, self).setUp()
+        self.attrs["use_mkldnn"] = True
+        self._cpu_only = True
+
+    def test_check_grad(self):
+        pass
+
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+class TestMKLDNNConcatOp2(TestConcatOp2):
+    def setUp(self):
+        super(TestMKLDNNConcatOp2, self).setUp()
+        self.attrs["use_mkldnn"] = True
+        self._cpu_only = True
+
+    def test_check_grad(self):
+        pass
+
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+class TestMKLDNNConcatOp3(TestConcatOp3):
+    def setUp(self):
+        super(TestMKLDNNConcatOp3, self).setUp()
+        self.attrs["use_mkldnn"] = True
+        self._cpu_only = True
+
+    def test_check_grad(self):
+        pass
+
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py
index 9f3f2f348166864be9583855fcd1949fd4ac818c..6cd71e39e41dae5d07e5761fc9caeca113f3b47e 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py
@@ -128,6 +128,12 @@ class TestIdentityActivation(TestConv2dFusionOp):
         self.activation = 'identity'
 
 
+class TestIdentityActivation(TestConv2dFusionOp):
+    def init_activation(self):
+        self.activation = 'identity'
+        self.add_residual_data = False
+
+
 class TestWithGroup(TestConv2dFusionOp):
     def init_group(self):
         self.groups = 3
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_mkldnn_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..deefdd09abe6b9f9ca362654f21850f598337245
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_mkldnn_op.py
@@ -0,0 +1,77 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+
+from test_conv2d_transpose_op import TestConv2dTransposeOp, TestWithPad, TestWithStride
+
+
+class TestMKLDNN(TestConv2dTransposeOp):
+    def init_op_type(self):
+        self.is_test = True
+        self.use_mkldnn = True
+        self.data_format = "NCHW"
+        self.op_type = "conv2d_transpose"
+        self._cpu_only = True
+
+    def test_check_grad(self):
+        return
+
+    def test_check_grad_no_input(self):
+        return
+
+    def test_check_grad_no_filter(self):
+        return
+
+
+class TestMKLDNNWithPad(TestWithPad):
+    def init_op_type(self):
+        self.is_test = True
+        self.use_mkldnn = True
+        self.data_format = "NCHW"
+        self.op_type = "conv2d_transpose"
+        self._cpu_only = True
+
+    def test_check_grad(self):
+        return
+
+    def test_check_grad_no_input(self):
+        return
+
+    def test_check_grad_no_filter(self):
+        return
+
+
+class TestMKLDNNWithStride(TestWithStride):
+    def init_op_type(self):
+        self.is_test = True
+        self.use_mkldnn = True
+        self.data_format = "NCHW"
+        self.op_type = "conv2d_transpose"
+        self._cpu_only = True
+
+    def test_check_grad(self):
+        return
+
+    def test_check_grad_no_input(self):
+        return
+
+    def test_check_grad_no_filter(self):
+        return
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
index 5bb769b16891d3b7163874751f9bcd25593b4b44..3b820f6ad716e5717e45d0c6341fb89010406d59 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
@@ -68,8 +68,11 @@ def conv2dtranspose_forward_naive(input_, filter_, attrs):
 class TestConv2dTransposeOp(OpTest):
     def setUp(self):
         # init as conv transpose
+        self.is_test = False
         self.use_cudnn = False
+        self.use_mkldnn = False
         self.output_size = None
+        self.data_format = "AnyLayout"
         self.init_op_type()
         self.init_test_case()
 
@@ -83,7 +86,9 @@ class TestConv2dTransposeOp(OpTest):
             'groups': self.groups,
             'dilations': self.dilations,
             'use_cudnn': self.use_cudnn,
-            'data_format': 'AnyLayout'  # TODO(dzhwinter) : should be fix latter
+            'is_test': self.is_test,
+            'use_mkldnn': self.use_mkldnn,
+            'data_format': self.data_format
         }
         if self.output_size is not None:
             self.attrs['output_size'] = self.output_size
diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_mkldnn_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..f0e1265e142b800587599783367eca2203033bf1
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_conv3d_mkldnn_op.py
@@ -0,0 +1,59 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+
+from test_conv3d_op import TestConv3dOp, TestCase1, TestWithGroup1, TestWithGroup2, TestWith1x1, TestWithInput1x1Filter1x1
+
+
+class TestMKLDNN(TestConv3dOp):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+        self.data_format = "NCHW"
+
+
+class TestMKLDNNCase1(TestCase1):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+        self.data_format = "NCHW"
+
+
+class TestMKLDNNGroup1(TestWithGroup1):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+        self.data_format = "NCHW"
+
+
+class TestMKLDNNGroup2(TestWithGroup2):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+        self.data_format = "NCHW"
+
+
+class TestMKLDNNWith1x1(TestWith1x1):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+        self.data_format = "NCHW"
+
+
+class TestMKLDNNWithInput1x1Filter1x1(TestWithInput1x1Filter1x1):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+        self.data_format = "NCHW"
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_op.py
index 69c5ab7a4a4cbd552d27dcb07052d46752eeb54a..c6b749fe09b18b1d704f45a5a5b3adbd5c6a6d0b 100644
--- a/python/paddle/fluid/tests/unittests/test_conv3d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv3d_op.py
@@ -74,6 +74,8 @@ class TestConv3dOp(OpTest):
     def setUp(self):
         self.op_type = "conv3d"
         self.use_cudnn = False
+        self.use_mkldnn = False
+        self.data_format = "AnyLayout"
         self.dtype = np.float32
         self.init_kernel_type()
         self.init_group()
@@ -83,8 +85,7 @@ class TestConv3dOp(OpTest):
         conv3d_param = {
             'stride': self.stride,
             'pad': self.pad,
-            'dilations': self.dilations,
-            'data_format': 'AnyLayout'  # TODO(dzhwinter) : should be fix latter
+            'dilations': self.dilations
         }
 
         input = np.random.random(self.input_size).astype(self.dtype)
@@ -101,7 +102,9 @@ class TestConv3dOp(OpTest):
             'paddings': self.pad,
             'groups': self.groups,
             'dilations': self.dilations,
-            'use_cudnn': self.use_cudnn
+            'use_cudnn': self.use_cudnn,
+            'use_mkldnn': self.use_mkldnn,
+            'data_format': self.data_format
         }
         self.outputs = {'Output': output}
 
@@ -109,59 +112,35 @@ class TestConv3dOp(OpTest):
         return core.is_compiled_with_cuda() and self.use_cudnn
 
     def test_check_output(self):
-        if self.testcudnn():
-            place = core.CUDAPlace(0)
-            self.check_output_with_place(place, atol=1e-5)
-        else:
-            self.check_output()
+        place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
+        self.check_output_with_place(place, atol=1e-5)
 
     def test_check_grad(self):
         if self.dtype == np.float16:
             return
-        if self.testcudnn():
-            place = core.CUDAPlace(0)
-            self.check_grad_with_place(
-                place,
-                set(['Input', 'Filter']),
-                'Output',
-                max_relative_error=0.03)
-        else:
-            self.check_grad(
-                set(['Input', 'Filter']), 'Output', max_relative_error=0.03)
+        place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
+        self.check_grad_with_place(
+            place, {'Input', 'Filter'}, 'Output', max_relative_error=0.03)
 
     def test_check_grad_no_filter(self):
         if self.dtype == np.float16:
             return
-        if self.testcudnn():
-            place = core.CUDAPlace(0)
-            self.check_grad_with_place(
-                place, ['Input'],
-                'Output',
-                max_relative_error=0.03,
-                no_grad_set=set(['Filter']))
-        else:
-            self.check_grad(
-                ['Input'],
-                'Output',
-                max_relative_error=0.03,
-                no_grad_set=set(['Filter']))
+        place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
+        self.check_grad_with_place(
+            place, ['Input'],
+            'Output',
+            max_relative_error=0.03,
+            no_grad_set=set(['Filter']))
 
     def test_check_grad_no_input(self):
         if self.dtype == np.float16:
             return
-        if self.testcudnn():
-            place = core.CUDAPlace(0)
-            self.check_grad_with_place(
-                place, ['Filter'],
-                'Output',
-                max_relative_error=0.03,
-                no_grad_set=set(['Input']))
-        else:
-            self.check_grad(
-                ['Filter'],
-                'Output',
-                max_relative_error=0.03,
-                no_grad_set=set(['Input']))
+        place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
+        self.check_grad_with_place(
+            place, ['Input'],
+            'Output',
+            max_relative_error=0.03,
+            no_grad_set=set(['Input']))
 
     def init_test_case(self):
         self.pad = [0, 0, 0]
diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py
index 97e7ee6229f081ff67ca3e2aedcad0a2e3d9cabf..07cc44aaa266af39fbf3d726ee51a9afc5cb3756 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -32,7 +32,7 @@ DEFAULT_BATCH_SIZE = 2
 
 
 class TestDistRunnerBase(object):
-    def get_model(self, batch_size=DEFAULT_BATCH_SIZE):
+    def get_model(self, batch_size=DEFAULT_BATCH_SIZE, lr=0.1):
         raise NotImplementedError(
             "get_model should be implemented by child classes.")
 
@@ -56,6 +56,7 @@ class TestDistRunnerBase(object):
         return t
 
     def run_pserver(self, args):
+        self.lr = args.lr
         self.get_model(batch_size=args.batch_size)
         # NOTE: pserver should not call memory optimize
         t = self.get_transpiler(args.trainer_id,
@@ -71,17 +72,30 @@ class TestDistRunnerBase(object):
         exe.run(pserver_prog)
 
     def run_trainer(self, args):
+        self.lr = args.lr
         test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
             self.get_model(batch_size=args.batch_size)
 
         if args.mem_opt:
             fluid.memory_optimize(fluid.default_main_program(), skip_grads=True)
-        if args.is_dist:
+        if args.update_method == "pserver":
             t = self.get_transpiler(args.trainer_id,
                                     fluid.default_main_program(),
                                     args.endpoints, args.trainers,
                                     args.sync_mode, args.dc_asgd)
             trainer_prog = t.get_trainer_program()
+        elif args.update_method == "nccl2":
+            # transpile for nccl2
+            config = fluid.DistributeTranspilerConfig()
+            config.mode = "nccl2"
+            nccl2_t = fluid.DistributeTranspiler(config=config)
+            nccl2_t.transpile(
+                args.trainer_id,
+                program=fluid.default_main_program(),
+                startup_program=fluid.default_startup_program(),
+                trainers=args.endpoints,
+                current_endpoint=args.current_endpoint)
+            trainer_prog = fluid.default_main_program()
         else:
             trainer_prog = fluid.default_main_program()
 
@@ -110,11 +124,20 @@ class TestDistRunnerBase(object):
                 len(pass_builder.all_passes()) - 2, "multi_batch_merge_pass")
             mypass.set_int("num_repeats", args.batch_merge_repeat)
 
+        if args.update_method == "nccl2":
+            num_trainers = len(args.endpoints.split(","))
+            trainer_id = args.trainer_id
+        else:
+            num_trainers = 1
+            trainer_id = 0
+
         exe = fluid.ParallelExecutor(
             args.use_cuda,
             loss_name=avg_cost.name,
             exec_strategy=strategy,
-            build_strategy=build_stra)
+            build_strategy=build_stra,
+            num_trainers=num_trainers,
+            trainer_id=trainer_id)
 
         feed_var_list = [
             var for var in trainer_prog.global_block().vars.values()
@@ -126,7 +149,7 @@ class TestDistRunnerBase(object):
 
         def get_data():
             origin_batch = next(reader_generator)
-            if args.is_dist and args.use_reader_alloc:
+            if args.update_method != "local" and args.use_reader_alloc:
                 new_batch = []
                 for offset, item in enumerate(origin_batch):
                     if offset % 2 == args.trainer_id:
@@ -151,7 +174,11 @@ def runtime_main(test_class):
     parser.add_argument(
         '--role', type=str, required=True, choices=['pserver', 'trainer'])
     parser.add_argument('--endpoints', type=str, required=False, default="")
-    parser.add_argument('--is_dist', action='store_true')
+    parser.add_argument(
+        '--update_method',
+        type=str,
+        default="local",
+        choices=["pserver", "nccl2", "local"])
     parser.add_argument('--trainer_id', type=int, required=False, default=0)
     parser.add_argument('--trainers', type=int, required=False, default=1)
     parser.add_argument(
@@ -164,13 +191,14 @@ def runtime_main(test_class):
     parser.add_argument(
         '--use_reader_alloc', action='store_true', required=False)
     parser.add_argument('--batch_size', required=False, type=int, default=2)
+    parser.add_argument('--lr', required=False, type=float, default=0.001)
     parser.add_argument(
         '--batch_merge_repeat', required=False, type=int, default=1)
 
     args = parser.parse_args()
 
     model = test_class()
-    if args.role == "pserver" and args.is_dist:
+    if args.role == "pserver" and args.update_method == "pserver":
         model.run_pserver(args)
     else:
         model.run_trainer(args)
@@ -199,6 +227,7 @@ class TestDistBase(unittest.TestCase):
     def setUp(self):
         self._trainers = 2
         self._pservers = 2
+        self._port_set = set()
         self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
             self._find_free_port(), self._find_free_port())
         self._python_interp = sys.executable
@@ -208,17 +237,27 @@ class TestDistBase(unittest.TestCase):
         self._use_reduce = False
         self._dc_asgd = False  # must use with async mode
         self._use_reader_alloc = True
+        self._nccl2_mode = False
+        self._lr = 0.001
         self._setup_config()
         self._after_setup_config()
 
     def _find_free_port(self):
-        with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
-            s.bind(('', 0))
-            return s.getsockname()[1]
+        def __free_port():
+            with closing(socket.socket(socket.AF_INET,
+                                       socket.SOCK_STREAM)) as s:
+                s.bind(('', 0))
+                return s.getsockname()[1]
+
+        while True:
+            port = __free_port()
+            if port not in self._port_set:
+                self._port_set.add(port)
+                return port
 
     def start_pserver(self, model_file, check_error_log, required_envs):
         ps0_ep, ps1_ep = self._ps_endpoints.split(",")
-        ps_cmd = "%s %s --role pserver --endpoints %s --trainer_id 0 --current_endpoint %s --trainers %d --is_dist"
+        ps_cmd = "%s %s --role pserver --endpoints %s --trainer_id 0 --current_endpoint %s --trainers %d --update_method pserver"
         ps0_cmd = ps_cmd % \
                   (self._python_interp, model_file, self._ps_endpoints, ps0_ep,
                    self._trainers)
@@ -258,7 +297,8 @@ class TestDistBase(unittest.TestCase):
                    batch_size=DEFAULT_BATCH_SIZE,
                    batch_merge_repeat=1):
 
-        cmd = "%s %s --role trainer" % (self._python_interp, model)
+        cmd = "%s %s --role trainer --lr %f" % (self._python_interp, model,
+                                                self._lr)
         if batch_size != DEFAULT_BATCH_SIZE:
             cmd += " --batch_size %d" % batch_size
         if batch_merge_repeat > 1:
@@ -270,7 +310,8 @@ class TestDistBase(unittest.TestCase):
         else:
             env_local = {'CPU_NUM': '1'}
 
-        envs.update(env_local)
+        env_local.update(envs)
+        print("local_cmd: {}, env: {}".format(cmd, env_local))
 
         if check_error_log:
             err_log = open("/tmp/trainer.err.log", "wb")
@@ -278,21 +319,21 @@ class TestDistBase(unittest.TestCase):
                 cmd.split(" "),
                 stdout=subprocess.PIPE,
                 stderr=err_log,
-                env=envs)
+                env=env_local)
         else:
             local_proc = subprocess.Popen(
                 cmd.split(" "),
                 stdout=subprocess.PIPE,
                 stderr=subprocess.PIPE,
-                env=envs)
+                env=env_local)
 
         local_out, local_err = local_proc.communicate()
 
         if check_error_log:
             err_log.close()
 
-        sys.stderr.write('local_stdout: %s\n' % pickle.loads(local_out))
         sys.stderr.write('local_stderr: %s\n' % local_err)
+        sys.stderr.write('local_stdout: %s\n' % pickle.loads(local_out))
 
         return pickle.loads(local_out)
 
@@ -303,13 +344,13 @@ class TestDistBase(unittest.TestCase):
 
         ps0_ep, ps1_ep = self._ps_endpoints.split(",")
 
-        tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --is_dist"
+        tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --update_method pserver --lr %f"
         tr0_cmd = tr_cmd % \
                   (self._python_interp, model, self._ps_endpoints,
-                   0, ps0_ep, self._trainers)
+                   0, ps0_ep, self._trainers, self._lr)
         tr1_cmd = tr_cmd % \
                   (self._python_interp, model, self._ps_endpoints,
-                   1, ps1_ep, self._trainers)
+                   1, ps1_ep, self._trainers, self._lr)
 
         if self._sync_mode:
             tr0_cmd += " --sync_mode"
@@ -335,8 +376,8 @@ class TestDistBase(unittest.TestCase):
         env0.update(envs)
         env1.update(envs)
 
-        print("tr0_cmd:{}".format(tr0_cmd))
-        print("tr1_cmd:{}".format(tr1_cmd))
+        print("tr0_cmd: {}, env: {}".format(tr0_cmd, env0))
+        print("tr1_cmd: {}, env: {}".format(tr1_cmd, env1))
         tr0_pipe = open("/tmp/tr0_err.log", "wb")
         tr1_pipe = open("/tmp/tr1_err.log", "wb")
 
@@ -351,28 +392,111 @@ class TestDistBase(unittest.TestCase):
             stderr=tr1_pipe,
             env=env1)
 
+        # Wait until trainer process terminate
+        while True:
+            stat0 = tr0_proc.poll()
+            time.sleep(0.1)
+            if stat0 is not None:
+                break
+        while True:
+            stat1 = tr1_proc.poll()
+            time.sleep(0.1)
+            if stat1 is not None:
+                break
+
         tr0_out, tr0_err = tr0_proc.communicate()
         tr1_out, tr1_err = tr1_proc.communicate()
 
         # close trainer file
         tr0_pipe.close()
         tr1_pipe.close()
-
         ps0_pipe.close()
         ps1_pipe.close()
-        # FIXME: use terminate() instead of sigkill.
-        os.kill(ps0.pid, signal.SIGKILL)
-        os.kill(ps1.pid, signal.SIGKILL)
+
         ps0.terminate()
         ps1.terminate()
 
+        # print server log
+        with open("/tmp/ps0_err.log", "r") as fn:
+            sys.stderr.write("ps0 stderr: %s\n" % fn.read())
+        with open("/tmp/ps1_err.log", "r") as fn:
+            sys.stderr.write("ps1 stderr: %s\n" % fn.read())
+
+        # print log
+        if stat0 == 0:
+            sys.stderr.write('trainer 0 stdout: %s\n' % pickle.loads(tr0_out))
+        with open("/tmp/tr0_err.log", "r") as fn:
+            sys.stderr.write('trainer 0 stderr: %s\n' % fn.read())
+        if stat1 == 0:
+            sys.stderr.write('trainer 1 stdout: %s\n' % pickle.loads(tr1_out))
+        with open("/tmp/tr1_err.log", "r") as fn:
+            sys.stderr.write('trainer 1 stderr: %s\n' % fn.read())
+
+        return pickle.loads(tr0_out), pickle.loads(tr1_out)
+
+    def _run_cluster_nccl2(self, model, envs, check_error_log):
+        # NOTE: we reuse ps_endpoints as nccl2 worker endpoints
+        worker_endpoints = self._ps_endpoints.split(",")
+        w0_ep, w1_ep = worker_endpoints
+
+        tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --update_method nccl2 --lr %f"
+        tr0_cmd = tr_cmd % \
+                  (self._python_interp, model, self._ps_endpoints,
+                   0, w0_ep, self._lr / 2)
+        tr1_cmd = tr_cmd % \
+                  (self._python_interp, model, self._ps_endpoints,
+                   1, w1_ep, self._lr / 2)
+
+        if self._mem_opt:
+            tr0_cmd += " --mem_opt"
+            tr1_cmd += " --mem_opt"
+        if self._use_reduce:
+            tr0_cmd += " --use_reduce"
+            tr1_cmd += " --use_reduce"
+        if self._use_reader_alloc:
+            tr0_cmd += " --use_reader_alloc"
+            tr1_cmd += " --use_reader_alloc"
+        if self.__use_cuda:
+            tr0_cmd += " --use_cuda"
+            tr1_cmd += " --use_cuda"
+            env0 = {"CUDA_VISIBLE_DEVICES": "0"}
+            env1 = {"CUDA_VISIBLE_DEVICES": "1"}
+        else:
+            env0 = {'CPU_NUM': '1'}
+            env1 = {'CPU_NUM': '1'}
+
+        env0.update(envs)
+        env1.update(envs)
+
+        print("tr0_cmd:{}, env: {}".format(tr0_cmd, env0))
+        print("tr1_cmd:{}, env: {}".format(tr1_cmd, env1))
+        tr0_pipe = open("/tmp/tr0_err.log", "wb")
+        tr1_pipe = open("/tmp/tr1_err.log", "wb")
+
+        tr0_proc = subprocess.Popen(
+            tr0_cmd.strip().split(" "),
+            stdout=subprocess.PIPE,
+            stderr=tr0_pipe,
+            env=env0)
+        tr1_proc = subprocess.Popen(
+            tr1_cmd.strip().split(" "),
+            stdout=subprocess.PIPE,
+            stderr=tr1_pipe,
+            env=env1)
+
+        tr0_out, tr0_err = tr0_proc.communicate()
+        tr1_out, tr1_err = tr1_proc.communicate()
+
+        # close trainer file
+        tr0_pipe.close()
+        tr1_pipe.close()
+
         # print log
-        sys.stderr.write('trainer 0 stdout: %s\n' % pickle.loads(tr0_out))
         sys.stderr.write('trainer 0 stderr: %s\n' % tr0_err)
-        sys.stderr.write('trainer 1 stdout: %s\n' % pickle.loads(tr1_out))
         sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err)
+        sys.stderr.write('trainer 0 stdout: %s\n' % tr0_out)
+        sys.stderr.write('trainer 1 stdout: %s\n' % tr1_out)
 
-        # return tr0_losses, tr1_losses
         return pickle.loads(tr0_out), pickle.loads(tr1_out)
 
     def check_with_place(self,
@@ -386,21 +510,27 @@ class TestDistBase(unittest.TestCase):
             "PYTHONPATH": os.getenv("PYTHONPATH", ""),
             "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
             "FLAGS_fraction_of_gpu_memory_to_use": "0.15",
+            "FLAGS_rpc_deadline": "5000",  # 5sec to fail fast
             "FLAGS_cudnn_deterministic": "1",
-            "http_proxy": ""
+            "http_proxy": "",
+            "NCCL_P2P_DISABLE": "1"
         }
 
         required_envs.update(need_envs)
 
         if check_error_log:
-            required_envs["GLOG_v"] = "7"
+            required_envs["GLOG_v"] = "3"
             required_envs["GLOG_logtostderr"] = "1"
 
         local_losses\
             = self._run_local(model_file, required_envs,
                                        check_error_log)
-        tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs,
-                                                   check_error_log)
+        if self._nccl2_mode:
+            tr0_losses, tr1_losses = self._run_cluster_nccl2(
+                model_file, required_envs, check_error_log)
+        else:
+            tr0_losses, tr1_losses = self._run_cluster(
+                model_file, required_envs, check_error_log)
 
         for step_id in range(RUN_STEP):
             local_loss = local_losses[step_id]
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist.py b/python/paddle/fluid/tests/unittests/test_dist_mnist.py
index 81eb651878209164b3f339cc5030dbac847942d1..49a2ca40e3cb1dd35027345e9c38eb8b6912d2cd 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist.py
@@ -26,6 +26,19 @@ class TestDistMnist2x2(TestDistBase):
         self.check_with_place("dist_mnist.py", delta=1e-5)
 
 
+class TestDistMnistNCCL2(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = True
+        self._use_reduce = False
+        self._use_reader_alloc = False
+        self._nccl2_mode = True
+
+    def test_dist_train(self):
+        import paddle.fluid as fluid
+        if fluid.core.is_compiled_with_cuda():
+            self.check_with_place("dist_mnist.py", delta=1e-5)
+
+
 class TestDistMnist2x2Lars(TestDistBase):
     def _setup_config(self):
         self._sync_mode = True
diff --git a/python/paddle/fluid/tests/unittests/test_dist_save_load.py b/python/paddle/fluid/tests/unittests/test_dist_save_load.py
index ea2b554dac83988955e3a7e8919e57a4ed7a8215..4588ca7c17ba5db893f080813d299feaa47626a7 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_save_load.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_save_load.py
@@ -44,7 +44,7 @@ class TestDistSaveLoadDense2x2(TestDistBase):
         required_envs.update(need_envs)
 
         if check_error_log:
-            required_envs["GLOG_v"] = "7"
+            required_envs["GLOG_v"] = "3"
             required_envs["GLOG_logtostderr"] = "1"
 
         model_dir = tempfile.mkdtemp()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
index d132dd3c48f55c07725515e40faeb5076398adeb..d9ad4e2e2c7b8d0a99d917495fbc8efc6cbd188d 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
@@ -769,6 +769,7 @@ class TestNCCL2Transpile(TranspilerTest):
 
             config = fluid.DistributeTranspilerConfig()
             config.mode = "nccl2"
+            config.wait_port = False
             t = fluid.DistributeTranspiler(config=config)
             t.transpile(
                 0,
@@ -782,5 +783,46 @@ class TestNCCL2Transpile(TranspilerTest):
             pass
 
 
+# test for remote prefetch
+class TestRemoteLookupTable(TestDistLookupTableBase):
+    def net_conf(self):
+        import os
+        os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1"
+        self.network_with_table(is_sparse=True, is_distributed=False)
+
+    def transpiler_test_impl(self):
+        pserver1, startup1 = self.get_pserver(self.pserver1_ep)
+
+        self.assertEqual(len(pserver1.blocks), 4)
+        # 0 listen_and_serv
+        # 1 optimize for fc_w or fc_b adam
+        self.assertEqual([op.type for op in pserver1.blocks[1].ops],
+                         ["sum", "scale", "adam", "scale", "scale"])
+        # 2 optimize for table adam
+        # NOTE: if param is not selected rows, the grad will scaled to grad / trainer_num
+        self.assertEqual([op.type for op in pserver1.blocks[2].ops],
+                         ["sum", "scale", "adam", "scale", "scale"])
+
+        # 3 optimize for table 2 adam
+        # NOTE: if param is not selected rows, the grad will scaled to grad / trainer_num
+        self.assertEqual([op.type for op in pserver1.blocks[3].ops],
+                         ["sum", "scale", "adam", "scale", "scale"])
+
+        trainer, _ = self.get_trainer()
+        self.assertEqual(len(trainer.blocks), 1)
+        ops = [
+            'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool',
+            'lookup_table', 'sequence_pool', 'concat', 'mul', 'elementwise_add',
+            'cross_entropy', 'mean', 'fill_constant', 'mean_grad',
+            'cross_entropy_grad', 'elementwise_add_grad', 'send', 'mul_grad',
+            'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad',
+            'split_selected_rows', 'send', 'sequence_pool_grad',
+            'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad',
+            'sum', 'split_selected_rows', 'send', 'send_barrier', 'recv',
+            'recv', 'fetch_barrier'
+        ]
+        self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..e91cfe0b45ab7e4e56fccf8d49eb381fbbd199d1
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py
@@ -0,0 +1,86 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+os.environ['FLAGS_eager_delete_tensor_gb'] = '0.0'
+os.environ['CPU_NUM'] = '2'
+
+import six
+import unittest
+
+import paddle
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+
+
+def train(network, use_cuda, use_parallel_executor, batch_size=32, pass_num=2):
+    if use_cuda and not core.is_compiled_with_cuda():
+        print('Skip use_cuda=True because Paddle is not compiled with cuda')
+        return
+
+    word_dict = paddle.dataset.imdb.word_dict()
+    train_reader = paddle.batch(
+        paddle.dataset.imdb.train(word_dict), batch_size=batch_size)
+
+    data = fluid.layers.data(
+        name="words", shape=[1], dtype="int64", lod_level=1)
+
+    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+
+    cost = network(data, label, len(word_dict))
+    optimizer = fluid.optimizer.Adagrad(learning_rate=0.2)
+    optimizer.minimize(cost)
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
+    reader = feeder.decorate_reader(
+        train_reader, multi_devices=use_parallel_executor)
+
+    exe = fluid.Executor(place)
+    exe.run(fluid.default_startup_program())
+
+    if use_parallel_executor:
+        train_exe = fluid.ParallelExecutor(
+            use_cuda=use_cuda, loss_name=cost.name)
+        fetch_list = [cost.name]
+    else:
+        train_exe = exe
+        fetch_list = [cost]
+
+    for pass_id in six.moves.xrange(pass_num):
+        batch_id = 0
+        for data in reader():
+            train_exe.run(feed=data,
+                          fetch_list=fetch_list if batch_id % 4 == 0 else [])
+            batch_id += 1
+            if batch_id > 16:
+                break
+
+
+class TestBase(unittest.TestCase):
+    def setUp(self):
+        self.net = None
+
+    def test_network(self):
+        if self.net is None:
+            return
+
+        for use_cuda in [True, False]:
+            for use_parallel_executor in [False, True]:
+                print('network: {}, use_cuda: {}, use_parallel_executor: {}'.
+                      format(self.net.__name__, use_cuda,
+                             use_parallel_executor))
+                with fluid.program_guard(fluid.Program(), fluid.Program()):
+                    with fluid.scope_guard(core.Scope()):
+                        train(self.net, use_cuda, use_parallel_executor)
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ed3d9fdf3bf765f1b9ef8ba1ef2a5795f1874c7
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py
@@ -0,0 +1,49 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+from test_eager_deletion_dynamic_rnn_base import TestBase
+import paddle.fluid as fluid
+
+
+def gru_net(data,
+            label,
+            dict_dim,
+            emb_dim=128,
+            hid_dim=128,
+            hid_dim2=96,
+            class_dim=2,
+            emb_lr=400.0):
+    emb = fluid.layers.embedding(
+        input=data,
+        size=[dict_dim, emb_dim],
+        param_attr=fluid.ParamAttr(learning_rate=emb_lr))
+    fc0 = fluid.layers.fc(input=emb, size=hid_dim * 3)
+    gru_h = fluid.layers.dynamic_gru(input=fc0, size=hid_dim, is_reverse=False)
+    gru_max = fluid.layers.sequence_pool(input=gru_h, pool_type='max')
+    gru_max_tanh = fluid.layers.tanh(gru_max)
+    fc1 = fluid.layers.fc(input=gru_max_tanh, size=hid_dim2, act='tanh')
+    prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax')
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+    return avg_cost
+
+
+class GRUTest(TestBase):
+    def setUp(self):
+        self.net = gru_net
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py
new file mode 100644
index 0000000000000000000000000000000000000000..8462c06aa56e0469fd06c7dc4b2ed514f7eb51ba
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py
@@ -0,0 +1,50 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from test_eager_deletion_dynamic_rnn_base import TestBase
+import paddle.fluid as fluid
+import unittest
+
+
+def lstm_net(data,
+             label,
+             dict_dim,
+             emb_dim=128,
+             hid_dim=128,
+             hid_dim2=96,
+             class_dim=2,
+             emb_lr=30.0):
+    emb = fluid.layers.embedding(
+        input=data,
+        size=[dict_dim, emb_dim],
+        param_attr=fluid.ParamAttr(learning_rate=emb_lr))
+    fc0 = fluid.layers.fc(input=emb, size=hid_dim * 4)
+    lstm_h, c = fluid.layers.dynamic_lstm(
+        input=fc0, size=hid_dim * 4, is_reverse=False)
+    lstm_max = fluid.layers.sequence_pool(input=lstm_h, pool_type='max')
+    lstm_max_tanh = fluid.layers.tanh(lstm_max)
+    fc1 = fluid.layers.fc(input=lstm_max_tanh, size=hid_dim2, act='tanh')
+    prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax')
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+    return avg_cost
+
+
+class LSTMTest(TestBase):
+    def setUp(self):
+        self.net = lstm_net
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ec1f0ae753724dac5c4675926ead87a097a7a99
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py
@@ -0,0 +1,27 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import unittest
+os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0"
+
+from test_parallel_executor_mnist import TestMNIST
+
+
+class EagerDeletionTestMNIST(TestMNIST):
+    pass
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..754d5fd40953311a5deb466fa42216f72671a65a
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py
@@ -0,0 +1,27 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import unittest
+os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0"
+
+from test_parallel_executor_transformer import TestTransformer
+
+
+class EagerDeletionTestTransformer(TestTransformer):
+    pass
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_get_tensor_from_selected_rows_op.py b/python/paddle/fluid/tests/unittests/test_get_tensor_from_selected_rows_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..021b950b3b6245caecab22d476bbb9d6b6b45c5e
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_get_tensor_from_selected_rows_op.py
@@ -0,0 +1,65 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import paddle.fluid.core as core
+import numpy as np
+from paddle.fluid.op import Operator
+
+
+class TestGetTensorFromSelectedRows(unittest.TestCase):
+    def get_places(self):
+        places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(core.CUDAPlace(0))
+        return places
+
+    def check_with_place(self, place):
+        scope = core.Scope()
+        x_rows = [0, 5, 5, 4, 20]
+        height = 20
+        row_numel = 2
+
+        np_array = np.ones((len(x_rows), row_numel)).astype("float32")
+        np_array[1, :] = 2.0
+        np_array[2, :] = 3.0
+        np_array[3, :] = 4.0
+
+        # initialize input variable X
+        x = scope.var('X').get_selected_rows()
+        x.set_rows(x_rows)
+        x.set_height(height)
+        x_tensor = x.get_tensor()
+        x_tensor.set(np_array, place)
+
+        # initialize input variable Out
+        out = scope.var("Out").get_tensor()
+
+        op = Operator("get_tensor_from_selected_rows", X="X", Out="Out")
+
+        op.run(scope, place)
+
+        out_array = np.array(out)
+        self.assertEqual((5, 2), out_array.shape)
+        assert (out_array == np_array).all()
+
+    def test_check_output(self):
+        for place in self.get_places():
+            self.check_with_place(place)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_gradient_clip.py b/python/paddle/fluid/tests/unittests/test_gradient_clip.py
new file mode 100644
index 0000000000000000000000000000000000000000..e49239da6d3918211fbbc302d2c56818460b6d51
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_gradient_clip.py
@@ -0,0 +1,161 @@
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import paddle
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+
+
+def bow_net(data,
+            label,
+            dict_dim,
+            emb_dim=128,
+            hid_dim=128,
+            hid_dim2=96,
+            class_dim=2):
+    """
+    BOW net
+    This model is from https://github.com/PaddlePaddle/models:
+    fluid/PaddleNLP/text_classification/nets.py
+    """
+    emb = fluid.layers.embedding(
+        input=data, is_sparse=True, size=[dict_dim, emb_dim])
+    bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
+    bow_tanh = fluid.layers.tanh(bow)
+    fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh")
+    fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh")
+    prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax")
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+
+    return avg_cost
+
+
+class TestGradientClip(unittest.TestCase):
+    def setUp(self):
+        self.word_dict = paddle.dataset.imdb.word_dict()
+        self.BATCH_SIZE = 2
+        self.train_data = paddle.batch(
+            paddle.dataset.imdb.train(self.word_dict),
+            batch_size=self.BATCH_SIZE)
+
+    def get_places(self):
+        places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(core.CUDAPlace(0))
+        return places
+
+    def check_operators(self, place):
+        CLIP = 1
+
+        prog = fluid.framework.Program()
+        startup_program = fluid.framework.Program()
+        with fluid.program_guard(
+                main_program=prog, startup_program=startup_program):
+            image = fluid.layers.data(name='x', shape=[784], dtype='float32')
+            label = fluid.layers.data(name='y', shape=[1], dtype='int64')
+
+            hidden1 = fluid.layers.fc(input=image, size=128, act='relu')
+            hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu')
+            predict = fluid.layers.fc(input=hidden2, size=10, act='softmax')
+
+            cost = fluid.layers.cross_entropy(input=predict, label=label)
+            avg_cost = fluid.layers.mean(cost)
+
+        prog_clip = prog.clone()
+        avg_cost_clip = prog_clip.block(0).var(avg_cost.name)
+
+        p_g = fluid.backward.append_backward(loss=avg_cost)
+        p_g_clip = fluid.backward.append_backward(loss=avg_cost_clip)
+
+        with fluid.program_guard(
+                main_program=prog_clip, startup_program=startup_program):
+            fluid.clip.set_gradient_clip(
+                fluid.clip.GradientClipByGlobalNorm(clip_norm=CLIP))
+            p_g_clip = fluid.clip.append_gradient_clip_ops(p_g_clip)
+
+        grad_list = [elem[1] for elem in p_g]
+        grad_clip_list = [elem[1] for elem in p_g_clip]
+
+        train_reader = paddle.batch(
+            paddle.reader.shuffle(
+                paddle.dataset.mnist.train(), buf_size=8192),
+            batch_size=128)
+
+        exe = fluid.Executor(place)
+        feeder = fluid.DataFeeder(feed_list=[image, label], place=place)
+        exe.run(startup_program)
+
+        count = 0
+        for data in train_reader():
+            count += 1
+            if count > 5:
+                break
+            out = exe.run(prog, feed=feeder.feed(data), fetch_list=grad_list)
+            out_clip = exe.run(prog_clip,
+                               feed=feeder.feed(data),
+                               fetch_list=grad_clip_list)
+            global_norm = 0
+            for v in out:
+                global_norm += np.sum(np.power(v, 2))
+            global_norm = np.sqrt(global_norm)
+
+            global_norm_clip = 0
+            for v in out_clip:
+                global_norm_clip += np.sum(np.power(v, 2))
+            global_norm_clip = np.sqrt(global_norm_clip)
+
+            assert np.isclose(
+                a=global_norm_clip, b=np.minimum(global_norm, CLIP), rtol=5e-3)
+
+    def check_sparse_gradient_clip(self, place):
+        prog = fluid.framework.Program()
+        startup_program = fluid.framework.Program()
+        with fluid.program_guard(
+                main_program=prog, startup_program=startup_program):
+            data = fluid.layers.data(
+                name="words", shape=[1], dtype="int64", lod_level=1)
+            label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+            cost = bow_net(data, label, len(self.word_dict))
+
+            fluid.clip.set_gradient_clip(
+                clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0))
+
+            sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01)
+            sgd_optimizer.minimize(cost)
+
+        exe = fluid.Executor(place)
+        feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
+        exe.run(startup_program)
+
+        data = next(self.train_data())
+        val = exe.run(prog, feed=feeder.feed(data), fetch_list=[cost])[0]
+        self.assertEqual((1, ), val.shape)
+        print(val)
+        self.assertFalse(np.isnan(val))
+
+    def test_operators(self):
+        self.check_operators(core.CPUPlace())
+
+    def test_sparse_gradient_clip(self):
+        for place in self.get_places():
+            self.check_sparse_gradient_clip(place)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5b6305155d1ef3dcf6ce590c221664754c5bdc8
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_imperative.py
@@ -0,0 +1,52 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import sys
+import numpy as np
+
+import paddle.fluid as fluid
+from paddle.fluid import core
+
+
+class MyLayer(fluid.imperative.PyLayer):
+    def __init__(self):
+        super(MyLayer, self).__init__()
+
+    def forward(self, inputs):
+        x = fluid.layers.relu(inputs[0])
+        self._x_for_debug = x
+        return [fluid.layers.elementwise_mul(x, x)]
+
+
+class TestImperative(unittest.TestCase):
+    def test_layer(self):
+        with fluid.imperative.guard():
+            cl = core.Layer()
+            cl.forward([])
+            l = fluid.imperative.PyLayer()
+            l.forward([])
+
+    def test_layer_in_out(self):
+        with fluid.imperative.guard():
+            l = MyLayer()
+            x = l(np.array([1.0, 2.0, -1.0], dtype=np.float32))[0]
+            self.assertIsNotNone(x)
+            sys.stderr.write("%s output: %s\n" % (x, x._numpy()))
+            x._backward()
+            sys.stderr.write("grad %s\n" % l._x_for_debug._gradient())
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index 541160771152dd2ebc8a782863bb4ad3643892e5..fb3e4da1efd32ca99f57da8f9955803ddde04f8a 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -170,9 +170,10 @@ class TestBook(unittest.TestCase):
         with program_guard(program):
             dat = layers.data(name='data', shape=[10], dtype='float32')
             lbl = layers.data(name='label', shape=[10], dtype='float32')
+            ignore_index = -1
             self.assertIsNotNone(
                 layers.sigmoid_cross_entropy_with_logits(
-                    x=dat, label=lbl))
+                    x=dat, label=lbl, ignore_index=ignore_index))
         print(str(program))
 
     def test_hsigmoid(self):
@@ -510,6 +511,16 @@ class TestBook(unittest.TestCase):
             self.assertIsNotNone(output)
         print(str(program))
 
+    def test_psroi_pool(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name="x", shape=[245, 30, 30], dtype="float32")
+            rois = layers.data(
+                name="rois", shape=[4], dtype="float32", lod_level=1)
+            output = layers.psroi_pool(x, rois, 5, 0.25, 7, 7)
+            self.assertIsNotNone(output)
+        print(str(program))
+
     def test_roi_align(self):
         program = Program()
         with program_guard(program):
@@ -636,13 +647,21 @@ class TestBook(unittest.TestCase):
         with program_guard(program):
             input = layers.data(
                 name="input", shape=[3, 100, 100], dtype="float32")
+            paddings = layers.fill_constant(shape=[4], dtype='int32', value=1)
             out = layers.pad2d(
                 input,
                 paddings=[1, 2, 3, 4],
                 mode='reflect',
                 data_format='NCHW',
                 name="shape")
+            out_1 = layers.pad2d(
+                input,
+                paddings=paddings,
+                mode='reflect',
+                data_format='NCHW',
+                name="shape")
             self.assertIsNotNone(out)
+            self.assertIsNotNone(out_1)
         print(str(program))
 
     def test_prelu(self):
@@ -837,6 +856,15 @@ class TestBook(unittest.TestCase):
             out = layers.cross_entropy(x, label, False, 4)
             self.assertIsNotNone(out)
 
+    def test_bpr_loss(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name="x", shape=[30, 10], dtype="float32")
+            label = layers.data(name="label", shape=[30, 1], dtype="int32")
+            out = layers.bpr_loss(x, label)
+            self.assertIsNotNone(out)
+        print(str(program))
+
     def test_expand(self):
         program = Program()
         with program_guard(program):
@@ -955,6 +983,15 @@ class TestBook(unittest.TestCase):
 
         print(str(program))
 
+    def test_batch_norm(self):
+        program = Program()
+        with program_guard(program):
+            data = layers.data(
+                name='data', shape=[32, 128, 128], dtype="float32")
+            out = layers.batch_norm(data)
+
+        print(str(program))
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_lookup_remote_table_op.py b/python/paddle/fluid/tests/unittests/test_lookup_remote_table_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..47830fb56b4e31018c2691cfa38c8d0d9cb4016e
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_lookup_remote_table_op.py
@@ -0,0 +1,203 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import os
+import signal
+import time
+import unittest
+from multiprocessing import Process
+
+import numpy as np
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
+from paddle.fluid.framework import Program, program_guard
+
+
+def run_pserver(pserver_id, use_cuda, sync_mode):
+    scope = fluid.core.Scope()
+    program = Program()
+    with fluid.scope_guard(scope):
+        with program_guard(program, startup_program=Program()):
+            # create table parameter in scope
+            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+            # create and initialize Param Variable
+            param = scope.var('table').get_tensor()
+
+            param_array = np.ones((10, 8)).astype("float32")
+            for i in range(len(param_array)):
+                param_array[i] *= param_array[i] * i + pserver_id * 10
+            param.set(param_array, place)
+
+            optimize_block = program._create_block(program.global_block().idx)
+            program.global_block().append_op(
+                type="listen_and_serv",
+                inputs={'X': []},
+                outputs={},
+                attrs={
+                    "optimize_blocks": [optimize_block],
+                    "endpoint": '127.0.0.1:0',
+                    "Fanin": 1,
+                    "sync_mode": True,
+                    "grad_to_block_id": []
+                })
+
+            exe = fluid.Executor(place)
+            exe.run(program)
+
+
+class TestListenAndServOp(unittest.TestCase):
+    def setUp(self):
+        self.ps_timeout = 5
+
+    def _start_pserver(self, pserver_id, use_cuda, sync_mode, pserver_func):
+        p = Process(target=pserver_func, args=(pserver_id, use_cuda, sync_mode))
+        p.daemon = True
+        p.start()
+        return p
+
+    def _wait_ps_ready(self, pid):
+        start_left_time = self.ps_timeout
+        sleep_time = 0.5
+        while True:
+            assert start_left_time >= 0, "wait ps ready failed"
+            time.sleep(sleep_time)
+            try:
+                # the listen_and_serv_op would touch a file which contains the listen port
+                # on the /tmp directory until it was ready to process all the RPC call.
+                os.stat("/tmp/paddle.%d.port" % pid)
+                return
+            except os.error:
+                start_left_time -= sleep_time
+
+    def _get_pserver_port(self, pid):
+        with open("/tmp/paddle.%d.port" % pid, 'r') as f:
+            port = int(f.read().strip())
+        return port
+
+    def _run_lookup_table_op_one_pserver(self, place, port):
+        scope = fluid.core.Scope()
+        program = Program()
+        with fluid.scope_guard(scope):
+            with program_guard(program, startup_program=Program()):
+                # create and initialize Param Variable
+                param = scope.var('W').get_tensor()
+                param_array = np.full((10, 8), 1.0).astype("float32")
+                param.set(param_array, place)
+
+                ids = scope.var('Ids').get_tensor()
+                ids_array = np.array([[1], [2], [5]]).astype("int64")
+                ids.set(ids_array, place)
+                ids_lod = [[0, 1, 2, 3]]
+                ids.set_lod(ids_lod)
+
+                out = scope.var('Out').get_tensor()
+
+                emaps = ['127.0.0.1:' + str(port)]
+                table_names = ['table']
+                height_sections = [10]
+
+                # create and run sgd operator
+                lookup_table_op = Operator(
+                    "lookup_table",
+                    W='W',
+                    Ids='Ids',
+                    Out='Out',
+                    remote_prefetch=True,
+                    epmap=emaps,
+                    table_names=table_names,
+                    height_sections=height_sections)
+                lookup_table_op.run(scope, place)
+
+                # get and compare result
+                result_array = np.array(out)
+
+                self.assertEqual(out.lod(), ids_lod)
+                self.assertEqual(list(result_array.shape), [len(ids_array), 8])
+                for i in range(len(ids_array)):
+                    id = ids_array[i][0]
+                    self.assertTrue((result_array[i] == id).all())
+
+    def _run_lookup_table_op_two_pserver(self, place, port0, port1):
+        scope = fluid.core.Scope()
+        program = Program()
+        with fluid.scope_guard(scope):
+            with program_guard(program, startup_program=Program()):
+                # create and initialize Param Variable
+                param = scope.var('W').get_tensor()
+                param_array = np.full((10, 8), 1.0).astype("float32")
+                param.set(param_array, place)
+
+                ids = scope.var('Ids').get_tensor()
+                ids_array = np.array([[1], [2], [11], [13]]).astype("int64")
+                ids.set(ids_array, place)
+                ids_lod = [[0, 2, 3, 4]]
+                ids.set_lod(ids_lod)
+
+                out = scope.var('Out').get_tensor()
+
+                emaps = ['127.0.0.1:' + str(port0), '127.0.0.1:' + str(port1)]
+                table_names = ['table', 'table']
+                height_sections = [10, 20]
+
+                # create and run sgd operator
+                lookup_table_op = Operator(
+                    "lookup_table",
+                    W='W',
+                    Ids='Ids',
+                    Out='Out',
+                    remote_prefetch=True,
+                    epmap=emaps,
+                    table_names=table_names,
+                    height_sections=height_sections)
+                lookup_table_op.run(scope, place)
+
+                # get and compare result
+                result_array = np.array(out)
+                self.assertEqual(out.lod(), ids_lod)
+                self.assertEqual(list(result_array.shape), [len(ids_array), 8])
+                for i in range(len(ids_array)):
+                    id = ids_array[i][0]
+                    self.assertTrue((result_array[i] == id).all())
+
+    def test_lookup_remote_table(self):
+        os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1"
+        # run pserver on CPU in sync mode
+        p0 = self._start_pserver(0, False, True, run_pserver)
+        self._wait_ps_ready(p0.pid)
+        port0 = self._get_pserver_port(p0.pid)
+
+        p1 = self._start_pserver(1, False, True, run_pserver)
+        self._wait_ps_ready(p1.pid)
+        port1 = self._get_pserver_port(p1.pid)
+
+        places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(core.CUDAPlace(0))
+
+        for place in places:
+            self._run_lookup_table_op_one_pserver(place, port0)
+            self._run_lookup_table_op_two_pserver(place, port0, port1)
+
+        # raise SIGTERM to pserver
+        os.kill(p0.pid, signal.SIGINT)
+        p0.join()
+        os.kill(p1.pid, signal.SIGINT)
+        p1.join()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e9e2e8429e51a328e397f9e2a05ab7209c9c1a2
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py
@@ -0,0 +1,192 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+
+import paddle.fluid.core as core
+from op_test import OpTest
+import paddle.fluid as fluid
+
+SIGMOID_THRESHOLD_MIN = -40.0
+SIGMOID_THRESHOLD_MAX = 13.0
+EXP_MAX_INPUT = 40.0
+
+
+def lstm_naive(
+        input,
+        w, ):
+    seq_len, batch_size, hidden_size = input.shape
+
+    offset = 0
+    wi = w[offset:offset + hidden_size * hidden_size].reshape(
+        (hidden_size, hidden_size)).transpose()
+    offset += hidden_size * hidden_size
+    wf = w[offset:offset + hidden_size * hidden_size].reshape(
+        (hidden_size, hidden_size)).transpose()
+    offset += hidden_size * hidden_size
+    wc = w[offset:offset + hidden_size * hidden_size].reshape(
+        (hidden_size, hidden_size)).transpose()
+    offset += hidden_size * hidden_size
+    wo = w[offset:offset + hidden_size * hidden_size].reshape(
+        (hidden_size, hidden_size)).transpose()
+    offset += hidden_size * hidden_size
+    ri = w[offset:offset + hidden_size * hidden_size].reshape(
+        (hidden_size, hidden_size)).transpose()
+    offset += hidden_size * hidden_size
+    rf = w[offset:offset + hidden_size * hidden_size].reshape(
+        (hidden_size, hidden_size)).transpose()
+    offset += hidden_size * hidden_size
+    rc = w[offset:offset + hidden_size * hidden_size].reshape(
+        (hidden_size, hidden_size)).transpose()
+    offset += hidden_size * hidden_size
+    ro = w[offset:offset + hidden_size * hidden_size].reshape(
+        (hidden_size, hidden_size)).transpose()
+    offset += hidden_size * hidden_size
+
+    bi_1 = w[offset:offset + hidden_size]
+    offset += hidden_size
+    bf_1 = w[offset:offset + hidden_size]
+    offset += hidden_size
+    bc_1 = w[offset:offset + hidden_size]
+    offset += hidden_size
+    bo_1 = w[offset:offset + hidden_size]
+    offset += hidden_size
+
+    bi_2 = w[offset:offset + hidden_size]
+    offset += hidden_size
+    bf_2 = w[offset:offset + hidden_size]
+    offset += hidden_size
+    bc_2 = w[offset:offset + hidden_size]
+    offset += hidden_size
+    bo_2 = w[offset:offset + hidden_size]
+
+    def sigmoid(x):
+        y = np.copy(x)
+        y[x < SIGMOID_THRESHOLD_MIN] = SIGMOID_THRESHOLD_MIN
+        y[x > SIGMOID_THRESHOLD_MAX] = SIGMOID_THRESHOLD_MAX
+        return 1. / (1. + np.exp(-y))
+
+    def tanh(x):
+        y = -2. * x
+        y[y > EXP_MAX_INPUT] = EXP_MAX_INPUT
+        return (2. / (1. + np.exp(y))) - 1.
+
+    output = []
+    pre_h = np.zeros((batch_size, hidden_size), dtype=input.dtype)
+    pre_c = np.zeros((batch_size, hidden_size), dtype=input.dtype)
+
+    for i in range(seq_len):
+        emb_1 = input[i]
+
+        input_gate = sigmoid(
+            np.matmul(emb_1, wi) + np.matmul(pre_h, ri) + bi_1 + bi_2)
+        forget_gate = sigmoid(
+            np.matmul(emb_1, wf) + np.matmul(pre_h, rf) + bf_1 + bf_2)
+        output_gate = sigmoid(
+            np.matmul(emb_1, wo) + np.matmul(pre_h, ro) + bo_1 + bo_2)
+        c_t_temp = tanh(
+            np.matmul(emb_1, wc) + np.matmul(pre_h, rc) + bc_1 + bc_2)
+        new_c = input_gate * c_t_temp + forget_gate * pre_c
+        new_h = output_gate * tanh(new_c)
+
+        pre_h = new_h
+        pre_c = new_c
+
+        output.append(new_h)
+
+    output = np.concatenate(output, -1)
+    output = output.reshape((batch_size, -1, hidden_size))
+
+    output = output.transpose((1, 0, 2))
+
+    return output, pre_h, pre_c
+
+
+class TestCUDNNLstmOp(OpTest):
+    def setUp(self):
+        self.op_type = "cudnn_lstm"
+        self.dtype = np.float32
+
+        num_steps = 20
+        batch_size = 5
+        hidden_size = 20
+
+        input_weight_size = (hidden_size * hidden_size) * 4
+        hidden_weight_size = (hidden_size * hidden_size) * 4
+        weight_size = input_weight_size + hidden_weight_size
+        weight_size += hidden_size * 8
+
+        input = np.random.uniform(
+            low=-0.1, high=0.1, size=(num_steps, batch_size,
+                                      hidden_size)).astype(self.dtype)
+        flat_w = np.random.uniform(
+            low=-0.1, high=0.1, size=(weight_size)).astype(self.dtype)
+
+        output, last_hidden, last_cell = lstm_naive(input, flat_w)
+
+        init_h = np.zeros((batch_size, hidden_size), dtype=np.float32)
+        init_c = np.zeros((batch_size, hidden_size), dtype=np.float32)
+        scope = core.Scope()
+        program = fluid.Program()
+        block = program.global_block()
+
+        cache_temp = block.create_var(
+            name="Cache",
+            persistable=True,
+            type=core.VarDesc.VarType.RAW,
+            stop_gradient=True)
+        self.inputs = {
+            'Input': OpTest.np_dtype_to_fluid_dtype(input),
+            'W': OpTest.np_dtype_to_fluid_dtype(flat_w),
+            'InitH': OpTest.np_dtype_to_fluid_dtype(init_h),
+            'InitC': OpTest.np_dtype_to_fluid_dtype(init_c),
+        }
+        self.cache_name_list = ['Cache']
+        self.attrs = {
+            'max_len': num_steps,
+            'dropout_prob': 0.0,
+            'is_bidirec': False,
+            'input_size': hidden_size,
+            'hidden_size': hidden_size,
+            'num_layers': 1,
+        }
+        self.outputs = {
+            'Out': output,
+            "last_h": last_hidden,
+            'last_c': last_cell
+        }
+
+    def test_output_with_place(self):
+        if self.testcuda():
+            place = core.CUDAPlace(0)
+            self.check_output_with_place(place, atol=1e-5)
+
+    def test_grad_with_place(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            self.check_grad_with_place(
+                place,
+                set(['Input', 'W', 'InitH', 'InitC']),
+                ['Out', 'last_h', 'last_c'],
+                max_relative_error=0.02)
+
+    def testcuda(self):
+        return core.is_compiled_with_cuda()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py b/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py
index 275e5c49d5c298a95b012582a74f8073b800991e..fa16f082880eb97f54abe8bf75e26321f72b3bd3 100644
--- a/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py
@@ -22,6 +22,15 @@ from paddle.fluid.framework import Program, program_guard
 from paddle.fluid.transpiler import memory_optimize
 
 
+def _get_vars(prog):
+    assert (isinstance(prog, Program))
+    all_vars = set()
+    for op in prog.global_block().ops:
+        all_vars.update(op.input_arg_names)
+        all_vars.update(op.output_arg_names)
+    return all_vars
+
+
 class TestControlFlowGraph(unittest.TestCase):
     def setUp(self):
         program = Program()
@@ -37,11 +46,11 @@ class TestControlFlowGraph(unittest.TestCase):
         self.program = program
 
     def test_control_flow_graph(self):
-        print("before optimization")
-        print(str(self.program))
-        result_program = memory_optimize(self.program)
-        print("after optimization")
-        print(str(result_program))
+        result_program = self.program.clone()
+        memory_optimize(self.program)
+        old_vars = _get_vars(self.program)
+        new_vars = _get_vars(result_program)
+        self.assertTrue(old_vars != new_vars)
 
 
 class TestMemoryTranspiler2(unittest.TestCase):
@@ -58,14 +67,22 @@ class TestMemoryTranspiler2(unittest.TestCase):
             avg_cost = layers.mean(cost)
             opt = optimizer.SGD(learning_rate=0.001)
             opt.minimize(avg_cost)
+        self.skip_set = set([cost.name, fc.name])
         self.program = program
 
     def test_inplace_ops(self):
-        print("before optimization")
-        print(str(self.program))
-        result_program = memory_optimize(self.program)
-        print("after optimization")
-        print(str(result_program))
+        result_program = self.program.clone()
+        memory_optimize(self.program)
+        old_vars = _get_vars(self.program)
+        new_vars = _get_vars(result_program)
+        self.assertTrue(old_vars != new_vars)
+
+    def test_skip_opt(self):
+        result_program = self.program.clone()
+        memory_optimize(self.program, skip_opt_set=self.skip_set)
+        old_vars = _get_vars(self.program)
+        new_vars = _get_vars(result_program)
+        self.assertTrue(old_vars != new_vars)
 
 
 class TestMemoryTranspiler3(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/unittests/test_merge_selectedrows_op.py b/python/paddle/fluid/tests/unittests/test_merge_selectedrows_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce64da0478d3997f4889ca942c67e0defac80b45
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_merge_selectedrows_op.py
@@ -0,0 +1,73 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import paddle.fluid.core as core
+import numpy as np
+from paddle.fluid.op import Operator
+
+
+class TestMergeSelectedRows(unittest.TestCase):
+    def get_places(self):
+        places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(core.CUDAPlace(0))
+        return places
+
+    def check_with_place(self, place):
+        scope = core.Scope()
+        x_rows = [0, 5, 5, 4, 20]
+        out_rows = [0, 4, 5, 20]
+        height = 20
+        row_numel = 2
+
+        np_array = np.ones((len(x_rows), row_numel)).astype("float32")
+        np_array[1, :] = 2.0
+        np_array[2, :] = 3.0
+        np_array[3, :] = 4.0
+
+        # initialize input variable X
+        x = scope.var('X').get_selected_rows()
+        x.set_rows(x_rows)
+        x.set_height(height)
+        x_tensor = x.get_tensor()
+        x_tensor.set(np_array, place)
+
+        # initialize input variable Out
+        out = scope.var("Out").get_selected_rows()
+
+        op = Operator("merge_selected_rows", X="X", Out="Out")
+
+        op.run(scope, place)
+
+        self.assertEqual(out.rows(), out_rows)
+        self.assertEqual(out.height(), height)
+
+        out_array = np.array(out.get_tensor())
+        self.assertEqual((4, 2), out_array.shape)
+
+        assert (out_array[0, :] == 1.0).all()
+        assert (out_array[1, :] == 4.0).all()
+        assert (out_array[2, :] == 5.0).all()
+        assert (out_array[3, :] == 1.0).all()
+
+    def test_check_output(self):
+        for place in self.get_places():
+            self.check_with_place(place)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_pad2d_op.py b/python/paddle/fluid/tests/unittests/test_pad2d_op.py
index 728b8c181a4410d7df7f304bcc8d2816e91ea6d8..5c4a6ca59e53d0edafda87eae19516a80ec32c40 100644
--- a/python/paddle/fluid/tests/unittests/test_pad2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pad2d_op.py
@@ -20,11 +20,17 @@ from op_test import OpTest
 class TestPad2dOp(OpTest):
     def setUp(self):
         self.pad_value = 0.0
+        self.variable_paddings = False
         self.initTestCase()
         self.op_type = "pad2d"
         self.inputs = {'X': np.random.random(self.shape).astype("float32"), }
         self.attrs = {}
-        self.attrs['paddings'] = np.array(self.paddings).flatten()
+        if self.variable_paddings:
+            self.attrs['paddings'] = []
+            self.inputs['Paddings'] = np.array(self.paddings).flatten().astype(
+                "int32")
+        else:
+            self.attrs['paddings'] = np.array(self.paddings).flatten()
         self.attrs['pad_value'] = self.pad_value
         self.attrs['mode'] = self.mode
         self.attrs['data_format'] = self.data_format
@@ -98,5 +104,24 @@ class TestCase5(TestPad2dOp):
         self.data_format = "NHWC"
 
 
+class TestCase6(TestPad2dOp):
+    def initTestCase(self):
+        self.shape = (2, 4, 4, 2)
+        self.paddings = [0, 1, 2, 3]
+        self.mode = "constant"
+        self.pad_value = 1.2
+        self.data_format = "NHWC"
+        self.variable_paddings = True
+
+
+class TestCase7(TestPad2dOp):
+    def initTestCase(self):
+        self.shape = (2, 3, 4, 4)
+        self.paddings = [0, 1, 2, 3]
+        self.mode = "reflect"
+        self.data_format = "NCHW"
+        self.variable_paddings = True
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_psroi_pool_op.py b/python/paddle/fluid/tests/unittests/test_psroi_pool_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..abe014a38c6ecfd008b0f1028536bfb49b628fb4
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_psroi_pool_op.py
@@ -0,0 +1,134 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import math
+import numpy as np
+import unittest
+from op_test import OpTest
+
+
+class TestPSROIPoolOp(OpTest):
+    def set_data(self):
+        self.init_test_case()
+        self.make_rois()
+        self.calc_psroi_pool()
+        self.inputs = {'X': self.x, 'ROIs': (self.rois[:, 1:5], self.rois_lod)}
+        self.attrs = {
+            'output_channels': self.output_channels,
+            'spatial_scale': self.spatial_scale,
+            'pooled_height': self.pooled_height,
+            'pooled_width': self.pooled_width
+        }
+        self.outputs = {'Out': self.outs}
+
+    def init_test_case(self):
+        self.batch_size = 3
+        self.channels = 3 * 2 * 2
+        self.height = 6
+        self.width = 4
+
+        self.x_dim = [self.batch_size, self.channels, self.height, self.width]
+
+        self.spatial_scale = 1.0 / 4.0
+        self.output_channels = 3
+        self.pooled_height = 2
+        self.pooled_width = 2
+
+        self.x = np.random.random(self.x_dim).astype('float32')
+
+    def make_rois(self):
+        rois = []
+        self.rois_lod = [[]]
+        for bno in range(self.batch_size):
+            self.rois_lod[0].append(bno + 1)
+            for i in range(bno + 1):
+                x1 = np.random.random_integers(
+                    0, self.width // self.spatial_scale - self.pooled_width)
+                y1 = np.random.random_integers(
+                    0, self.height // self.spatial_scale - self.pooled_height)
+
+                x2 = np.random.random_integers(x1 + self.pooled_width,
+                                               self.width // self.spatial_scale)
+                y2 = np.random.random_integers(
+                    y1 + self.pooled_height, self.height // self.spatial_scale)
+                roi = [bno, x1, y1, x2, y2]
+                rois.append(roi)
+        self.rois_num = len(rois)
+        self.rois = np.array(rois).astype('float32')
+
+    def calc_psroi_pool(self):
+        output_shape = (self.rois_num, self.output_channels, self.pooled_height,
+                        self.pooled_width)
+        out_data = np.zeros(output_shape)
+        for i in range(self.rois_num):
+            roi = self.rois[i]
+            roi_batch_id = int(roi[0])
+            roi_start_w = round(roi[1]) * self.spatial_scale
+            roi_start_h = round(roi[2]) * self.spatial_scale
+            roi_end_w = (round(roi[3]) + 1.) * self.spatial_scale
+            roi_end_h = (round(roi[4]) + 1.) * self.spatial_scale
+
+            roi_height = max(roi_end_h - roi_start_h, 0.1)
+            roi_width = max(roi_end_w - roi_start_w, 0.1)
+
+            bin_size_h = roi_height / float(self.pooled_height)
+            bin_size_w = roi_width / float(self.pooled_width)
+
+            x_i = self.x[roi_batch_id]
+
+            for c in range(self.output_channels):
+                for ph in range(self.pooled_height):
+                    for pw in range(self.pooled_width):
+                        hstart = int(
+                            math.floor(float(ph) * bin_size_h + roi_start_h))
+                        wstart = int(
+                            math.floor(float(pw) * bin_size_w + roi_start_w))
+                        hend = int(
+                            math.ceil(
+                                float(ph + 1) * bin_size_h + roi_start_h))
+                        wend = int(
+                            math.ceil(
+                                float(pw + 1) * bin_size_w + roi_start_w))
+                        hstart = min(max(hstart, 0), self.height)
+                        hend = min(max(hend, 0), self.height)
+                        wstart = min(max(wstart, 0), self.width)
+                        wend = min(max(wend, 0), self.width)
+
+                        c_in = (c * self.pooled_height + ph
+                                ) * self.pooled_width + pw
+                        is_empty = (hend <= hstart) or (wend <= wstart)
+                        out_sum = 0.
+                        for ih in range(hstart, hend):
+                            for iw in range(wstart, wend):
+                                out_sum += x_i[c_in, ih, iw]
+                        bin_area = (hend - hstart) * (wend - wstart)
+                        out_data[i, c, ph, pw] = 0. if is_empty else (
+                            out_sum / float(bin_area))
+        self.outs = out_data.astype('float32')
+
+    def setUp(self):
+        self.op_type = 'psroi_pool'
+        self.set_data()
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_regularizer.py b/python/paddle/fluid/tests/unittests/test_regularizer.py
index 20f91cf4485f2e79c20fe90143c8b7deebb9fc49..62994eec7e7f56267a0990d9a5e3b5c62d7d5fe4 100644
--- a/python/paddle/fluid/tests/unittests/test_regularizer.py
+++ b/python/paddle/fluid/tests/unittests/test_regularizer.py
@@ -15,7 +15,12 @@
 from __future__ import print_function
 
 import unittest
-
+from functools import partial
+import contextlib
+import numpy as np
+import paddle
+import paddle.fluid.core as core
+import paddle.fluid as fluid
 import paddle.fluid.framework as framework
 import paddle.fluid.optimizer as optimizer
 import paddle.fluid.regularizer as regularizer
@@ -97,5 +102,134 @@ class TestL1DecayRegularizer(unittest.TestCase):
         self.assertEqual(block.ops[-3].type, 'sign')
 
 
+def bow_net(data,
+            label,
+            dict_dim,
+            is_sparse=False,
+            emb_dim=128,
+            hid_dim=128,
+            hid_dim2=96,
+            class_dim=2):
+    """
+    BOW net
+    This model is from https://github.com/PaddlePaddle/models:
+    fluid/PaddleNLP/text_classification/nets.py
+    """
+    emb = fluid.layers.embedding(
+        input=data, is_sparse=is_sparse, size=[dict_dim, emb_dim])
+    bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
+    bow_tanh = fluid.layers.tanh(bow)
+    fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh")
+    fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh")
+    prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax")
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+
+    return avg_cost
+
+
+class TestRegularizer(unittest.TestCase):
+    def setUp(self):
+        self.word_dict = paddle.dataset.imdb.word_dict()
+        reader = paddle.batch(
+            paddle.dataset.imdb.train(self.word_dict), batch_size=8)()
+        self.train_data = [next(reader) for _ in range(5)]
+
+    def get_places(self):
+        places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(core.CUDAPlace(0))
+        return places
+
+    @contextlib.contextmanager
+    def scope_prog_guard(self, main_prog, startup_prog):
+        scope = fluid.core.Scope()
+        with fluid.unique_name.guard():
+            with fluid.scope_guard(scope):
+                with fluid.program_guard(main_prog, startup_prog):
+                    yield
+
+    def run_program(self, place, feed_list):
+        exe = fluid.Executor(place)
+        feeder = fluid.DataFeeder(feed_list=feed_list, place=place)
+        exe.run(fluid.default_startup_program())
+
+        main_prog = fluid.default_main_program()
+        param_list = [var.name for var in main_prog.block(0).all_parameters()]
+
+        param_sum = []
+        for data in self.train_data:
+            out = exe.run(main_prog,
+                          feed=feeder.feed(data),
+                          fetch_list=param_list)
+            p_sum = 0
+            for v in out:
+                p_sum += np.sum(np.abs(v))
+            param_sum.append(p_sum)
+        return param_sum
+
+    def check_l2decay_regularizer(self, place, model):
+        main_prog = fluid.framework.Program()
+        startup_prog = fluid.framework.Program()
+        startup_prog.random_seed = 1
+        with self.scope_prog_guard(
+                main_prog=main_prog, startup_prog=startup_prog):
+            data = fluid.layers.data(
+                name="words", shape=[1], dtype="int64", lod_level=1)
+            label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+
+            avg_cost = model(data, label, len(self.word_dict))
+
+            optimizer = fluid.optimizer.Adagrad(
+                learning_rate=0.1,
+                regularization=fluid.regularizer.L2Decay(1.0))
+            optimizer.minimize(avg_cost)
+            param_sum = self.run_program(place, [data, label])
+        return param_sum
+
+    def check_l2decay(self, place, model):
+        main_prog = fluid.framework.Program()
+        startup_prog = fluid.framework.Program()
+        startup_prog.random_seed = 1
+        with self.scope_prog_guard(
+                main_prog=main_prog, startup_prog=startup_prog):
+            data = fluid.layers.data(
+                name="words", shape=[1], dtype="int64", lod_level=1)
+            label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+
+            avg_cost_l2 = model(data, label, len(self.word_dict))
+
+            param_list = fluid.default_main_program().block(0).all_parameters()
+            para_sum = []
+            for para in param_list:
+                para_mul = fluid.layers.square(x=para)
+                para_sum.append(fluid.layers.reduce_sum(input=para_mul))
+            avg_cost_l2 += fluid.layers.sums(para_sum) * .5
+
+            optimizer = fluid.optimizer.Adagrad(learning_rate=0.1)
+            optimizer.minimize(avg_cost_l2)
+            param_sum = self.run_program(place, [data, label])
+        return param_sum
+
+    def test_l2(self):
+        for place in self.get_places():
+            dense_sparse_p_sum = []
+            for sparse in [True, False]:
+                model = partial(bow_net, is_sparse=sparse)
+                framework_l2 = self.check_l2decay_regularizer(place, model)
+                l2 = self.check_l2decay(place, model)
+                assert len(l2) == len(framework_l2)
+                for i in range(len(l2)):
+                    assert np.isclose(a=framework_l2[i], b=l2[i], rtol=5e-5)
+                dense_sparse_p_sum.append(framework_l2)
+
+            assert len(dense_sparse_p_sum[0]) == len(dense_sparse_p_sum[1])
+            for i in range(len(dense_sparse_p_sum[0])):
+                assert np.isclose(
+                    a=dense_sparse_p_sum[0][i],
+                    b=dense_sparse_p_sum[1][i],
+                    rtol=5e-5)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py b/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py
index 97ff203499c0bf223930c904de46e1abdd902799..41797a241cab9f2b3bc4b492a1c4b6db89ac2948 100644
--- a/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py
@@ -56,6 +56,40 @@ class TestSigmoidCrossEntropyWithLogitsOp2(OpTest):
     """Test sigmoid_cross_entropy_with_logit_op with probabalistic label
     """
 
+    def setUp(self):
+        self.op_type = "sigmoid_cross_entropy_with_logits"
+        batch_size = 64
+        num_classes = 20
+        ignore_index = -1
+        self.inputs = {
+            'X': logit(
+                np.random.uniform(0, 1, (batch_size, num_classes))
+                .astype("float32")),
+            'Label': np.random.randint(-1, 2, (batch_size, num_classes))
+            .astype("float32")
+        }
+        self.attrs = {'ignore_index': ignore_index, }
+        # Fw Pass is implemented as elementwise sigmoid followed by
+        # elementwise logistic loss
+        # Label * -log(sigmoid(X)) + (1 - label) * -log(1 - sigmoid(X))
+        sigmoid_X = expit(self.inputs['X'])
+        term1 = self.inputs['Label'] * np.log(sigmoid_X)
+        term2 = (1 - self.inputs['Label']) * np.log(1 - sigmoid_X)
+        out = -term1 - term2
+        out[np.where(self.inputs['Label'] == ignore_index)] = 0
+        self.outputs = {'Out': out}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestSigmoidCrossEntropyWithLogitsOp3(OpTest):
+    """Test sigmoid_cross_entropy_with_logit_op with probabalistic label
+    """
+
     def setUp(self):
         self.op_type = "sigmoid_cross_entropy_with_logits"
         batch_size = 64
diff --git a/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py b/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py
index 50204b8a77c187aa695da83860960566448d290f..f8847e1570dc47d432777faa15f4004f1a7111a6 100644
--- a/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py
+++ b/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py
@@ -63,6 +63,7 @@ class TestSpliteSelectedRows(unittest.TestCase):
         # expected output selected rows
         expected_out0_rows = [0, 4]
         expected_out1_rows = [0, 2]
+        expected_out2_rows = []
         expected_out4_rows = [0]
 
         op = Operator(
@@ -75,6 +76,7 @@ class TestSpliteSelectedRows(unittest.TestCase):
 
         self.assertEqual(outs[0].rows(), expected_out0_rows)
         self.assertEqual(outs[1].rows(), expected_out1_rows)
+        self.assertEqual(outs[2].rows(), expected_out2_rows)
         self.assertEqual(outs[4].rows(), expected_out4_rows)
 
         self.assertEqual(outs[0].height(), height_sections[0])
@@ -84,6 +86,9 @@ class TestSpliteSelectedRows(unittest.TestCase):
         self.assertAlmostEqual(4.0, np.array(outs[1].get_tensor())[1, 1])
         self.assertAlmostEqual(8.0, np.array(outs[4].get_tensor())[0, 1])
 
+        self.assertEqual(outs[2].numel(), 0)
+        self.assertEqual(outs[3].numel(), 0)
+
     def check_grad_with_place(self, place):
         scope = core.Scope()
         height = 10
diff --git a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..544fe4b4f81909b69a05d9751316e3d3137fdc45
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py
@@ -0,0 +1,215 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import division
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+from paddle.fluid import core
+
+
+def sigmoid(x):
+    return 1.0 / (1.0 + np.exp(-1.0 * x))
+
+
+def mse(x, y, num):
+    return ((y - x)**2).sum() / num
+
+
+def bce(x, y, mask):
+    x = x.reshape((-1))
+    y = y.reshape((-1))
+    mask = mask.reshape((-1))
+
+    error_sum = 0.0
+    count = 0
+    for i in range(x.shape[0]):
+        if mask[i] > 0:
+            error_sum += y[i] * np.log(x[i]) + (1 - y[i]) * np.log(1 - x[i])
+            count += 1
+    return error_sum / (-1.0 * count)
+
+
+def box_iou(box1, box2):
+    b1_x1 = box1[0] - box1[2] / 2
+    b1_x2 = box1[0] + box1[2] / 2
+    b1_y1 = box1[1] - box1[3] / 2
+    b1_y2 = box1[1] + box1[3] / 2
+    b2_x1 = box2[0] - box2[2] / 2
+    b2_x2 = box2[0] + box2[2] / 2
+    b2_y1 = box2[1] - box2[3] / 2
+    b2_y2 = box2[1] + box2[3] / 2
+
+    b1_area = (b1_x2 - b1_x1) * (b1_y2 - b1_y1)
+    b2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1)
+
+    inter_rect_x1 = max(b1_x1, b2_x1)
+    inter_rect_y1 = max(b1_y1, b2_y1)
+    inter_rect_x2 = min(b1_x2, b2_x2)
+    inter_rect_y2 = min(b1_y2, b2_y2)
+    inter_area = max(inter_rect_x2 - inter_rect_x1, 0) * max(
+        inter_rect_y2 - inter_rect_y1, 0)
+
+    return inter_area / (b1_area + b2_area + inter_area)
+
+
+def build_target(gtboxs, gtlabel, attrs, grid_size):
+    n, b, _ = gtboxs.shape
+    ignore_thresh = attrs["ignore_thresh"]
+    anchors = attrs["anchors"]
+    class_num = attrs["class_num"]
+    an_num = len(anchors) // 2
+    obj_mask = np.zeros((n, an_num, grid_size, grid_size)).astype('float32')
+    noobj_mask = np.ones((n, an_num, grid_size, grid_size)).astype('float32')
+    tx = np.zeros((n, an_num, grid_size, grid_size)).astype('float32')
+    ty = np.zeros((n, an_num, grid_size, grid_size)).astype('float32')
+    tw = np.zeros((n, an_num, grid_size, grid_size)).astype('float32')
+    th = np.zeros((n, an_num, grid_size, grid_size)).astype('float32')
+    tconf = np.zeros((n, an_num, grid_size, grid_size)).astype('float32')
+    tcls = np.zeros(
+        (n, an_num, grid_size, grid_size, class_num)).astype('float32')
+
+    for i in range(n):
+        for j in range(b):
+            if gtboxs[i, j, :].sum() == 0:
+                continue
+
+            gt_label = gtlabel[i, j]
+            gx = gtboxs[i, j, 0] * grid_size
+            gy = gtboxs[i, j, 1] * grid_size
+            gw = gtboxs[i, j, 2] * grid_size
+            gh = gtboxs[i, j, 3] * grid_size
+
+            gi = int(gx)
+            gj = int(gy)
+
+            gtbox = [0, 0, gw, gh]
+            max_iou = 0
+            for k in range(an_num):
+                anchor_box = [0, 0, anchors[2 * k], anchors[2 * k + 1]]
+                iou = box_iou(gtbox, anchor_box)
+                if iou > max_iou:
+                    max_iou = iou
+                    best_an_index = k
+                if iou > ignore_thresh:
+                    noobj_mask[i, best_an_index, gj, gi] = 0
+
+            obj_mask[i, best_an_index, gj, gi] = 1
+            noobj_mask[i, best_an_index, gj, gi] = 0
+            tx[i, best_an_index, gj, gi] = gx - gi
+            ty[i, best_an_index, gj, gi] = gy - gj
+            tw[i, best_an_index, gj, gi] = np.log(gw / anchors[2 *
+                                                               best_an_index])
+            th[i, best_an_index, gj, gi] = np.log(
+                gh / anchors[2 * best_an_index + 1])
+            tconf[i, best_an_index, gj, gi] = 1
+            tcls[i, best_an_index, gj, gi, gt_label] = 1
+
+    return (tx, ty, tw, th, tconf, tcls, obj_mask, noobj_mask)
+
+
+def YoloV3Loss(x, gtbox, gtlabel, attrs):
+    n, c, h, w = x.shape
+    an_num = len(attrs['anchors']) // 2
+    class_num = attrs["class_num"]
+    x = x.reshape((n, an_num, 5 + class_num, h, w)).transpose((0, 1, 3, 4, 2))
+    pred_x = sigmoid(x[:, :, :, :, 0])
+    pred_y = sigmoid(x[:, :, :, :, 1])
+    pred_w = x[:, :, :, :, 2]
+    pred_h = x[:, :, :, :, 3]
+    pred_conf = sigmoid(x[:, :, :, :, 4])
+    pred_cls = sigmoid(x[:, :, :, :, 5:])
+
+    tx, ty, tw, th, tconf, tcls, obj_mask, noobj_mask = build_target(
+        gtbox, gtlabel, attrs, x.shape[2])
+
+    obj_mask_expand = np.tile(
+        np.expand_dims(obj_mask, 4), (1, 1, 1, 1, int(attrs['class_num'])))
+    loss_x = mse(pred_x * obj_mask, tx * obj_mask, obj_mask.sum())
+    loss_y = mse(pred_y * obj_mask, ty * obj_mask, obj_mask.sum())
+    loss_w = mse(pred_w * obj_mask, tw * obj_mask, obj_mask.sum())
+    loss_h = mse(pred_h * obj_mask, th * obj_mask, obj_mask.sum())
+    loss_conf_target = bce(pred_conf * obj_mask, tconf * obj_mask, obj_mask)
+    loss_conf_notarget = bce(pred_conf * noobj_mask, tconf * noobj_mask,
+                             noobj_mask)
+    loss_class = bce(pred_cls * obj_mask_expand, tcls * obj_mask_expand,
+                     obj_mask_expand)
+
+    return attrs['loss_weight_xy'] * (loss_x + loss_y) \
+            + attrs['loss_weight_wh'] * (loss_w + loss_h) \
+            + attrs['loss_weight_conf_target'] * loss_conf_target \
+            + attrs['loss_weight_conf_notarget'] * loss_conf_notarget \
+            + attrs['loss_weight_class'] * loss_class
+
+
+class TestYolov3LossOp(OpTest):
+    def setUp(self):
+        self.loss_weight_xy = 1.0
+        self.loss_weight_wh = 1.0
+        self.loss_weight_conf_target = 1.0
+        self.loss_weight_conf_notarget = 1.0
+        self.loss_weight_class = 1.0
+        self.initTestCase()
+        self.op_type = 'yolov3_loss'
+        x = np.random.random(size=self.x_shape).astype('float32')
+        gtbox = np.random.random(size=self.gtbox_shape).astype('float32')
+        gtlabel = np.random.randint(0, self.class_num,
+                                    self.gtbox_shape[:2]).astype('int32')
+
+        self.attrs = {
+            "anchors": self.anchors,
+            "class_num": self.class_num,
+            "ignore_thresh": self.ignore_thresh,
+            "loss_weight_xy": self.loss_weight_xy,
+            "loss_weight_wh": self.loss_weight_wh,
+            "loss_weight_conf_target": self.loss_weight_conf_target,
+            "loss_weight_conf_notarget": self.loss_weight_conf_notarget,
+            "loss_weight_class": self.loss_weight_class,
+        }
+
+        self.inputs = {'X': x, 'GTBox': gtbox, 'GTLabel': gtlabel}
+        self.outputs = {
+            'Loss': np.array(
+                [YoloV3Loss(x, gtbox, gtlabel, self.attrs)]).astype('float32')
+        }
+
+    def test_check_output(self):
+        place = core.CPUPlace()
+        self.check_output_with_place(place, atol=1e-3)
+
+    def test_check_grad_ignore_gtbox(self):
+        place = core.CPUPlace()
+        self.check_grad_with_place(
+            place, ['X'],
+            'Loss',
+            no_grad_set=set(["GTBox", "GTLabel"]),
+            max_relative_error=0.06)
+
+    def initTestCase(self):
+        self.anchors = [10, 13, 12, 12]
+        self.class_num = 10
+        self.ignore_thresh = 0.5
+        self.x_shape = (5, len(self.anchors) // 2 * (5 + self.class_num), 7, 7)
+        self.gtbox_shape = (5, 10, 4)
+        self.loss_weight_xy = 2.5
+        self.loss_weight_wh = 0.8
+        self.loss_weight_conf_target = 1.5
+        self.loss_weight_conf_notarget = 0.5
+        self.loss_weight_class = 1.2
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/testsuite.py b/python/paddle/fluid/tests/unittests/testsuite.py
index 34fbb1b549cf5fc5f75bcc0715e5c83665f1d200..dc3b2cb8bc15836a4bf067caa05c3a37a917ecad 100644
--- a/python/paddle/fluid/tests/unittests/testsuite.py
+++ b/python/paddle/fluid/tests/unittests/testsuite.py
@@ -20,7 +20,7 @@ import paddle.fluid.core as core
 from paddle.fluid.op import Operator
 
 
-def create_op(scope, op_type, inputs, outputs, attrs):
+def create_op(scope, op_type, inputs, outputs, attrs, cache_list=None):
     kwargs = dict()
 
     op_maker = core.op_proto_and_checker_maker
@@ -43,6 +43,11 @@ def create_op(scope, op_type, inputs, outputs, attrs):
                     __create_var__(in_name, sub_in_name)
             else:
                 __create_var__(in_name, in_name)
+    if cache_list != None and isinstance(cache_list, list):
+        for name in cache_list:
+            kwargs[name] = []
+            scope.var(name)
+            kwargs[name].append(name)
 
     for out_name, out_dup in Operator.get_op_outputs(op_type):
         if out_name in outputs:
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index ebd0d18d36eed4fffed86ba0903ff76f6052ef7a..d21ec42dccde80fd354a730274edb04f654113c3 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -125,13 +125,14 @@ def slice_variable(var_list, slice_count, min_block_size):
 
 class DistributeTranspilerConfig(object):
     """
-    slice_var_up (bool): Do Tensor slice for pservers, default is True.
-    split_method (PSDispatcher): RoundRobin or HashName can be used
-        try to choose the best method to balance loads for pservers.
-    min_block_size (int): Minimum splitted element number in block.
-        According:https://github.com/PaddlePaddle/Paddle/issues/8638#issuecomment-369912156
-        We can use bandwidth effiently when data size is larger than 2MB.If you
-        want to change it, please be sure you see the slice_variable function.
+    Args:
+        slice_var_up (bool): Do Tensor slice for pservers, default is True.
+        split_method (PSDispatcher): RoundRobin or HashName can be used
+          try to choose the best method to balance loads for pservers.
+        min_block_size (int): Minimum splitted element number in block.
+          According:https://github.com/PaddlePaddle/Paddle/issues/8638#issuecomment-369912156
+          We can use bandwidth effiently when data size is larger than 2MB.If you
+          want to change it, please be sure you see the slice_variable function.
     """
 
     slice_var_up = True
@@ -141,6 +142,7 @@ class DistributeTranspilerConfig(object):
     # supported modes: pserver, nccl2
     mode = "pserver"
     print_log = False
+    wait_port = True
 
 
 class DistributeTranspiler(object):
@@ -163,35 +165,34 @@ class DistributeTranspiler(object):
     Examples:
         .. code-block:: python
 
-           # for pserver mode
-           pserver_endpoints = "192.168.0.1:6174,192.168.0.2:6174"
-           trainer_endpoints = "192.168.0.1:6174,192.168.0.2:6174"
-           current_endpoint = "192.168.0.1:6174"
-           trainer_id = 0
-           trainers = 4
-           role = os.getenv("PADDLE_TRAINING_ROLE")
-
-           t = fluid.DistributeTranspiler()
-           t.transpile(
-                trainer_id, pservers=pserver_endpoints, trainers=trainers)
-           if role == "PSERVER":
-                pserver_program = t.get_pserver_program(current_endpoint)
-                pserver_startup_program = t.get_startup_program(current_endpoint,
+            # for pserver mode
+            pserver_endpoints = "192.168.0.1:6174,192.168.0.2:6174"
+            trainer_endpoints = "192.168.0.1:6174,192.168.0.2:6174"
+            current_endpoint = "192.168.0.1:6174"
+            trainer_id = 0
+            trainers = 4
+            role = os.getenv("PADDLE_TRAINING_ROLE")
+            t = fluid.DistributeTranspiler()
+            t.transpile(
+                 trainer_id, pservers=pserver_endpoints, trainers=trainers)
+            if role == "PSERVER":
+                 pserver_program = t.get_pserver_program(current_endpoint)
+                 pserver_startup_program = t.get_startup_program(current_endpoint,
                                                                 pserver_program)
-           elif role == "TRAINER":
-                trainer_program = t.get_trainer_program()
-
-           # for nccl2 mode
-           config = fluid.DistributeTranspilerConfig()
-           config.mode = "nccl2"
-           t = fluid.DistributeTranspiler(config=config)
-           t.transpile(trainer_id, workers=workers, current_endpoint=curr_ep)
-           exe = fluid.ParallelExecutor(
-               use_cuda,
-               loss_name=loss_var.name,
-               num_trainers=len(trainers.split(",)),
-               trainer_id=trainer_id
-           )
+            elif role == "TRAINER":
+                 trainer_program = t.get_trainer_program()
+
+            # for nccl2 mode
+            config = fluid.DistributeTranspilerConfig()
+            config.mode = "nccl2"
+            t = fluid.DistributeTranspiler(config=config)
+            t.transpile(trainer_id, workers=workers, current_endpoint=curr_ep)
+            exe = fluid.ParallelExecutor(
+                use_cuda,
+                loss_name=loss_var.name,
+                num_trainers=len(trainers.split(",)),
+                trainer_id=trainer_id
+            )
     """
 
     def __init__(self, config=None):
@@ -213,13 +214,16 @@ class DistributeTranspiler(object):
                          trainer_id,
                          trainers,
                          current_endpoint,
-                         startup_program=None):
+                         startup_program=None,
+                         wait_port=True):
         if not startup_program:
             startup_program = default_startup_program()
         if trainer_id >= 0:
             worker_endpoints = trainers.split(",")
             # send NCCL_ID to others or recv from trainer 0
             worker_endpoints.remove(current_endpoint)
+            if trainer_id == 0 and wait_port:
+                wait_server_ready(worker_endpoints)
 
             nccl_id_var = startup_program.global_block().create_var(
                 name="NCCLID", persistable=True, type=core.VarDesc.VarType.RAW)
@@ -236,6 +240,31 @@ class DistributeTranspiler(object):
         else:
             raise ValueError("must set trainer_id > 0")
 
+    def _get_all_remote_sparse_update_op(self, main_program):
+        sparse_update_ops = []
+        sparse_update_op_types = ["lookup_table"]
+        for op in main_program.global_block().ops:
+            if op.type in sparse_update_op_types and op.attr(
+                    'remote_prefetch') is True and not op.attr(
+                        'is_distributed'):
+                sparse_update_ops.append(op)
+        return sparse_update_ops
+
+    def _update_remote_sparse_update_op(self, param_varname, height_sections,
+                                        endpint_map, table_names):
+        for op in self.sparse_update_ops:
+            if param_varname in op.input_arg_names:
+                op._set_attr('epmap', endpint_map)
+                op._set_attr('table_names', table_names)
+                op._set_attr('height_sections', height_sections)
+                op._set_attr('trainer_id', self.trainer_id)
+
+    def _is_input_of_remote_sparse_update_op(self, param_name):
+        for op in self.sparse_update_ops:
+            if param_name in op.input_arg_names:
+                return True
+        return False
+
     def transpile(self,
                   trainer_id,
                   program=None,
@@ -276,11 +305,13 @@ class DistributeTranspiler(object):
 
         if self.config.mode == "nccl2":
             assert (isinstance(trainers, str))
+            self.origin_program._trainers_endpoints = trainers.split(",")
             self._transpile_nccl2(
                 trainer_id,
                 trainers,
                 current_endpoint,
-                startup_program=startup_program)
+                startup_program=startup_program,
+                wait_port=self.config.wait_port)
             return
 
         self.trainer_num = trainers
@@ -299,6 +330,12 @@ class DistributeTranspiler(object):
             self.param_name_to_grad_name[param_var.name] = grad_var.name
             self.grad_name_to_param_name[grad_var.name] = param_var.name
 
+        # get all sparse update ops
+        self.sparse_update_ops = self._get_all_remote_sparse_update_op(
+            self.origin_program)
+        # use_sparse_update_param_name -> split_height_section
+        self.sparse_param_to_height_sections = dict()
+
         # add distributed attrs to program
         self.origin_program._is_distributed = True
         self.origin_program._endpoints = self.pserver_endpoints
@@ -336,6 +373,13 @@ class DistributeTranspiler(object):
                 splited_grad_varname = splited_vars[0].name
                 index = find_op_by_output_arg(
                     program.global_block(), splited_grad_varname, reverse=True)
+                if splited_vars[0].type == core.VarDesc.VarType.SELECTED_ROWS:
+                    sparse_param_name = self.grad_name_to_param_name[
+                        grad_varname]
+                    if self._is_input_of_remote_sparse_update_op(
+                            sparse_param_name):
+                        self.sparse_param_to_height_sections[
+                            sparse_param_name] = [splited_vars[0].shape[0]]
             elif len(splited_vars) > 1:
                 orig_var = program.global_block().vars[splited_grad_varname]
                 index = find_op_by_output_arg(
@@ -406,16 +450,18 @@ class DistributeTranspiler(object):
         all_recv_outputs = []
         for param_varname, splited_var in six.iteritems(self.param_var_mapping):
             eps = []
+            table_names = []
             for var in splited_var:
                 index = [v.name for v in recv_vars].index(var.name)
                 eps.append(eplist[index])
+                table_names.append(var.name)
             if self.sync_mode:
                 recv_dep_in = send_barrier_out
             else:
                 # connect deps to send op in async mode
                 recv_dep_in = self.grad_name_to_send_dummy_out[
                     self.param_name_to_grad_name[param_varname]]
-            all_recv_outputs.extend(splited_var)
+
             # get recv op_role_var, if not splited, the grad should have .trainer suffix
             # if splited, grad should be the original grad var name. ParallelExecutor
             # will use op_role_var to get expected device place to run this op.
@@ -425,18 +471,25 @@ class DistributeTranspiler(object):
             if len(splited_trainer_grad) == 1:
                 recv_op_role_var_name = splited_trainer_grad[0].name
 
-            program.global_block().append_op(
-                type="recv",
-                inputs={"X": [recv_dep_in]},
-                outputs={"Out": splited_var},
-                attrs={
-                    "epmap": eps,
-                    "trainer_id": self.trainer_id,
-                    RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE,
-                    OP_ROLE_VAR_ATTR_NAME:
-                    [param_varname, recv_op_role_var_name],
-                    "sync_mode": not self.sync_mode
-                })
+            if param_varname in self.sparse_param_to_height_sections:
+                height_sections = self.sparse_param_to_height_sections[
+                    param_varname]
+                self._update_remote_sparse_update_op(
+                    param_varname, height_sections, eps, table_names)
+            else:
+                all_recv_outputs.extend(splited_var)
+                program.global_block().append_op(
+                    type="recv",
+                    inputs={"X": [recv_dep_in]},
+                    outputs={"Out": splited_var},
+                    attrs={
+                        "epmap": eps,
+                        "trainer_id": self.trainer_id,
+                        RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE,
+                        OP_ROLE_VAR_ATTR_NAME:
+                        [param_varname, recv_op_role_var_name],
+                        "sync_mode": not self.sync_mode
+                    })
 
         if self.sync_mode:
             # form a WAW dependency
@@ -454,14 +507,15 @@ class DistributeTranspiler(object):
             if len(splited_var) <= 1:
                 continue
             orig_param = program.global_block().vars[param_varname]
-            program.global_block().append_op(
-                type="concat",
-                inputs={"X": splited_var},
-                outputs={"Out": [orig_param]},
-                attrs={
-                    "axis": 0,
-                    RPC_OP_ROLE_ATTR_NAME: DIST_OP_ROLE_ATTR_VALUE
-                })
+            if param_varname not in self.sparse_param_to_height_sections:
+                program.global_block().append_op(
+                    type="concat",
+                    inputs={"X": splited_var},
+                    outputs={"Out": [orig_param]},
+                    attrs={
+                        "axis": 0,
+                        RPC_OP_ROLE_ATTR_NAME: DIST_OP_ROLE_ATTR_VALUE
+                    })
 
         self._get_trainer_startup_program(recv_vars=recv_vars, eplist=eplist)
 
@@ -603,9 +657,6 @@ class DistributeTranspiler(object):
         # NOTE: assume blocks of the same variable is not distributed
         # on the same pserver, only change param/grad varnames for
         # trainers to fetch.
-        sys.stderr.write("get_pserver_program() is deprecated, call \
-get_pserver_programs() to get pserver main and startup \
-in a single call.")
         # step1
         pserver_program = Program()
         pserver_program.random_seed = self.origin_program.random_seed
@@ -873,18 +924,6 @@ in a single call.")
         Returns:
             Program: parameter server side startup program.
         """
-        sys.stderr.write("get_startup_program() is deprecated, call \
-get_pserver_programs() to get pserver main and startup \
-in a single call.")
-        if pserver_program != None:
-            sys.stderr.write("passing pserver_program to get_startup_program() \
-is deprecated, you can use new API get_pserver_programs() to \
-get both pserver main program and startup program.")
-        if startup_program != None:
-            sys.stderr.write("passing startup_program to get_startup_program() \
-is deprecated, use fluid.program_guard() or pass this argument \
-to transpile() call.")
-
         s_prog = Program()
         orig_s_prog = self.startup_program
         s_prog.random_seed = orig_s_prog.random_seed
@@ -1420,6 +1459,10 @@ to transpile() call.")
             height_sections = []
             for v in splited_vars:
                 height_sections.append(v.shape[0])
+            sparse_param_name = self.grad_name_to_param_name[orig_var.name]
+            if self._is_input_of_remote_sparse_update_op(sparse_param_name):
+                self.sparse_param_to_height_sections[
+                    sparse_param_name] = height_sections
             program.global_block()._insert_op(
                 index=index + 1,
                 type="split_selected_rows",
diff --git a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
index c9f1be934773cc28f026f2b867b9e3a4f7aa8472..95aafec05361a8b66b849268c7a738bb2ee5da86 100755
--- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
+++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
@@ -14,6 +14,7 @@
 
 from __future__ import print_function
 
+import six
 from collections import defaultdict, MutableSet
 from .. import core
 from ... import compat as cpt
@@ -470,8 +471,21 @@ def memory_optimize(input_program,
     Returns:
         None
     """
+
+    def to_name_str(var):
+        if isinstance(var, Variable):
+            return var.desc.name()
+        elif isinstance(var, str):
+            return var
+        elif isinstance(var, six.string_types):
+            return str(var)
+        else:
+            raise TypeError(str(var) + " should be Variable or str")
+
     if level != 0 and level != 1:
         raise ValueError("only support opt_level 0 or 1.")
+    if skip_opt_set is not None and not isinstance(skip_opt_set, set):
+        raise ValueError("only support skip_opt_set as set.")
     global PRINT_LOG
     PRINT_LOG = print_log
     if skip_grads:
@@ -486,6 +500,8 @@ def memory_optimize(input_program,
             skip_opt_set = grad_set
         else:
             skip_opt_set.update(grad_set)
+    if skip_opt_set is not None:
+        skip_opt_set = set(map(to_name_str, skip_opt_set))
     cfgs = _get_cfgs(input_program)
     for cfg in cfgs:
         cfg.memory_optimize(skip_opt_set=skip_opt_set, level=level)
diff --git a/python/paddle/reader/tests/decorator_test.py b/python/paddle/reader/tests/decorator_test.py
index b9af8348e16c051db64d57a9594aee303d83aef2..a9dddbbcc82e649b6c98db0fd58c62b58435b8db 100644
--- a/python/paddle/reader/tests/decorator_test.py
+++ b/python/paddle/reader/tests/decorator_test.py
@@ -62,10 +62,10 @@ class TestBuffered(unittest.TestCase):
         for idx, i in enumerate(b()):
             elapsed_time = time.time() - last_time
             if i == 0:
-                time.sleep(0.3)
+                time.sleep(1)
             else:
                 # read time should be short, meaning already buffered.
-                self.assertLess(elapsed_time, 0.05)
+                self.assertLess(elapsed_time, 0.08)
             last_time = time.time()
 
 
diff --git a/python/setup.py.in b/python/setup.py.in
index 200b96ec54ee5daeb905e155d0b7b57ab7740250..0eb69cdb5c7d140527dba7a648728750bfb404f7 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -101,6 +101,7 @@ packages=['paddle',
           'paddle.dataset',
           'paddle.reader',
           'paddle.fluid',
+          'paddle.fluid.imperative',
           'paddle.fluid.proto',
           'paddle.fluid.proto.profiler',
           'paddle.fluid.layers',
@@ -165,9 +166,9 @@ if '${WITH_MKL}' == 'ON':
     shutil.copy('${MKLML_LIB}', libs_path)
     shutil.copy('${MKLML_IOMP_LIB}', libs_path)
     package_data['paddle.libs']+=['libmklml_intel' + ext_name,'libiomp5' + ext_name]
-if '${CMAKE_BUILD_TYPE}' == 'Release':
-    # only change rpath in Release mode.
-    if '${WITH_MKLDNN}' == 'ON':
+if '${WITH_MKLDNN}' == 'ON':
+    if '${CMAKE_BUILD_TYPE}' == 'Release':
+        # only change rpath in Release mode.
         # TODO(typhoonzero): use install_name_tool to patch mkl libs once
         # we can support mkl on mac.
         #
@@ -177,14 +178,19 @@ if '${CMAKE_BUILD_TYPE}' == 'Release':
         command = "patchelf --set-rpath '$ORIGIN/' ${MKLDNN_SHARED_LIB}"
         if os.system(command) != 0:
             raise Exception("patch libmkldnn.so failed, command: %s" % command)
-        package_data['paddle.libs']+=['libmkldnn.so.0']
-        shutil.copy('${MKLDNN_SHARED_LIB}', libs_path)
+    package_data['paddle.libs']+=['libmkldnn.so.0']
+    shutil.copy('${MKLDNN_SHARED_LIB}', libs_path)
 if '${WITH_NGRAPH}' == 'ON':
+    # only change rpath in Release mode,
+    # since in Debug mode, nGraph lib may be too large to be changed?
     if '${CMAKE_BUILD_TYPE}' == 'Release':
-        # only change rpath in Release mode.
-        command = "patchelf --set-rpath '$ORIGIN/' ${NGRAPH_SHARED_LIB}"
-        if os.system(command) != 0:
-            raise Exception("patch ${NGRAPH_SHARED_LIB_NAME} failed, command: %s" % command)
+        if os.name != 'nt':
+            if "@APPLE@" == "1":
+                command = "install_name_tool -id \"@loader_path/\" ${NGRAPH_SHARED_LIB}"
+            else:
+                command = "patchelf --set-rpath '$ORIGIN/' ${NGRAPH_SHARED_LIB}"
+            if os.system(command) != 0:
+                raise Exception("patch ${NGRAPH_SHARED_LIB_NAME} failed, command: %s" % command)
     shutil.copy('${NGRAPH_SHARED_LIB}', libs_path)
     shutil.copy('${NGRAPH_CPU_LIB}', libs_path)
     shutil.copy('${NGRAPH_TBB_LIB}', libs_path)
diff --git a/tools/print_signatures.py b/tools/print_signatures.py
index e2805c4e7e6aa26a5865b64a874feef672bf9b36..7e61dde0a446cf5bfe656105ffd2472f03576f05 100644
--- a/tools/print_signatures.py
+++ b/tools/print_signatures.py
@@ -15,7 +15,7 @@
 Print all signature of a python module in alphabet order.
 
 Usage:
-    ./print_signature  "paddle.fluid" > signature.txt
+    ./print_signature  "paddle.fluid,paddle.reader" > signature.txt
 """
 from __future__ import print_function
 
@@ -27,6 +27,8 @@ import pydoc
 
 member_dict = collections.OrderedDict()
 
+experimental_namespace = {"paddle.fluid.imperative"}
+
 
 def visit_member(parent_name, member):
     cur_name = ".".join([parent_name, member.__name__])
@@ -43,13 +45,16 @@ def visit_member(parent_name, member):
                 line.strip() for line in pydoc.render_doc(member).split('\n')
                 if "->" in line
             ])
-
+    elif inspect.isgetsetdescriptor(member):
+        return
     else:
         raise RuntimeError("Unsupported generate signature of member, type {0}".
                            format(str(type(member))))
 
 
 def visit_all_module(mod):
+    if (mod.__name__ in experimental_namespace):
+        return
     for member_name in (
             name
             for name in (mod.__all__ if hasattr(mod, "__all__") else dir(mod))
@@ -63,7 +68,9 @@ def visit_all_module(mod):
             visit_member(mod.__name__, instance)
 
 
-visit_all_module(importlib.import_module(sys.argv[1]))
+modules = sys.argv[1].split(",")
+for m in modules:
+    visit_all_module(importlib.import_module(m))
 
 for name in member_dict:
     print(name, member_dict[name])