Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add_huber_regression_loss_op

test=develop

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add_huber_regression_loss_op
test=develop
65d355a7 · minqiyang · c550e0ce · e2130502 · 65d355a7 · 65d355a7
372 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -54,7 +54,7 @@ option(WITH_PYTHON      "Compile PaddlePaddle with python interpreter"  ON)
 option(WITH_DOUBLE      "Compile PaddlePaddle with double precision"    OFF)
 option(WITH_RDMA        "Compile PaddlePaddle with RDMA support"        OFF)
 option(WITH_TIMER       "Compile PaddlePaddle with stats timer"         OFF)
-option(WITH_PROFILER    "Compile PaddlePaddle with GPU profiler"        OFF)
+option(WITH_PROFILER    "Compile PaddlePaddle with GPU profiler and gperftools"        OFF)
 option(WITH_DOC         "Compile PaddlePaddle with documentation"       OFF)
 option(WITH_COVERAGE    "Compile PaddlePaddle with code coverage"       OFF)
 option(COVERALLS_UPLOAD "Package code coverage data to coveralls"       OFF)
@@ -254,6 +254,12 @@ elseif()
    set(WITH_ANAKIN OFF CACHE STRING "Anakin is used in MKL only now." FORCE)
 endif()

+# Profiler builds need gperftools. The include path and the WITH_GPERFTOOLS
+# macro are set at directory scope, so they apply to the targets of this
+# directory tree rather than to a single target.
+if(WITH_PROFILER)
+    find_package(Gperftools REQUIRED)
+    add_definitions(-DWITH_GPERFTOOLS)
+    include_directories(${GPERFTOOLS_INCLUDE_DIR})
+endif()
+
 include(generic)            # simplify cmake module
 include(package)            # set paddle packages
 include(ccache)             # set ccache for compilation

--- a/README.md
+++ b/README.md
@@ -2,8 +2,8 @@


 [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
-[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.1/getstarted/index_en.html)
-[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.1/beginners_guide/index.html)
+[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html)
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html)
 [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
 [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)

@@ -19,7 +19,7 @@ Our vision is to enable deep learning for everyone via PaddlePaddle.
 Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest features of PaddlePaddle.


-### Latest PaddlePaddle Release: [Fluid 1.1.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.1)
+### Latest PaddlePaddle Release: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2)
 ### Install Latest Stable Release:
 ```
 # Linux CPU
@@ -27,9 +27,9 @@ pip install paddlepaddle
 # Linux GPU cuda9cudnn7
 pip install paddlepaddle-gpu
 # Linux GPU cuda8cudnn7
-pip install paddlepaddle-gpu==1.1.0.post87
+pip install paddlepaddle-gpu==1.2.0.post87
 # Linux GPU cuda8cudnn5
-pip install paddlepaddle-gpu==1.1.0.post85
+pip install paddlepaddle-gpu==1.2.0.post85

 # For installation on other platforms, refer to http://paddlepaddle.org/
 ```
@@ -76,26 +76,26 @@ pip install paddlepaddle-gpu==1.1.0.post85

 ## Installation

-It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.1/beginners_guide/index.html) on our website.
+It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html) on our website.

 ## Documentation

-We provide [English](http://paddlepaddle.org/documentation/docs/en/1.1/getstarted/index_en.html) and
-[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.1/beginners_guide/index.html) documentation.
+We provide [English](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html) and
+[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html) documentation.

 - [Deep Learning 101](https://github.com/PaddlePaddle/book)

  You might want to start from this online interactive book that can run in a Jupyter Notebook.

- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/1.1/user_guides/howto/training/cluster_howto.html)
+- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html)

  You can run distributed training jobs on MPI clusters.

- [Python API](http://paddlepaddle.org/documentation/api/zh/1.1/fluid.html)
+- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.2/api_cn/index_cn.html)

   Our new API enables much shorter programs.

- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/1.1/advanced_usage/development/contribute_to_paddle.html)
+- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/1.2/advanced_usage/development/contribute_to_paddle/index_cn.html)

   We appreciate your contributions!


--- a/cmake/FindGperftools.cmake
+++ b/cmake/FindGperftools.cmake
+# Tries to find Gperftools.
+#
+# Usage of this module as follows:
+#
+#     find_package(Gperftools)
+#
+# Variables used by this module, they can change the default behaviour and need
+# to be set before calling find_package:
+#
+#  Gperftools_ROOT_DIR  Set this variable to the root installation of
+#                       Gperftools if the module has problems finding
+#                       the proper installation path.
+#
+# Variables defined by this module:
+#
+#  GPERFTOOLS_FOUND              System has Gperftools libs/headers
+#  GPERFTOOLS_LIBRARIES          The Gperftools libraries (tcmalloc & profiler)
+#  GPERFTOOLS_INCLUDE_DIR        The location of Gperftools headers
+#
+# Imported targets defined by this module (each one is created only when the
+# package was found AND its own library was located):
+#
+#  gperftools::tcmalloc          The tcmalloc library
+#  gperftools::profiler          The profiler library
+
+find_library(GPERFTOOLS_TCMALLOC
+  NAMES tcmalloc
+  HINTS ${Gperftools_ROOT_DIR}/lib)
+
+find_library(GPERFTOOLS_PROFILER
+  NAMES profiler
+  HINTS ${Gperftools_ROOT_DIR}/lib)
+
+find_library(GPERFTOOLS_TCMALLOC_AND_PROFILER
+  NAMES tcmalloc_and_profiler
+  HINTS ${Gperftools_ROOT_DIR}/lib)
+
+find_path(GPERFTOOLS_INCLUDE_DIR
+  NAMES gperftools/heap-profiler.h
+  HINTS ${Gperftools_ROOT_DIR}/include)
+
+set(GPERFTOOLS_LIBRARIES ${GPERFTOOLS_TCMALLOC_AND_PROFILER})
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(
+  Gperftools
+  DEFAULT_MSG
+  GPERFTOOLS_LIBRARIES
+  GPERFTOOLS_INCLUDE_DIR)
+
+mark_as_advanced(
+  Gperftools_ROOT_DIR
+  GPERFTOOLS_TCMALLOC
+  GPERFTOOLS_PROFILER
+  GPERFTOOLS_TCMALLOC_AND_PROFILER
+  GPERFTOOLS_LIBRARIES
+  GPERFTOOLS_INCLUDE_DIR)
+
+# Create IMPORTED targets.
+#
+# The found-check above only vouches for the combined tcmalloc_and_profiler
+# library and the headers; the stand-alone tcmalloc / profiler libraries are
+# looked up separately and may be missing. Each target is therefore gated on
+# its own library, so a target is never created with a "-NOTFOUND"
+# IMPORTED_LOCATION that would only surface as a link-time failure.
+if (Gperftools_FOUND)
+  if (GPERFTOOLS_TCMALLOC AND NOT TARGET gperftools::tcmalloc)
+    add_library(gperftools::tcmalloc UNKNOWN IMPORTED)
+    set_target_properties(gperftools::tcmalloc PROPERTIES
+      IMPORTED_LOCATION "${GPERFTOOLS_TCMALLOC}"
+      INTERFACE_INCLUDE_DIRECTORIES "${GPERFTOOLS_INCLUDE_DIR}")
+  endif()
+  if (GPERFTOOLS_PROFILER AND NOT TARGET gperftools::profiler)
+    add_library(gperftools::profiler UNKNOWN IMPORTED)
+    set_target_properties(gperftools::profiler PROPERTIES
+      IMPORTED_LOCATION "${GPERFTOOLS_PROFILER}"
+      INTERFACE_INCLUDE_DIRECTORIES "${GPERFTOOLS_INCLUDE_DIR}")
+  endif()
+endif()
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -86,6 +86,7 @@ endif(NOT WITH_GOLANG)

 if(WITH_GPU)
    add_definitions(-DPADDLE_WITH_CUDA)
+    add_definitions(-DEIGEN_USE_GPU)

    FIND_PACKAGE(CUDA REQUIRED)


--- a/cmake/external/gzstream.cmake
+++ b/cmake/external/gzstream.cmake
@@ -27,13 +27,14 @@ SET(GZSTREAM_INCLUDE_DIR "${GZSTREAM_INSTALL_DIR}/include/" CACHE PATH "gzstream

 ExternalProject_Add(
        extern_gzstream
+        DEPENDS zlib
        GIT_REPOSITORY "https://github.com/jacquesqiao/gzstream.git"
        GIT_TAG ""
        PREFIX          ${GZSTREAM_SOURCES_DIR}
        UPDATE_COMMAND  ""
        CONFIGURE_COMMAND ""
        BUILD_IN_SOURCE 1
-        BUILD_COMMAND   make -j8
+        BUILD_COMMAND   make EXTERN_CPPFLAGS="-I${THIRD_PARTY_PATH}/install/zlib/include" EXTERM_LDFLAGS="-L${THIRD_PARTY_PATH}/install/zlib/lib" -j8
        INSTALL_COMMAND mkdir -p ${GZSTREAM_INSTALL_DIR}/lib/ && mkdir -p ${GZSTREAM_INSTALL_DIR}/include/
        && cp ${GZSTREAM_SOURCES_DIR}/src/extern_gzstream/libgzstream.a ${GZSTREAM_INSTALL_DIR}/lib
        && cp -r ${GZSTREAM_SOURCES_DIR}/src/extern_gzstream/gzstream.h ${GZSTREAM_INSTALL_DIR}/include

--- a/cmake/external/ngraph.cmake
+++ b/cmake/external/ngraph.cmake
@@ -32,6 +32,8 @@ IF(NOT ${WITH_NGRAPH})
    return()
 ENDIF()

+INCLUDE(GNUInstallDirs)
+
 INCLUDE(ExternalProject)

 SET(NGRAPH_PROJECT         "extern_ngraph")
@@ -40,10 +42,14 @@ SET(NGRAPH_GIT_TAG         "f9fd9d4cc318dc59dd4b68448e7fbb5f67a28bd0")
 SET(NGRAPH_SOURCES_DIR     ${THIRD_PARTY_PATH}/ngraph)
 SET(NGRAPH_INSTALL_DIR     ${THIRD_PARTY_PATH}/install/ngraph)
 SET(NGRAPH_INC_DIR         ${NGRAPH_INSTALL_DIR}/include)
+SET(NGRAPH_LIB_DIR         ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR})
 SET(NGRAPH_SHARED_LIB_NAME libngraph.so.${NGRAPH_VERSION})
 SET(NGRAPH_CPU_LIB_NAME    libcpu_backend.so)
 SET(NGRAPH_TBB_LIB_NAME    libtbb.so.2)
 SET(NGRAPH_GIT_REPO        "https://github.com/NervanaSystems/ngraph.git")
+SET(NGRAPH_SHARED_LIB      ${NGRAPH_LIB_DIR}/${NGRAPH_SHARED_LIB_NAME})
+SET(NGRAPH_CPU_LIB         ${NGRAPH_LIB_DIR}/${NGRAPH_CPU_LIB_NAME})
+SET(NGRAPH_TBB_LIB         ${NGRAPH_LIB_DIR}/${NGRAPH_TBB_LIB_NAME})

 ExternalProject_Add(
    ${NGRAPH_PROJECT}
@@ -63,18 +69,6 @@ ExternalProject_Add(
    CMAKE_ARGS          -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/lib
 )

-if(UNIX AND NOT APPLE)
-    include(GNUInstallDirs)
-    SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR})
-else()
-    SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/lib)
-endif()
-MESSAGE(STATUS "nGraph lib will be installed at: ${NGRAPH_LIB_DIR}")
-
-SET(NGRAPH_SHARED_LIB      ${NGRAPH_LIB_DIR}/${NGRAPH_SHARED_LIB_NAME})
-SET(NGRAPH_CPU_LIB         ${NGRAPH_LIB_DIR}/${NGRAPH_CPU_LIB_NAME})
-SET(NGRAPH_TBB_LIB         ${NGRAPH_LIB_DIR}/${NGRAPH_TBB_LIB_NAME})
-
 # Workaround for nGraph expecting mklml to be in mkldnn install directory.
 ExternalProject_Add_Step(
    ${NGRAPH_PROJECT}

--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -110,6 +110,14 @@ function(find_fluid_modules TARGET_NAME)
  endif()
 endfunction(find_fluid_modules)

+
+# common_link(<target>)
+#
+# Adds the link dependencies shared by every target created through the
+# cc_*/nv_*/hip_*/go_* helpers in this file. Currently that is only the
+# gperftools profiler, and only when WITH_PROFILER is enabled; otherwise the
+# call does nothing.
+function(common_link TARGET_NAME)
+  if (NOT WITH_PROFILER)
+    return()
+  endif()
+  # Plain (keyword-less) signature on purpose: the helpers that call this
+  # function link their DEPS with the plain signature on the same target.
+  target_link_libraries(${TARGET_NAME} gperftools::profiler)
+endfunction()
+
+
 # find all third_party modules is used for paddle static library
 # for reduce the dependency when building the inference libs.
 set_property(GLOBAL PROPERTY FLUID_THIRD_PARTY)
@@ -274,6 +282,7 @@ function(cc_library TARGET_NAME)
      endif()
      target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
      add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
+      common_link(${TARGET_NAME})
    endif()

    # cpplint code style
@@ -340,6 +349,7 @@ function(cc_binary TARGET_NAME)
  if(cc_binary_DEPS)
    target_link_libraries(${TARGET_NAME} ${cc_binary_DEPS})
    add_dependencies(${TARGET_NAME} ${cc_binary_DEPS})
+    common_link(${TARGET_NAME})
  endif()
 endfunction(cc_binary)

@@ -362,6 +372,7 @@ function(cc_test TARGET_NAME)
      target_link_libraries(${TARGET_NAME} ${win32_deps})
    endif(WIN32)
    add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
+    common_link(${TARGET_NAME})
    add_test(NAME ${TARGET_NAME}
             COMMAND ${TARGET_NAME} ${cc_test_ARGS}
             WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
@@ -420,6 +431,7 @@ function(nv_binary TARGET_NAME)
    if(nv_binary_DEPS)
      target_link_libraries(${TARGET_NAME} ${nv_binary_DEPS})
      add_dependencies(${TARGET_NAME} ${nv_binary_DEPS})
+      common_link(${TARGET_NAME})
    endif()
  endif()
 endfunction(nv_binary)
@@ -433,6 +445,7 @@ function(nv_test TARGET_NAME)
    cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
    target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
    add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
+    common_link(${TARGET_NAME})
    add_test(${TARGET_NAME} ${TARGET_NAME})
    if (nv_test_SERIAL)
        set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
@@ -499,6 +512,7 @@ function(hip_binary TARGET_NAME)
    if(hip_binary_DEPS)
      target_link_libraries(${TARGET_NAME} ${hip_binary_DEPS})
      add_dependencies(${TARGET_NAME} ${hip_binary_DEPS})
+      common_link(${TARGET_NAME})
    endif()
  endif()
 endfunction(hip_binary)
@@ -518,6 +532,7 @@ function(hip_test TARGET_NAME)
    set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP)
    target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags)
    add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags)
+    common_link(${TARGET_NAME})
    add_test(${TARGET_NAME} ${TARGET_NAME})
  endif()
 endfunction(hip_test)
@@ -560,6 +575,7 @@ function(go_library TARGET_NAME)
  endif()
  if(go_library_DEPS)
    add_dependencies(${TARGET_NAME} ${go_library_DEPS})
+    common_link(${TARGET_NAME})
  endif(go_library_DEPS)

  # The "source file" of the library is `${dummyfile}` which never

--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -129,6 +129,15 @@ if (WITH_MKLDNN)
            )
 endif ()

+if (WITH_NGRAPH)  # stage nGraph's include and lib directories under FLUID_INSTALL_DIR
+    set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/ngraph")  # dst_dir is reused by the sibling blocks in this file
+    copy(ngraph_lib
+            SRCS ${NGRAPH_INC_DIR} ${NGRAPH_LIB_DIR}
+            DSTS ${dst_dir} ${dst_dir}  # presumably one DSTS entry per SRCS entry; copy() is defined outside this view
+            DEPS ngraph
+            )
+endif ()
+
 if (NOT WIN32)
    if (NOT MOBILE_INFERENCE AND NOT RPI)
        set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy")

--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -166,6 +166,8 @@ function(op_library TARGET)
      # Append first implemented MKLDNN activation operator
      if (${MKLDNN_FILE} STREQUAL "activation_mkldnn_op")
        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, MKLDNN);\n")
+      elseif(${MKLDNN_FILE} STREQUAL "conv_mkldnn_op")
+        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32);\n")
      else()
        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n")
      endif()

--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -32,6 +32,13 @@ paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.c
 paddle.fluid.BuildStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy) -> None
 paddle.fluid.create_lod_tensor ArgSpec(args=['data', 'recursive_seq_lens', 'place'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.create_random_int_lodtensor ArgSpec(args=['recursive_seq_lens', 'base_shape', 'place', 'low', 'high'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.DataFeedDesc.__init__ ArgSpec(args=['self', 'proto_file'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.DataFeedDesc.desc ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.DataFeedDesc.set_batch_size ArgSpec(args=['self', 'batch_size'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.DataFeedDesc.set_dense_slots ArgSpec(args=['self', 'dense_slots_name'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.DataFeedDesc.set_use_slots ArgSpec(args=['self', 'use_slots_name'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.AsyncExecutor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.AsyncExecutor.run ArgSpec(args=['self', 'program', 'data_feed', 'filelist', 'thread_num', 'fetch', 'debug'], varargs=None, keywords=None, defaults=(False,))
 paddle.fluid.io.save_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None))
 paddle.fluid.io.save_params ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.io.save_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
@@ -59,6 +66,7 @@ paddle.fluid.layers.linear_chain_crf ArgSpec(args=['input', 'label', 'param_attr
 paddle.fluid.layers.crf_decoding ArgSpec(args=['input', 'param_attr', 'label'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.cos_sim ArgSpec(args=['X', 'Y'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.cross_entropy ArgSpec(args=['input', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100))
+paddle.fluid.layers.bpr_loss ArgSpec(args=['input', 'label', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.square_error_cost ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None, None))
@@ -69,7 +77,7 @@ paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name']
 paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None))
 paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True))
 paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True))
-paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False))
+paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False))
 paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
 paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
@@ -175,7 +183,7 @@ paddle.fluid.layers.clip ArgSpec(args=['x', 'min', 'max', 'name'], varargs=None,
 paddle.fluid.layers.clip_by_norm ArgSpec(args=['x', 'max_norm', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None))
-paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'ignore_index', 'name'], varargs=None, keywords=None, defaults=(-100, None))
 paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.space_to_depth ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,))
@@ -187,6 +195,10 @@ paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=Non
 paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None))
 paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None))
+paddle.fluid.layers.merge_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1))
+paddle.fluid.layers.psroi_pool ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
 paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
@@ -291,6 +303,7 @@ paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'i
 paddle.fluid.layers.iou_similarity ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.box_coder ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name'], varargs=None, keywords=None, defaults=('encode_center_size', True, None))
 paddle.fluid.layers.polygon_box_transform ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.yolov3_loss ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'class_num', 'ignore_thresh', 'loss_weight_xy', 'loss_weight_wh', 'loss_weight_conf_target', 'loss_weight_conf_notarget', 'loss_weight_class', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None))
 paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None))
 paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1))
 paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
@@ -411,3 +424,17 @@ paddle.fluid.Scope.drop_kids drop_kids(self: paddle.fluid.core.Scope) -> None
 paddle.fluid.Scope.find_var find_var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable
 paddle.fluid.Scope.new_scope new_scope(self: paddle.fluid.core.Scope) -> paddle.fluid.core.Scope
 paddle.fluid.Scope.var var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable
+paddle.reader.map_readers ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None)
+paddle.reader.buffered ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None)
+paddle.reader.compose ArgSpec(args=[], varargs='readers', keywords='kwargs', defaults=None)
+paddle.reader.chain ArgSpec(args=[], varargs='readers', keywords=None, defaults=None)
+paddle.reader.shuffle ArgSpec(args=['reader', 'buf_size'], varargs=None, keywords=None, defaults=None)
+paddle.reader.firstn ArgSpec(args=['reader', 'n'], varargs=None, keywords=None, defaults=None)
+paddle.reader.xmap_readers ArgSpec(args=['mapper', 'reader', 'process_num', 'buffer_size', 'order'], varargs=None, keywords=None, defaults=(False,))
+paddle.reader.PipeReader.__init__ ArgSpec(args=['self', 'command', 'bufsize', 'file_type'], varargs=None, keywords=None, defaults=(8192, 'plain'))
+paddle.reader.PipeReader.get_line ArgSpec(args=['self', 'cut_lines', 'line_break'], varargs=None, keywords=None, defaults=(True, '\n'))
+paddle.reader.multiprocess_reader ArgSpec(args=['readers', 'use_pipe', 'queue_size'], varargs=None, keywords=None, defaults=(True, 1000))
+paddle.reader.Fake.__init__ ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.reader.creator.np_array ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
+paddle.reader.creator.text_file ArgSpec(args=['path'], varargs=None, keywords=None, defaults=None)
+paddle.reader.creator.recordio ArgSpec(args=['paths', 'buf_size'], varargs=None, keywords=None, defaults=(100,))
--- a/paddle/fluid/CMakeLists.txt
+++ b/paddle/fluid/CMakeLists.txt
 add_subdirectory(memory)
 add_subdirectory(platform)
 add_subdirectory(framework)
+add_subdirectory(imperative)
 add_subdirectory(operators)
 add_subdirectory(string)
 add_subdirectory(recordio)

--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -34,6 +34,7 @@ add_subdirectory(ir)
 add_subdirectory(details)
 # ddim lib
 proto_library(framework_proto SRCS framework.proto)
+proto_library(async_executor_proto SRCS data_feed.proto)

 cc_library(ddim SRCS ddim.cc DEPS eigen3 boost)
 cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
@@ -71,6 +72,8 @@ cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto
 cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
 nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)

+cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memory)
+
 cc_library(reader SRCS reader.cc DEPS lod_tensor ddim)
 cc_test(reader_test SRCS reader_test.cc DEPS reader)

@@ -117,8 +120,9 @@ cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
 cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context)

 cc_library(transfer_scope_cache SRCS transfer_scope_cache.cc DEPS scope framework_proto device_context)
+cc_library(op_kernel_type SRCS op_kernel_type.cc DEPS device_context place)
 cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
-    shape_inference data_transform lod_tensor profiler transfer_scope_cache)
+    shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type)

 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context)

@@ -126,16 +130,19 @@ cc_library(version SRCS version.cc)
 cc_test(version_test SRCS version_test.cc DEPS version)

 cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version)
-cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto)
-if(NOT WIN32)
-cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog
-  shape_inference data_transform lod_tensor profiler)
-endif(NOT WIN32)
+
+# nGraph bridge and operator libraries: only built when nGraph is enabled and
+# the platform is not Windows.
+if(WITH_NGRAPH AND NOT WIN32)
+  cc_library(ngraph_bridge
+    SRCS ngraph_bridge.cc
+    DEPS operator framework_proto ngraph)
+  cc_library(ngraph_operator
+    SRCS ngraph_operator.cc
+    DEPS ngraph_bridge operator op_info device_context tensor scope glog
+         shape_inference data_transform lod_tensor profiler ngraph)
+endif()

 cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
 nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)

-py_proto_compile(framework_py_proto SRCS framework.proto)
+py_proto_compile(framework_py_proto SRCS framework.proto data_feed.proto)
 # Generate an empty __init__.py to make framework_py_proto as a valid python module.
 add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
 add_dependencies(framework_py_proto framework_py_proto_init)
@@ -157,27 +164,37 @@ endif(NOT WIN32)
 cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)

 cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog)
+cc_library(variable_helper SRCS variable_helper.cc DEPS lod_tensor)

-cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass)
+cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)

 if(WITH_DISTRIBUTE)
-  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass)
+  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass variable_helper)
  set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
  set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 else()
-  if(NOT WIN32)
-    cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator)
-  else(NOT WIN32)
-    cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass)
-  endif(NOT WIN32)
+  # Non-distributed executor. Its dependency list is the same in every
+  # configuration except that nGraph-enabled, non-Windows builds additionally
+  # depend on ngraph and ngraph_operator (inserted before variable_helper to
+  # keep the dependency order unchanged).
+  set(executor_ngraph_deps "")
+  if(WITH_NGRAPH AND NOT WIN32)
+    set(executor_ngraph_deps ngraph ngraph_operator)
+  endif()
+  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ${executor_ngraph_deps} variable_helper)
  cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
 endif()

+target_link_libraries(executor garbage_collector)
+
 cc_library(parallel_executor SRCS parallel_executor.cc DEPS
        threaded_ssa_graph_executor scope_buffered_ssa_graph_executor
        graph build_strategy
-        fast_threaded_ssa_graph_executor)
+        fast_threaded_ssa_graph_executor variable_helper)
+
+cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper)

+cc_test(data_feed_test SRCS data_feed_test.cc DEPS async_executor)
 cc_library(prune SRCS prune.cc DEPS framework_proto)
 cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
 cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
@@ -185,7 +202,7 @@ cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
 cc_library(selected_rows SRCS selected_rows.cc DEPS tensor)
 cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)

-cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto)
+cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto op_kernel_type)
 cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)

 cc_test(tuple_test SRCS tuple_test.cc )

--- a/paddle/fluid/framework/async_executor.cc
+++ b/paddle/fluid/framework/async_executor.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/async_executor.h"
+#include "google/protobuf/io/zero_copy_stream_impl.h"
+#include "google/protobuf/message.h"
+#include "google/protobuf/text_format.h"
+
+#include "gflags/gflags.h"
+#include "paddle/fluid/framework/data_feed_factory.h"
+#include "paddle/fluid/framework/executor_thread_worker.h"
+#include "paddle/fluid/framework/feed_fetch_method.h"
+#include "paddle/fluid/framework/feed_fetch_type.h"
+#include "paddle/fluid/framework/lod_rank_table.h"
+#include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/reader.h"
+#include "paddle/fluid/inference/io.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/pybind/pybind.h"
+
+namespace paddle {
+namespace framework {
+// Records the root scope pointer and the place; performs no other work.
+AsyncExecutor::AsyncExecutor(Scope* scope, const platform::Place& place)
+    : root_scope_(scope), place_(place) {}
+
+// Configures one worker before its thread is launched: assigns its thread id,
+// debug flag and root scope, lets it build its resources from main_program on
+// place_, then attaches the reader and the fetch variable names.
+// NOTE(review): the call order presumably matters (BindingDataFeedMemory()
+// runs after SetDataFeed()) -- confirm against ExecutorThreadWorker.
+void AsyncExecutor::CreateThreads(
+    ExecutorThreadWorker* worker, const ProgramDesc& main_program,
+    const std::shared_ptr<DataFeed>& reader,
+    const std::vector<std::string>& fetch_var_names, Scope* root_scope,
+    const int thread_index, const bool debug) {
+  worker->SetThreadId(thread_index);
+  worker->SetDebug(debug);
+  worker->SetRootScope(root_scope);
+  worker->CreateThreadResource(main_program, place_);
+  worker->SetDataFeed(reader);
+  worker->SetFetchVarNames(fetch_var_names);
+  worker->BindingDataFeedMemory();
+}
+
+// Fills `readers` with `thread_num` freshly created and initialized DataFeeds
+// of the class named in data_feed_desc, then registers `filelist`.
+//
+// The file list is a static member of DataFeed, so it is set through a single
+// reader. It must be set after every reader has been initialized, because
+// MultiSlotDataFeed::Init() resets the shared "file list set" flag.
+//
+// Throws (PADDLE_ENFORCE) if thread_num is not positive; without the check
+// `readers` would be empty and accessing its first element is undefined.
+void PrepareReaders(std::vector<std::shared_ptr<DataFeed>>& readers,  // NOLINT
+                    const int thread_num, const DataFeedDesc& data_feed_desc,
+                    const std::vector<std::string>& filelist) {
+  PADDLE_ENFORCE(thread_num > 0, "Thread num must be positive, but got %d.",
+                 thread_num);
+  readers.resize(thread_num);
+  for (auto& reader : readers) {
+    reader = DataFeedFactory::CreateDataFeed(data_feed_desc.name());
+    reader->Init(data_feed_desc);  // set batch_size and queue_size here
+  }
+  readers.front()->SetFileList(filelist);
+}
+
+// Runs main_program over the files in `filelist` with up to `thread_num`
+// worker threads and blocks until all of them have finished.
+//
+//   main_program        program whose block 0 holds the fetch variables
+//   data_feed_desc_str  DataFeedDesc in protobuf text format
+//   filelist            input files; must not be empty
+//   thread_num          requested thread count; must be positive and is
+//                       capped at the number of files
+//   fetch_var_names     variables to fetch; the last dimension of each must
+//                       be 1
+//   debug               forwarded to every worker
+//
+// Throws (PADDLE_ENFORCE) on an unknown fetch variable, a bad fetch shape, an
+// unparsable data feed description, an empty file list or a non-positive
+// thread count.
+void AsyncExecutor::RunFromFile(const ProgramDesc& main_program,
+                                const std::string& data_feed_desc_str,
+                                const std::vector<std::string>& filelist,
+                                const int thread_num,
+                                const std::vector<std::string>& fetch_var_names,
+                                const bool debug) {
+  std::vector<std::thread> threads;
+
+  auto& block = main_program.Block(0);
+  for (const auto& var_name : fetch_var_names) {
+    auto var_desc = block.FindVar(var_name);
+    // A misspelled fetch name must fail with a message, not a null deref.
+    PADDLE_ENFORCE(var_desc != nullptr,
+                   "var %s: Fetched var is not found in the main program",
+                   var_name);
+    auto shapes = var_desc->GetShape();
+    PADDLE_ENFORCE(!shapes.empty() && shapes.back() == 1,
+                   "var %s: Fetched var has wrong shape, "
+                   "only variables with the last dimension size 1 supported",
+                   var_name);
+  }
+
+  DataFeedDesc data_feed_desc;
+  // Do not continue with a partially filled description.
+  PADDLE_ENFORCE(google::protobuf::TextFormat::ParseFromString(
+                     data_feed_desc_str, &data_feed_desc),
+                 "Failed to parse the data feed description.");
+
+  PADDLE_ENFORCE(thread_num > 0, "Thread num must be positive, but got %d.",
+                 thread_num);
+  int actual_thread_num = thread_num;
+  int file_cnt = filelist.size();
+  PADDLE_ENFORCE(file_cnt > 0, "File list cannot be empty");
+
+  if (actual_thread_num > file_cnt) {
+    VLOG(1) << "Thread num = " << thread_num << ", file num = " << file_cnt
+            << ". Changing thread_num = " << file_cnt;
+    actual_thread_num = file_cnt;
+  }
+
+  /*
+    readerDesc: protobuf description for reader initialization
+    argument: class_name, batch_size, use_slot, queue_size, buffer_size,
+    padding_index
+
+    reader:
+    1) each thread has a reader, reader will read input data and
+    put it into input queue
+    2) each reader has a Next() interface, that can fetch an instance
+    from the input queue
+   */
+  // todo: should be factory method for creating datafeed
+  std::vector<std::shared_ptr<DataFeed>> readers;
+  PrepareReaders(readers, actual_thread_num, data_feed_desc, filelist);
+
+  std::vector<std::shared_ptr<ExecutorThreadWorker>> workers;
+  workers.resize(actual_thread_num);
+  for (auto& worker : workers) {
+    worker.reset(new ExecutorThreadWorker);
+  }
+
+  // prepare thread resource here
+  for (int thidx = 0; thidx < actual_thread_num; ++thidx) {
+    CreateThreads(workers[thidx].get(), main_program, readers[thidx],
+                  fetch_var_names, root_scope_, thidx, debug);
+  }
+
+  // start executing ops in multiple threads
+  for (int thidx = 0; thidx < actual_thread_num; ++thidx) {
+    threads.push_back(
+        std::thread(&ExecutorThreadWorker::TrainFiles, workers[thidx].get()));
+  }
+
+  for (auto& th : threads) {
+    th.join();
+  }
+
+  // Drop the child scopes of the root scope once every worker has joined.
+  root_scope_->DropKids();
+}
+
+}  // end namespace framework
+}  // end namespace paddle
--- a/paddle/fluid/framework/async_executor.h
+++ b/paddle/fluid/framework/async_executor.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <mutex>  // NOLINT
+#include <set>
+#include <string>
+#include <thread>  // NOLINT
+#include <typeinfo>
+#include <vector>
+#include "paddle/fluid/framework/data_feed.pb.h"
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/executor_thread_worker.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/scope.h"
+
+namespace paddle {
+namespace framework {
+// Executes a program over a list of input files with several worker threads.
+class AsyncExecutor {
+ public:
+  // Stores `scope` as the root scope and `place` as the target place.
+  AsyncExecutor(Scope* scope, const platform::Place& place);
+  virtual ~AsyncExecutor() {}
+  // Runs main_program over `filelist`; data_feed_desc_str is a DataFeedDesc in
+  // protobuf text format, thread_num the requested number of worker threads
+  // and fetch_names the variables to fetch.
+  void RunFromFile(const ProgramDesc& main_program,
+                   const std::string& data_feed_desc_str,
+                   const std::vector<std::string>& filelist,
+                   const int thread_num,
+                   const std::vector<std::string>& fetch_names,
+                   const bool debug = false);
+
+ private:
+  // Configures a single worker (id, debug flag, scope, reader, fetch names)
+  // before its thread is started.
+  void CreateThreads(ExecutorThreadWorker* worker,
+                     const ProgramDesc& main_program,
+                     const std::shared_ptr<DataFeed>& reader,
+                     const std::vector<std::string>& fetch_var_names,
+                     Scope* root_scope, const int thread_index,
+                     const bool debug);
+
+ public:
+  // NOTE(review): these data members are public; presumably accessed from
+  // outside the class -- confirm before restricting access.
+  Scope* root_scope_;
+  platform::Place place_;
+};
+
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/data_feed.cc
+++ b/paddle/fluid/framework/data_feed.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "google/protobuf/io/zero_copy_stream_impl.h"
+#include "google/protobuf/message.h"
+#include "google/protobuf/text_format.h"
+
+#include "gflags/gflags.h"
+#include "paddle/fluid/framework/data_feed.h"
+#include "paddle/fluid/framework/feed_fetch_method.h"
+#include "paddle/fluid/framework/feed_fetch_type.h"
+
+namespace paddle {
+namespace framework {
+
+// Definitions of the static members shared by every DataFeed instance: the
+// file list, the index of the next file to hand out, the mutex that guards
+// both, and the flag recording that the file list has been set.
+std::vector<std::string> DataFeed::filelist_;
+size_t DataFeed::file_idx_;
+std::mutex DataFeed::mutex_for_pick_file_;
+bool DataFeed::finish_set_filelist_;
+
+// Binds `var` as the feed target of every used slot whose alias equals `name`.
+// Requires Init() to have completed.
+void DataFeed::AddFeedVar(Variable* var, const std::string& name) {
+  CheckInit();
+  const size_t slot_count = use_slots_.size();
+  for (size_t slot = 0; slot != slot_count; ++slot) {
+    if (use_slots_[slot] != name) {
+      continue;
+    }
+    feed_vec_[slot] = var->GetMutable<LoDTensor>();
+  }
+}
+
+// Installs the shared file list and rewinds the shared file index.
+// Returns false, leaving the current list untouched, when a list has already
+// been set; returns true otherwise. An empty `files` is rejected.
+bool DataFeed::SetFileList(const std::vector<std::string>& files) {
+  std::unique_lock<std::mutex> guard(mutex_for_pick_file_);
+  CheckInit();
+  if (!finish_set_filelist_) {
+    PADDLE_ENFORCE(files.size(), "You have set an empty filelist.");
+    filelist_ = files;
+    file_idx_ = 0;
+
+    finish_set_filelist_ = true;
+    return true;
+  }
+  VLOG(3) << "info: you have set the filelist.";
+  return false;
+}
+
+// Sets the user-requested batch size; rejects values that are not positive.
+void DataFeed::SetBatchSize(int batch_size) {
+  PADDLE_ENFORCE(batch_size > 0, "Illegal batch size: %d.", batch_size);
+  default_batch_size_ = batch_size;
+}
+
+// Hands out the next unread file name from the shared list under the mutex.
+// Writes it to *filename and returns true, or returns false (leaving
+// *filename untouched) once the list is exhausted.
+bool DataFeed::PickOneFile(std::string* filename) {
+  std::lock_guard<std::mutex> guard(mutex_for_pick_file_);
+  const bool has_file = file_idx_ != filelist_.size();
+  if (has_file) {
+    *filename = filelist_[file_idx_];
+    ++file_idx_;
+  }
+  return has_file;
+}
+
+// Fails unless Init() has completed (finish_init_ is set).
+void DataFeed::CheckInit() {
+  PADDLE_ENFORCE(finish_init_, "Initialization did not succeed.");
+}
+
+// Fails unless the shared file list has been set.
+void DataFeed::CheckSetFileList() {
+  PADDLE_ENFORCE(finish_set_filelist_, "Set filelist did not succeed.");
+}
+
+// Fails unless Start() has been called on this feed.
+void DataFeed::CheckStart() {
+  PADDLE_ENFORCE(finish_start_, "Datafeed has not started running yet.");
+}
+
+// Records the queue capacity and replaces the private queue with a new
+// BlockingQueue of that capacity. The capacity must be positive.
+template <typename T>
+void PrivateQueueDataFeed<T>::SetQueueSize(int queue_size) {
+  PADDLE_ENFORCE(queue_size > 0, "Illegal queue size: %d.", queue_size);
+  queue_size_ = queue_size;
+  queue_.reset(new paddle::operators::reader::BlockingQueue<T>(queue_size_));
+}
+
+// Launches the reading thread and marks the feed as started. Requires the
+// file list to have been set. Always returns true.
+template <typename T>
+bool PrivateQueueDataFeed<T>::Start() {
+  CheckSetFileList();
+  read_thread_ = std::thread(&PrivateQueueDataFeed::ReadThread, this);
+  // The thread is detached and keeps using `this`.
+  // NOTE(review): this assumes the feed outlives the reading thread -- confirm.
+  read_thread_.detach();
+
+  finish_start_ = true;
+  return true;
+}
+
+// Body of the reading thread: repeatedly picks a file from the shared list,
+// parses it instance by instance and sends every instance to the private
+// queue. The queue is closed once no file is left.
+template <typename T>
+void PrivateQueueDataFeed<T>::ReadThread() {
+  std::string filename;
+  while (PickOneFile(&filename)) {
+    file_.open(filename.c_str());  // is_text_feed
+    PADDLE_ENFORCE(file_.good(), "Open file<%s> fail.", filename.c_str());
+    // The same instance object is reused for every record of the file.
+    T instance;
+    while (ParseOneInstance(&instance)) {
+      queue_->Send(instance);
+    }
+    file_.close();
+  }
+  queue_->Close();
+}
+
+// Assembles the next batch from the private queue and publishes it to the
+// feed variables. Up to default_batch_size_ instances are taken; fewer when
+// Receive() reports failure. Returns the number of instances in the batch,
+// which is 0 when nothing could be received.
+template <typename T>
+int PrivateQueueDataFeed<T>::Next() {
+  CheckStart();
+  T instance;
+  T ins_vec;
+  int received = 0;
+  for (; received < default_batch_size_; ++received) {
+    if (!queue_->Receive(&instance)) {
+      break;
+    }
+    AddInstanceToInsVec(&ins_vec, instance, received);
+  }
+  batch_size_ = received;
+  if (batch_size_ == 0) {
+    return batch_size_;
+  }
+  PutToFeedVec(ins_vec);
+  return batch_size_;
+}
+
+#ifdef _WIN32
+template class PrivateQueueDataFeed<std::vector<MultiSlotType>>;
+#endif
+
+// Configures the feed from data_feed_desc: resets the progress flags, sets
+// the batch size and the queue size (both from batch_size), and records the
+// name and type of every slot. Used slots are additionally collected, in
+// description order, into use_slots_ / use_slots_is_dense_;
+// use_slots_index_[i] holds the position of slot i among the used slots, or
+// -1 when slot i is unused.
+void MultiSlotDataFeed::Init(
+    const paddle::framework::DataFeedDesc& data_feed_desc) {
+  finish_init_ = false;
+  finish_set_filelist_ = false;
+  finish_start_ = false;
+
+  PADDLE_ENFORCE(data_feed_desc.has_multi_slot_desc(),
+                 "Multi_slot_desc has not been set.");
+  const paddle::framework::MultiSlotDesc& multi_slot_desc =
+      data_feed_desc.multi_slot_desc();
+  SetBatchSize(data_feed_desc.batch_size());
+  SetQueueSize(data_feed_desc.batch_size());
+
+  const size_t all_slot_num = multi_slot_desc.slots_size();
+  all_slots_.resize(all_slot_num);
+  all_slots_type_.resize(all_slot_num);
+  use_slots_index_.resize(all_slot_num);
+  use_slots_.clear();
+  use_slots_is_dense_.clear();
+  for (size_t i = 0; i < all_slot_num; ++i) {
+    const auto& slot = multi_slot_desc.slots(i);
+    all_slots_[i] = slot.name();
+    all_slots_type_[i] = slot.type();
+    if (!slot.is_used()) {
+      use_slots_index_[i] = -1;
+      continue;
+    }
+    use_slots_index_[i] = use_slots_.size();
+    use_slots_.push_back(all_slots_[i]);
+    use_slots_is_dense_.push_back(slot.is_dense());
+  }
+  feed_vec_.resize(use_slots_.size());
+  finish_init_ = true;
+}
+
+// Validates that every line of `filename` follows the multi-slot text format:
+// for each slot, in description order, a positive count followed by that many
+// values of the slot's type ("float" or "uint64"), with nothing but
+// whitespace after the last slot.
+//
+// Returns true when the whole file is well-formed. Returns false, after
+// logging the reason, when the file cannot be opened or a line is malformed.
+// Requires Init() to have completed.
+bool MultiSlotDataFeed::CheckFile(const char* filename) {
+  CheckInit();  // get info of slots
+  std::ifstream fin(filename);
+  if (!fin.good()) {
+    VLOG(1) << "error: open file<" << filename << "> fail";
+    return false;
+  }
+  std::string line;
+  int instance_cout = 0;
+  std::string all_slots_alias = "";
+  for (const auto& alias : all_slots_) {
+    all_slots_alias += alias + " ";
+  }
+  std::string use_slots_alias = "";
+  for (const auto& alias : use_slots_) {
+    use_slots_alias += alias + " ";
+  }
+  VLOG(3) << "total slots num: " << all_slots_.size();
+  VLOG(3) << "total slots alias: " << all_slots_alias;
+  VLOG(3) << "used slots num: " << use_slots_.size();
+  VLOG(3) << "used slots alias: " << use_slots_alias;
+  // Logs which line of which file failed validation.
+  auto report_line = [&]() {
+    VLOG(0) << "please check line<" << instance_cout << "> in file<"
+            << filename << ">";
+  };
+  while (getline(fin, line)) {
+    ++instance_cout;
+    const char* str = line.c_str();
+    char* endptr = const_cast<char*>(str);
+    int len = line.length();
+    for (size_t i = 0; i < all_slots_.size(); ++i) {
+      // The strto*() functions only set errno on failure, so a stale ERANGE
+      // must be cleared before each conversion.
+      errno = 0;
+      // Keep the full-width result: narrowing to int would hide overflow.
+      const auto num = strtol(endptr, &endptr, 10);
+      if (num < 0) {
+        VLOG(0) << "error: the number of ids is a negative number: " << num;
+        report_line();
+        return false;
+      } else if (num == 0) {
+        VLOG(0)
+            << "error: the number of ids can not be zero, you need "
+               "padding it in data generator; or if there is something wrong"
+               " with the data, please check if the data contains unresolvable "
+               "characters.";
+        report_line();
+        return false;
+      } else if (errno == ERANGE || num > INT_MAX) {
+        VLOG(0) << "error: the number of ids greater than INT_MAX";
+        report_line();
+        return false;
+      }
+      if (all_slots_type_[i] == "float") {
+        for (int j = 0; j < num; ++j) {
+          errno = 0;
+          strtof(endptr, &endptr);
+          if (errno == ERANGE) {
+            VLOG(0) << "error: the value is out of the range of "
+                       "representable values for float";
+            report_line();
+            return false;
+          }
+          // The line ended before all announced values were read.
+          if (j + 1 != num && endptr - str == len) {
+            VLOG(0) << "error: there is a wrong with the number of ids.";
+            report_line();
+            return false;
+          }
+        }
+      } else if (all_slots_type_[i] == "uint64") {
+        for (int j = 0; j < num; ++j) {
+          errno = 0;
+          strtoull(endptr, &endptr, 10);
+          if (errno == ERANGE) {
+            VLOG(0) << "error: the value is out of the range of "
+                       "representable values for uint64_t";
+            report_line();
+            return false;
+          }
+          // The line ended before all announced values were read.
+          if (j + 1 != num && endptr - str == len) {
+            VLOG(0) << "error: there is a wrong with the number of ids.";
+            report_line();
+            return false;
+          }
+        }
+      } else {
+        VLOG(0) << "error: this type<" << all_slots_type_[i]
+                << "> is not supported";
+        return false;
+      }
+    }
+    // It may be added '\t' character to the end of the output of reduce
+    // task when processes data by Hadoop(when the output of the reduce
+    // task of Hadoop has only one field, it will add a '\t' at the end
+    // of the line by default, and you can use this option to avoid it:
+    // `-D mapred.textoutputformat.ignoreseparator=true`), which does
+    // not affect the correctness of the data. Therefore, it should be
+    // judged that the data is not normal when the end of each line of
+    // data contains characters which are not spaces.
+    while (endptr - str != len) {
+      // isspace() requires a value representable as unsigned char.
+      if (!isspace(static_cast<unsigned char>(*(endptr++)))) {
+        VLOG(0)
+            << "error: there is some extra characters at the end of the line.";
+        report_line();
+        return false;
+      }
+    }
+  }
+  VLOG(3) << "instances cout: " << instance_cout;
+  VLOG(3) << "The file format is correct";
+  return true;
+}
+
+// Reads one line from file_ and parses it into *instance, which is resized to
+// hold one entry per used slot. Returns false when no further line can be
+// read, true otherwise. A zero id count for any slot is rejected.
+bool MultiSlotDataFeed::ParseOneInstance(std::vector<MultiSlotType>* instance) {
+  std::string line;
+  if (getline(file_, line)) {
+    int use_slots_num = use_slots_.size();
+    instance->resize(use_slots_num);
+    // parse line
+    const char* str = line.c_str();
+    char* endptr = const_cast<char*>(str);
+    // Offset in the line at which the id count of the current slot starts.
+    int pos = 0;
+    for (size_t i = 0; i < use_slots_index_.size(); ++i) {
+      // Position of slot i among the used slots, or -1 when it is unused.
+      int idx = use_slots_index_[i];
+      int num = strtol(&str[pos], &endptr, 10);
+      PADDLE_ENFORCE(
+          num,
+          "The number of ids can not be zero, you need padding "
+          "it in data generator; or if there is something wrong with "
+          "the data, please check if the data contains unresolvable "
+          "characters.\nplease check this error line: %s",
+          str);
+
+      if (idx != -1) {
+        (*instance)[idx].Init(all_slots_type_[i]);
+        if ((*instance)[idx].GetType()[0] == 'f') {  // float
+          for (int j = 0; j < num; ++j) {
+            float feasign = strtof(endptr, &endptr);
+            (*instance)[idx].AddValue(feasign);
+          }
+        } else if ((*instance)[idx].GetType()[0] == 'u') {  // uint64
+          for (int j = 0; j < num; ++j) {
+            uint64_t feasign = (uint64_t)strtoull(endptr, &endptr, 10);
+            (*instance)[idx].AddValue(feasign);
+          }
+        }
+        // Continue right after the last value that was parsed.
+        pos = endptr - str;
+      } else {
+        // Unused slot: skip its count and its `num` values by advancing over
+        // num + 1 space characters.
+        // NOTE(review): assumes fields are separated by single spaces; if no
+        // further space exists find_first_of() returns npos -- confirm the
+        // data always satisfies this.
+        for (int j = 0; j <= num; ++j) {
+          pos = line.find_first_of(' ', pos + 1);
+        }
+      }
+    }
+  } else {
+    return false;
+  }
+  return true;
+}
+
+// Appends one parsed instance to the batch under construction. For the first
+// instance of a batch (index == 0) the batch is first sized to the number of
+// slots and each slot gets its type and a fresh offset table.
+void MultiSlotDataFeed::AddInstanceToInsVec(
+    std::vector<MultiSlotType>* ins_vec,
+    const std::vector<MultiSlotType>& instance, int index) {
+  const size_t slot_count = instance.size();
+  std::vector<MultiSlotType>& batch = *ins_vec;
+
+  if (index == 0) {
+    batch.resize(slot_count);
+    for (size_t slot = 0; slot != slot_count; ++slot) {
+      MultiSlotType& target = batch[slot];
+      target.Init(instance[slot].GetType());
+      target.InitOffset();
+    }
+  }
+
+  for (size_t slot = 0; slot != slot_count; ++slot) {
+    batch[slot].AddIns(instance[slot]);
+  }
+}
+
+// Copies the assembled batch into the bound feed tensors, one per used slot.
+// Each tensor is allocated on the CPU as {total, 1}, filled with the slot's
+// values (uint64 values are stored as int64), and given the slot's offsets as
+// its LoD. Dense slots are then resized to {batch_size_, total / batch_size_}.
+void MultiSlotDataFeed::PutToFeedVec(
+    const std::vector<MultiSlotType>& ins_vec) {
+  const size_t used_slot_count = use_slots_.size();
+  for (size_t i = 0; i < used_slot_count; ++i) {
+    const MultiSlotType& slot = ins_vec[i];
+    const auto& type = slot.GetType();
+    const auto& offset = slot.GetOffset();
+    // The last offset is the total number of values in the batch.
+    const int total_instance = static_cast<int>(offset.back());
+    const char kind = type[0];
+
+    if (kind == 'f') {  // float
+      const auto& feasign = slot.GetFloatData();
+      float* tensor_ptr = feed_vec_[i]->mutable_data<float>(
+          {total_instance, 1}, platform::CPUPlace());
+      memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(float));
+    } else if (kind == 'u') {  // uint64
+      // no uint64_t type in paddlepaddle
+      const auto& feasign = slot.GetUint64Data();
+      int64_t* tensor_ptr = feed_vec_[i]->mutable_data<int64_t>(
+          {total_instance, 1}, platform::CPUPlace());
+      memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(int64_t));
+    }
+
+    LoD data_lod{offset};
+    feed_vec_[i]->set_lod(data_lod);
+    if (!use_slots_is_dense_[i]) {
+      continue;
+    }
+    const int dim = total_instance / batch_size_;
+    feed_vec_[i]->Resize({batch_size_, dim});
+  }
+}
+
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/data_feed.h
+++ b/paddle/fluid/framework/data_feed.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <fstream>
+#include <memory>
+#include <mutex>  // NOLINT
+#include <string>
+#include <thread>  // NOLINT
+#include <vector>
+
+#include "paddle/fluid/framework/data_feed.pb.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/reader.h"
+#include "paddle/fluid/framework/variable.h"
+#include "paddle/fluid/operators/reader/blocking_queue.h"
+
+namespace paddle {
+namespace framework {
+
+// DataFeed is the abstract base class of all other DataFeeds.
+// It is used to read files and parse the data for the subsequent trainer.
+// Example:
+//   std::shared_ptr<DataFeed> reader =
+//   paddle::framework::DataFeedFactory::CreateDataFeed(data_feed_name);
+//   reader->Init(data_feed_desc); // data_feed_desc is a protobuf object
+//   reader->SetFileList(filelist);
+//   const std::vector<std::string> & use_slot_alias =
+//   reader->GetUseSlotAlias();
+//   for (auto name: use_slot_alias){ // for binding memory
+//     reader->AddFeedVar(scope->Var(name), name);
+//   }
+//   reader->Start();
+//   while (reader->Next()) {
+//      // trainer do something
+//   }
+class DataFeed {
+ public:
+  // The progress flags and batch sizes start in a defined state so that the
+  // Check*() guards reliably reject calls made in the wrong order.
+  DataFeed()
+      : default_batch_size_(0),
+        batch_size_(0),
+        finish_init_(false),
+        finish_start_(false) {}
+  virtual ~DataFeed() {}
+  virtual void Init(const paddle::framework::DataFeedDesc& data_feed_desc) = 0;
+  // Validates the format of a data file. The base implementation always
+  // throws; subclasses that support validation override it.
+  virtual bool CheckFile(const char* filename) {
+    PADDLE_THROW("This function(CheckFile) is not implemented.");
+  }
+  // Set the file list shared by all DataFeeds.
+  // Every reader must be initialized before this is called, because Init()
+  // may reset the shared finish_set_filelist_ flag.
+  virtual bool SetFileList(const std::vector<std::string>& files);
+  virtual bool Start() = 0;
+  // The trainer calls the Next() function, and the DataFeed will load a new
+  // batch to the feed_vec. The return value of this function is the batch
+  // size of the current batch.
+  virtual int Next() = 0;
+  // Get the aliases of all slots defined in the proto description
+  virtual const std::vector<std::string>& GetAllSlotAlias() {
+    return all_slots_;
+  }
+  // Get the aliases of the used slots defined in the proto description
+  virtual const std::vector<std::string>& GetUseSlotAlias() {
+    return use_slots_;
+  }
+  // This function is used for binding feed_vec memory
+  virtual void AddFeedVar(Variable* var, const std::string& name);
+
+ protected:
+  // The following three functions are used to check if it is executed in this
+  // order:
+  //   Init() -> SetFileList() -> Start() -> Next()
+  virtual void CheckInit();
+  virtual void CheckSetFileList();
+  virtual void CheckStart();
+  virtual void SetBatchSize(
+      int batch);  // batch size will be set in Init() function
+  // This function is used to pick one file from the global filelist(thread
+  // safe).
+  virtual bool PickOneFile(std::string* filename);
+
+  // Shared by every DataFeed instance and guarded by mutex_for_pick_file_.
+  static std::vector<std::string> filelist_;
+  static size_t file_idx_;
+  static std::mutex mutex_for_pick_file_;
+
+  // the alias of used slots, and its order is determined by
+  // data_feed_desc(proto object)
+  std::vector<std::string> use_slots_;
+  std::vector<bool> use_slots_is_dense_;
+
+  // the alias of all slots, and its order is determined by data_feed_desc(proto
+  // object)
+  std::vector<std::string> all_slots_;
+  std::vector<std::string> all_slots_type_;
+  std::vector<int>
+      use_slots_index_;  // -1: not used; >=0: the index of use_slots_
+
+  // The data read by DataFeed will be stored here
+  std::vector<LoDTensor*> feed_vec_;
+
+  // the batch size defined by user
+  int default_batch_size_;
+  // current batch size
+  int batch_size_;
+
+  bool finish_init_;
+  static bool finish_set_filelist_;
+  bool finish_start_;
+};
+
+// PrivateQueueDataFeed is the abstract base class for other DataFeeds.
+// It uses a read thread to read files and parse data into a private queue
+// (thread level), and gets data from this queue when the trainer calls Next().
+// T is the type of one parsed instance and also of an assembled batch.
+template <typename T>
+class PrivateQueueDataFeed : public DataFeed {
+ public:
+  PrivateQueueDataFeed() {}
+  virtual ~PrivateQueueDataFeed() {}
+  virtual void Init(const paddle::framework::DataFeedDesc& data_feed_desc) = 0;
+  // Starts the read thread.
+  virtual bool Start();
+  // Assembles the next batch from the private queue and returns its size.
+  virtual int Next();
+
+ protected:
+  // The thread implementation function for reading file and parse.
+  virtual void ReadThread();
+  // This function is used to set private-queue size, and the most
+  // efficient when the queue size is close to the batch size.
+  virtual void SetQueueSize(int queue_size);
+  // The reading and parsing method called in the ReadThread.
+  virtual bool ParseOneInstance(T* instance) = 0;
+  // This function is used to put instance to vec_ins
+  virtual void AddInstanceToInsVec(T* vec_ins, const T& instance,
+                                   int index) = 0;
+  // This function is used to put ins_vec to feed_vec
+  virtual void PutToFeedVec(const T& ins_vec) = 0;
+
+  // The thread for read files
+  std::thread read_thread_;
+  // using ifstream one line and one line parse is faster
+  // than using fread one buffer and one buffer parse.
+  //   for a 601M real data:
+  //     ifstream one line and one line parse: 6034 ms
+  //     fread one buffer and one buffer parse: 7097 ms
+  std::ifstream file_;
+  // Capacity of queue_; assigned in SetQueueSize().
+  size_t queue_size_;
+  // The queue for store parsed data
+  std::unique_ptr<paddle::operators::reader::BlockingQueue<T>> queue_;
+};
+
+// This class defines the data type of an instance (ins_vec) in
+// MultiSlotDataFeed. It holds the values of one slot, either "float" or
+// "uint64", plus the offsets that delimit the instances merged into it.
+class MultiSlotType {
+ public:
+  MultiSlotType() {}
+  ~MultiSlotType() {}
+  // Sets the slot type ("float" or "uint64") and discards any stored values.
+  // Both buffers are cleared so that no data survives a re-initialization,
+  // whatever the previous type was.
+  void Init(const std::string& type) {
+    CheckType(type);
+    float_feasign_.clear();
+    uint64_feasign_.clear();
+    type_ = type;
+  }
+  void InitOffset() {
+    offset_.resize(1);
+    // LoDTensor' lod is counted from 0, the size of lod
+    // is one size larger than the size of data.
+    offset_[0] = 0;
+  }
+  const std::vector<size_t>& GetOffset() const { return offset_; }
+  // Appends one float value; the slot must be of float type.
+  void AddValue(const float v) {
+    CheckFloat();
+    float_feasign_.push_back(v);
+  }
+  // Appends one uint64 value; the slot must be of uint64 type.
+  void AddValue(const uint64_t v) {
+    CheckUint64();
+    uint64_feasign_.push_back(v);
+  }
+  // Appends all values of `ins` and records the new end offset. This slot
+  // must have the same type as `ins`.
+  void AddIns(const MultiSlotType& ins) {
+    if (ins.GetType()[0] == 'f') {  // float
+      CheckFloat();
+      auto& vec = ins.GetFloatData();
+      offset_.push_back(offset_.back() + vec.size());
+      float_feasign_.insert(float_feasign_.end(), vec.begin(), vec.end());
+    } else if (ins.GetType()[0] == 'u') {  // uint64
+      CheckUint64();
+      auto& vec = ins.GetUint64Data();
+      offset_.push_back(offset_.back() + vec.size());
+      uint64_feasign_.insert(uint64_feasign_.end(), vec.begin(), vec.end());
+    }
+  }
+  const std::vector<float>& GetFloatData() const { return float_feasign_; }
+  const std::vector<uint64_t>& GetUint64Data() const { return uint64_feasign_; }
+  const std::string& GetType() const { return type_; }
+
+ private:
+  void CheckType(const std::string& type) const {
+    PADDLE_ENFORCE((type == "uint64") || (type == "float"),
+                   "There is no this type<%s>.", type);
+  }
+  // Fails when a float value is added to a slot that is not of float type.
+  void CheckFloat() const {
+    PADDLE_ENFORCE(type_[0] == 'f', "Add float value to %s slot.", type_);
+  }
+  // Fails when a uint64 value is added to a slot that is not of uint64 type.
+  void CheckUint64() const {
+    PADDLE_ENFORCE(type_[0] == 'u', "Add uint64 value to %s slot.", type_);
+  }
+  std::vector<float> float_feasign_;
+  std::vector<uint64_t> uint64_feasign_;
+  std::string type_;
+  std::vector<size_t> offset_;
+};
+
+// This DataFeed is used to feed multi-slot type data.
+// The format of multi-slot type data, repeated once per slot on each line:
+//   [n feasign_0 feasign_1 ... feasign_(n-1)]*
+// i.e. a count n followed by n values of the slot's type.
+class MultiSlotDataFeed
+    : public PrivateQueueDataFeed<std::vector<MultiSlotType>> {
+ public:
+  MultiSlotDataFeed() {}
+  virtual ~MultiSlotDataFeed() {}
+  // Reads batch size and slot definitions from data_feed_desc.
+  virtual void Init(const paddle::framework::DataFeedDesc& data_feed_desc);
+  // Checks whether `filename` follows the multi-slot format.
+  virtual bool CheckFile(const char* filename);
+
+ protected:
+  // Merges one parsed instance into the batch being assembled.
+  virtual void AddInstanceToInsVec(std::vector<MultiSlotType>* vec_ins,
+                                   const std::vector<MultiSlotType>& instance,
+                                   int index);
+  // Parses the next line of the current file into *instance.
+  virtual bool ParseOneInstance(std::vector<MultiSlotType>* instance);
+  // Copies an assembled batch into the bound feed tensors.
+  virtual void PutToFeedVec(const std::vector<MultiSlotType>& ins_vec);
+};
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/data_feed.proto
+++ b/paddle/fluid/framework/data_feed.proto
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+syntax = "proto2";
+package paddle.framework;
+
+// Describes one slot (one field group) of a multi-slot data line.
+message Slot {
+  // Alias of the slot; also the name used to bind a feed variable.
+  required string name = 1;
+  // Value type of the slot: "float" or "uint64".
+  required string type = 2;
+  // Whether the fed tensor is reshaped to {batch_size, dim}.
+  optional bool is_dense = 3 [ default = false ];
+  // Whether the slot is fed to the program; unused slots are skipped.
+  optional bool is_used = 4 [ default = false ];
+}
+
+// The slots of a data line, in the order they appear in the data.
+message MultiSlotDesc { repeated Slot slots = 1; }
+
+// Configuration of a DataFeed.
+message DataFeedDesc {
+  // Name of the DataFeed class to create, e.g. "MultiSlotDataFeed".
+  optional string name = 1;
+  // Number of instances per batch; MultiSlotDataFeed also uses it as the
+  // size of its private queue.
+  optional int32 batch_size = 2 [ default = 32 ];
+  // Slot layout; required by MultiSlotDataFeed.
+  optional MultiSlotDesc multi_slot_desc = 3;
+}
--- a/paddle/fluid/framework/data_feed_factory.cc
+++ b/paddle/fluid/framework/data_feed_factory.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/data_feed_factory.h"
+#include <cstdio>
+#include <cstdlib>
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+#include "paddle/fluid/framework/data_feed.h"
+
+namespace paddle {
+namespace framework {
+// Signature of a function that creates a new DataFeed instance.
+typedef std::shared_ptr<DataFeed> (*Createdata_feedFunction)();
+// Maps a DataFeed class name to the function that creates it.
+typedef std::unordered_map<std::string, Createdata_feedFunction> data_feedMap;
+// Registry filled by REGISTER_DATAFEED_CLASS and read by DataFeedFactory.
+data_feedMap g_data_feed_map;
+
+// Registers `data_feed_class` in g_data_feed_map under its stringified name.
+//
+// Inside an anonymous namespace the macro defines a creator function that
+// returns a new `data_feed_class` wrapped in a shared_ptr, plus a helper
+// class whose constructor stores that creator in the registry. A namespace
+// scope object of the helper class is defined as well, so the registration
+// happens when that object is constructed.
+#define REGISTER_DATAFEED_CLASS(data_feed_class)                      \
+  namespace {                                                         \
+  std::shared_ptr<DataFeed> Creator_##data_feed_class() {             \
+    return std::shared_ptr<DataFeed>(new data_feed_class);            \
+  }                                                                   \
+  class __Registerer_##data_feed_class {                              \
+   public:                                                            \
+    __Registerer_##data_feed_class() {                                \
+      g_data_feed_map[#data_feed_class] = &Creator_##data_feed_class; \
+    }                                                                 \
+  };                                                                  \
+  __Registerer_##data_feed_class g_registerer_##data_feed_class;      \
+  }  // namespace
+
+// Returns the names of all registered DataFeed classes joined by ", ".
+// The names appear in the iteration order of the registry map.
+std::string DataFeedFactory::DataFeedTypeList() {
+  std::string joined_names;
+  bool is_first = true;
+  for (const auto& entry : g_data_feed_map) {
+    if (!is_first) {
+      joined_names += ", ";
+    }
+    is_first = false;
+    joined_names += entry.first;
+  }
+  return joined_names;
+}
+
+// Creates a new instance of the DataFeed class registered under
+// `data_feed_class`.
+//
+// When no class with that name has been registered, a message naming the
+// requested class and listing the registered ones is written to stderr and
+// the process exits with status -1.
+std::shared_ptr<DataFeed> DataFeedFactory::CreateDataFeed(
+    std::string data_feed_class) {
+  // One lookup serves both the existence check and the creator retrieval.
+  auto creator_it = g_data_feed_map.find(data_feed_class);
+  if (creator_it == g_data_feed_map.end()) {
+    // Say why the process is about to terminate instead of exiting silently.
+    std::fprintf(stderr,
+                 "DataFeed \"%s\" is not supported. Supported DataFeed: %s\n",
+                 data_feed_class.c_str(), DataFeedTypeList().c_str());
+    std::exit(-1);
+  }
+  return creator_it->second();
+}
+
+// Make MultiSlotDataFeed creatable by name through DataFeedFactory.
+REGISTER_DATAFEED_CLASS(MultiSlotDataFeed);
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/data_feed_factory.h
+++ b/paddle/fluid/framework/data_feed_factory.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include "paddle/fluid/framework/data_feed.h"
+
+namespace paddle {
+namespace framework {
+// Creates DataFeed instances by class name.
+class DataFeedFactory {
+ public:
+  // Returns the names of the registered DataFeed classes joined by ", ".
+  static std::string DataFeedTypeList();
+  // Returns a new instance of the DataFeed registered as `data_feed_class`.
+  // NOTE(review): the implementation in data_feed_factory.cc terminates the
+  // process when the name is not registered; it does not return null.
+  static std::shared_ptr<DataFeed> CreateDataFeed(std::string data_feed_class);
+};
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/data_feed_test.cc
+++ b/paddle/fluid/framework/data_feed_test.cc
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/data_feed.h"
+#include <fcntl.h>
+#include <chrono>  // NOLINT
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <mutex>  // NOLINT
+#include <set>
+#include <thread>  // NOLINT
+#include <utility>
+#include <vector>
+#include "google/protobuf/io/zero_copy_stream_impl.h"
+#include "google/protobuf/text_format.h"
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/data_feed_factory.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+
+// Reads a text-format DataFeedDesc protobuf from `filename`.
+//
+// Fails through PADDLE_ENFORCE when the file cannot be opened or when its
+// content cannot be parsed as a DataFeedDesc, so that a malformed file is
+// reported instead of silently producing a partially filled descriptor.
+paddle::framework::DataFeedDesc load_datafeed_param_from_file(
+    const char* filename) {
+  paddle::framework::DataFeedDesc data_feed_desc;
+  int file_descriptor = open(filename, O_RDONLY);
+  PADDLE_ENFORCE(file_descriptor != -1, "Can not open %s.", filename);
+  bool parsed = false;
+  {
+    // Scope the stream so it is gone before the descriptor is closed.
+    google::protobuf::io::FileInputStream fileInput(file_descriptor);
+    parsed = google::protobuf::TextFormat::Parse(&fileInput, &data_feed_desc);
+  }
+  // Close before checking the result so the descriptor is not leaked when
+  // the check below fails.
+  close(file_descriptor);
+  PADDLE_ENFORCE(parsed, "Can not parse %s as DataFeedDesc.", filename);
+  return data_feed_desc;
+}
+
+// Loads a list of file names from `filename`, one entry per line.
+// Fails through PADDLE_ENFORCE when the file cannot be opened.
+const std::vector<std::string> load_filelist_from_file(const char* filename) {
+  std::ifstream list_stream(filename);
+  PADDLE_ENFORCE(list_stream.good(), "Can not open %s.", filename);
+  std::vector<std::string> entries;
+  for (std::string entry; getline(list_stream, entry);) {
+    entries.push_back(entry);
+  }
+  list_stream.close();
+  return entries;
+}
+
+// Writes the fixture files used by the test into the working directory:
+//   - `protofile`: a text-format DataFeedDesc with five slots (sparse and
+//     dense variants of uint64 and float, plus one slot that is not used);
+//   - `filelist`: the names of the generated data files, one per line,
+//     without a trailing newline after the last entry;
+//   - four data files "TestMultiSlotDataFeed.data.<i>" with identical content.
+// Each data line holds, for every slot in order, a count followed by that
+// many values.
+void GenerateFileForTest(const char* protofile, const char* filelist) {
+  std::ofstream w_protofile(protofile);
+  w_protofile << "name: \"MultiSlotDataFeed\"\n"
+                 "batch_size: 2\n"
+                 "multi_slot_desc {\n"
+                 "    slots {\n"
+                 "        name: \"uint64_sparse_slot\"\n"
+                 "        type: \"uint64\"\n"
+                 "        is_dense: false\n"
+                 "        is_used: true\n"
+                 "    }\n"
+                 "    slots {\n"
+                 "        name: \"float_sparse_slot\"\n"
+                 "        type: \"float\"\n"
+                 "        is_dense: false\n"
+                 "        is_used: true\n"
+                 "    }\n"
+                 "    slots {\n"
+                 "        name: \"uint64_dense_slot\"\n"
+                 "        type: \"uint64\"\n"
+                 "        is_dense: true\n"
+                 "        is_used: true\n"
+                 "    }\n"
+                 "    slots {\n"
+                 "        name: \"float_dense_slot\"\n"
+                 "        type: \"float\"\n"
+                 "        is_dense: true\n"
+                 "        is_used: true\n"
+                 "    }\n"
+                 "    slots {\n"
+                 "        name: \"not_used_slot\"\n"
+                 "        type: \"uint64\"\n"
+                 "        is_dense: false\n"
+                 "        is_used: false\n"
+                 "    }\n"
+                 "}";
+  w_protofile.close();
+  std::ofstream w_filelist(filelist);
+  int total_file = 4;
+  for (int i = 0; i < total_file; ++i) {
+    std::string filename = "TestMultiSlotDataFeed.data." + std::to_string(i);
+    w_filelist << filename;
+    // Separate entries with a newline but do not terminate the last one.
+    if (i + 1 != total_file) {
+      w_filelist << std::endl;
+    }
+    std::ofstream w_datafile(filename.c_str());
+    // Three records per file; every record covers all five slots.
+    w_datafile << "3 3978 620 82 1 1926.08 1 1926 1 6.02 1 1996\n"
+                  "2 1300 2983353 1 985.211 1 8 1 0.618 1 12\n"
+                  "1 19260827 2 3.14 2.718 1 27 1 2.236 1 28\n";
+    w_datafile.close();
+  }
+  w_filelist.close();
+}
+
+// Collects the distinct values seen for one slot, keeping one std::set per
+// supported element type (uint64_t and float).
+class MultiTypeSet {
+ public:
+  // Both sets start out empty.
+  MultiTypeSet() : uint64_set_(), float_set_() {}
+  ~MultiTypeSet() {}
+
+  // Inserts `v` into the set matching its type; duplicates are dropped.
+  void AddValue(uint64_t v) { uint64_set_.insert(v); }
+  void AddValue(float v) { float_set_.insert(v); }
+
+  // Read-only access to the collected values.
+  const std::set<uint64_t>& GetUint64Set() const { return uint64_set_; }
+  const std::set<float>& GetFloatSet() const { return float_set_; }
+
+ private:
+  std::set<uint64_t> uint64_set_;
+  std::set<float> float_set_;
+};
+
+// Collects into `reader_elem_set` every value produced by `thread_num`
+// DataFeed readers running concurrently.
+//
+// `reader_elem_set` is resized to the number of used slots in
+// `data_feed_desc`; entry k receives the values of the k-th used slot.
+// One reader per thread is created through DataFeedFactory using the name
+// stored in `data_feed_desc`.
+void GetElemSetFromReader(std::vector<MultiTypeSet>* reader_elem_set,
+                          const paddle::framework::DataFeedDesc& data_feed_desc,
+                          const std::vector<std::string>& filelist,
+                          const int thread_num) {
+  // Count the used slots to size the output.
+  int used_slot_num = 0;
+  for (auto i = 0; i < data_feed_desc.multi_slot_desc().slots_size(); ++i) {
+    if (data_feed_desc.multi_slot_desc().slots(i).is_used()) {
+      ++used_slot_num;
+    }
+  }
+  reader_elem_set->resize(used_slot_num);
+  std::vector<std::thread> threads;
+  std::vector<std::shared_ptr<paddle::framework::DataFeed>> readers;
+  readers.resize(thread_num);
+  for (int i = 0; i < thread_num; ++i) {
+    readers[i] = paddle::framework::DataFeedFactory::CreateDataFeed(
+        data_feed_desc.name());
+    readers[i]->Init(data_feed_desc);
+  }
+  // NOTE(review): the file list is set on the first reader only; presumably
+  // it is shared by all readers — confirm in data_feed.h.
+  readers[0]->SetFileList(filelist);
+  // Guards every insertion into *reader_elem_set.
+  std::mutex mu;
+  for (int idx = 0; idx < thread_num; ++idx) {
+    threads.emplace_back(std::thread([&, idx] {
+      // Each thread feeds into variables of its own scope.
+      std::unique_ptr<paddle::framework::Scope> scope(
+          new paddle::framework::Scope());
+      const auto& multi_slot_desc = data_feed_desc.multi_slot_desc();
+      // Slot name -> tensor of the scope variable registered for that slot.
+      std::map<std::string, const paddle::framework::LoDTensor*>
+          lodtensor_targets;
+      for (int i = 0; i < multi_slot_desc.slots_size(); ++i) {
+        const auto& slot = multi_slot_desc.slots(i);
+        if (slot.is_used()) {
+          const auto& name = slot.name();
+          readers[idx]->AddFeedVar(scope->Var(name), name);
+          lodtensor_targets[name] =
+              &scope->FindVar(name)->Get<paddle::framework::LoDTensor>();
+        }
+      }
+      readers[idx]->Start();
+      // Drain the reader; after each successful Next() the registered
+      // tensors are inspected.
+      while (readers[idx]->Next()) {
+        // Position of the current slot among the used slots.
+        int index = 0;
+        for (int k = 0; k < multi_slot_desc.slots_size(); ++k) {
+          const auto& slot = multi_slot_desc.slots(k);
+          if (!slot.is_used()) {
+            continue;
+          }
+          const paddle::framework::LoDTensor* tens =
+              lodtensor_targets[slot.name()];
+          if (slot.is_dense()) {  // dense branch
+            // Dense tensors are walked as dims()[0] x dims()[1] values.
+            if (slot.type() == "uint64") {
+              // uint64 slots are read as int64_t data and cast back.
+              const int64_t* data = tens->data<int64_t>();
+              int batch_size = tens->dims()[0];
+              int dim = tens->dims()[1];
+              for (int i = 0; i < batch_size; ++i) {
+                for (int j = 0; j < dim; ++j) {
+                  std::lock_guard<std::mutex> lock(mu);
+                  (*reader_elem_set)[index].AddValue(
+                      (uint64_t)data[i * dim + j]);
+                }
+              }
+            } else if (slot.type() == "float") {
+              const float* data = tens->data<float>();
+              int batch_size = tens->dims()[0];
+              int dim = tens->dims()[1];
+              for (int i = 0; i < batch_size; ++i) {
+                for (int j = 0; j < dim; ++j) {
+                  std::lock_guard<std::mutex> lock(mu);
+                  (*reader_elem_set)[index].AddValue(data[i * dim + j]);
+                }
+              }
+            } else {
+              PADDLE_THROW("Error type in proto file.");
+            }
+          } else {  // sparse branch
+            // Sparse tensors are walked through the [first, second) ranges
+            // returned by lod_element(0, i).
+            if (slot.type() == "uint64") {
+              const int64_t* data = tens->data<int64_t>();
+              for (size_t i = 0; i < tens->NumElements(); ++i) {
+                std::pair<size_t, size_t> element = tens->lod_element(0, i);
+                for (size_t j = element.first; j < element.second; ++j) {
+                  std::lock_guard<std::mutex> lock(mu);
+                  (*reader_elem_set)[index].AddValue((uint64_t)data[j]);
+                }
+              }
+            } else if (slot.type() == "float") {
+              const float* data = tens->data<float>();
+              for (size_t i = 0; i < tens->NumElements(); ++i) {
+                std::pair<size_t, size_t> element = tens->lod_element(0, i);
+                for (size_t j = element.first; j < element.second; ++j) {
+                  std::lock_guard<std::mutex> lock(mu);
+                  (*reader_elem_set)[index].AddValue(data[j]);
+                }
+              }
+            } else {
+              PADDLE_THROW("Error type in proto file.");
+            }
+          }  // end sparse branch
+          ++index;
+        }  // end slots loop
+      }    // end while Next()
+    }));   // end anonymous function
+  }
+  // Wait until every reader thread has finished.
+  for (auto& th : threads) {
+    th.join();
+  }
+}
+
+// Verifies that `s1` and `s2` hold the same number of MultiTypeSets and that
+// the sets at each position contain identical uint64 and float values.
+//
+// Size mismatches are fatal (ASSERT_EQ) and end this function immediately:
+// continuing after a mismatch would index `s2` out of range or advance an
+// iterator past the end of the shorter set. Individual value mismatches stay
+// non-fatal (EXPECT_EQ) so that all differing values are reported.
+void CheckIsUnorderedSame(const std::vector<MultiTypeSet>& s1,
+                          const std::vector<MultiTypeSet>& s2) {
+  ASSERT_EQ(s1.size(), s2.size());
+  for (size_t i = 0; i < s1.size(); ++i) {
+    // check for uint64
+    const std::set<uint64_t>& uint64_s1 = s1[i].GetUint64Set();
+    const std::set<uint64_t>& uint64_s2 = s2[i].GetUint64Set();
+    ASSERT_EQ(uint64_s1.size(), uint64_s2.size());
+    // std::set iterates in sorted order, so equal sets match pairwise.
+    auto uint64_it1 = uint64_s1.begin();
+    auto uint64_it2 = uint64_s2.begin();
+    while (uint64_it1 != uint64_s1.end()) {
+      EXPECT_EQ(*uint64_it1, *uint64_it2);
+      ++uint64_it1;
+      ++uint64_it2;
+    }
+    // check for float
+    const std::set<float>& float_s1 = s1[i].GetFloatSet();
+    const std::set<float>& float_s2 = s2[i].GetFloatSet();
+    ASSERT_EQ(float_s1.size(), float_s2.size());
+    auto float_it1 = float_s1.begin();
+    auto float_it2 = float_s2.begin();
+    while (float_it1 != float_s1.end()) {
+      EXPECT_EQ(*float_it1, *float_it2);
+      ++float_it1;
+      ++float_it2;
+    }
+  }
+}
+
+// Collects into `file_elem_set` every value found in the data files listed
+// in `filelist`, parsing them directly instead of going through a DataFeed.
+//
+// `file_elem_set` is resized to the number of used slots in
+// `data_feed_desc`; entry k receives the values of the k-th used slot.
+// A record holds, for every slot in order, a count followed by that many
+// values; values of slots that are not used are read and discarded.
+//
+// Fails through PADDLE_ENFORCE when a file cannot be opened or when a value
+// announced by a count cannot be read, and through PADDLE_THROW when a slot
+// has a type other than "uint64" or "float".
+void GetElemSetFromFile(std::vector<MultiTypeSet>* file_elem_set,
+                        const paddle::framework::DataFeedDesc& data_feed_desc,
+                        const std::vector<std::string>& filelist) {
+  const auto& multi_slot_desc = data_feed_desc.multi_slot_desc();
+  int used_slot_num = 0;
+  for (int i = 0; i < multi_slot_desc.slots_size(); ++i) {
+    if (multi_slot_desc.slots(i).is_used()) {
+      ++used_slot_num;
+    }
+  }
+  file_elem_set->resize(used_slot_num);
+  // Without slots no record can ever be consumed, so the per-file loop below
+  // would never reach the end of a file.
+  if (multi_slot_desc.slots_size() == 0) {
+    return;
+  }
+  for (const auto& file : filelist) {
+    std::ifstream fin(file.c_str());
+    PADDLE_ENFORCE(fin.good(), "Can not open %s.", file.c_str());
+    bool reached_end = false;
+    while (!reached_end) {
+      // Position of the current slot among the used slots.
+      int index = 0;
+      for (int i = 0; i < multi_slot_desc.slots_size(); ++i) {
+        int num;
+        if (!(fin >> num)) {
+          // No further count can be read: the file is exhausted.
+          reached_end = true;
+          break;
+        }
+        const auto& slot = multi_slot_desc.slots(i);
+        const auto& type = slot.type();
+        // A bounded loop keeps a negative count from spinning.
+        if (type == "uint64") {
+          for (int k = 0; k < num; ++k) {
+            uint64_t feasign;
+            fin >> feasign;
+            PADDLE_ENFORCE(!fin.fail(), "Can not read uint64 value from %s.",
+                           file.c_str());
+            if (slot.is_used()) {
+              (*file_elem_set)[index].AddValue(feasign);
+            }
+          }
+        } else if (type == "float") {
+          for (int k = 0; k < num; ++k) {
+            float feasign;
+            fin >> feasign;
+            PADDLE_ENFORCE(!fin.fail(), "Can not read float value from %s.",
+                           file.c_str());
+            if (slot.is_used()) {
+              (*file_elem_set)[index].AddValue(feasign);
+            }
+          }
+        } else {
+          PADDLE_THROW("Error type in proto file.");
+        }
+        if (slot.is_used()) {
+          ++index;
+        }
+      }
+    }
+    fin.close();
+  }
+}
+
+// Generates the fixture files, reads them once through four concurrent
+// DataFeed readers and once by parsing the files directly, and checks that
+// both ways yield the same per-slot value sets.
+TEST(DataFeed, MultiSlotUnitTest) {
+  const char* desc_path = "data_feed_desc.prototxt";
+  const char* list_path = "filelist.txt";
+  GenerateFileForTest(desc_path, list_path);
+
+  const std::vector<std::string> data_files =
+      load_filelist_from_file(list_path);
+  paddle::framework::DataFeedDesc desc =
+      load_datafeed_param_from_file(desc_path);
+
+  const int reader_thread_num = 4;
+  std::vector<MultiTypeSet> values_from_reader;
+  std::vector<MultiTypeSet> values_from_file;
+  GetElemSetFromReader(&values_from_reader, desc, data_files,
+                       reader_thread_num);
+  GetElemSetFromFile(&values_from_file, desc, data_files);
+  CheckIsUnorderedSame(values_from_reader, values_from_file);
+}
--- a/paddle/fluid/framework/data_layout_transform.cc
+++ b/paddle/fluid/framework/data_layout_transform.cc
@@ -151,19 +151,22 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
  auto out_format =
      platform::MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout));

-  void* in_data = GetDataFromTensor(in, in_type);
-
  // output tensor has the same dims as input. Reorder don't change dims
  out->Resize(in.dims());

-  auto out_data = out->mutable_data(expected_kernel_type.place_, in.type());
-
-  auto in_memory = memory({{{in_tz}, in_type, in_format}, cpu_engine}, in_data);
-  auto out_memory =
-      memory({{{out_tz}, out_type, out_format}, cpu_engine}, out_data);
+  if (in_format != out_format) {
+    void* in_data = GetDataFromTensor(in, in_type);
+    auto out_data = out->mutable_data(expected_kernel_type.place_, in.type());

-  platform::Reorder(in_memory, out_memory);
+    auto in_memory =
+        memory({{{in_tz}, in_type, in_format}, cpu_engine}, in_data);
+    auto out_memory =
+        memory({{{out_tz}, out_type, out_format}, cpu_engine}, out_data);

+    platform::Reorder(in_memory, out_memory);
+  } else {
+    out->ShareDataWith(in);
+  }
  out->set_layout(out_layout);
  // reset format since the out tensor will be feed to non-MKLDNN OPkernel
  out->set_format(memory::format::format_undef);

--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -15,14 +15,26 @@ cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_ro
 if(WITH_GPU)
    nv_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
            dynload_cuda variable_visitor)
-    nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim dynload_cuda)
+    if(WITH_DISTRIBUTE)
+        nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
+            ddim dynload_cuda selected_rows_functor sendrecvop_grpc)
+    else()
+        nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
+            ddim dynload_cuda selected_rows_functor)
+    endif()
    nv_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda)
    nv_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle)

 else()
    cc_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
             variable_visitor)
-    cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim)
+    if(WITH_DISTRIBUTE)
+        cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
+            ddim selected_rows_functor sendrecvop_grpc)
+    else()
+        cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
+            ddim selected_rows_functor)
+    endif()
    cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
    cc_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle)
 endif()
@@ -33,10 +45,10 @@ cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base s

 cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper)

-if (WITH_GPU)
-  cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle scale_loss_grad_op_handle rpc_op_handle
-          all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass)
-endif()
+cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle)
+cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper)
+cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass)
+cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass op_graph_view reference_count_pass_helper)

 cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass)
 cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_helper pass)
@@ -44,10 +56,7 @@ cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_he
 cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
        scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle)

-set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass) 
-if (WITH_GPU)
-  list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass)
-endif()
+set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass) 

 cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS})


--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -48,7 +48,14 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
 void AllReduceOpHandle::RunImpl() {
  platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second);

+// FIXME(typhoonzero): If scope0(global scope) have NCCL_ID_VAR,
+// this is a distributed or inter-process call, find a better way.
+#ifdef PADDLE_WITH_CUDA
+  if (NoDummyInputSize() == 1 &&
+      local_scopes_[0]->FindLocalVar(NCCL_ID_VARNAME) == nullptr) {
+#else
  if (NoDummyInputSize() == 1) {
+#endif
    return;  // No need to all reduce when GPU count = 1;
  } else {
    // Wait input done

--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -58,10 +58,23 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
      }
    }

+    CollectiveContext *context = CollectiveContext::GetInstance();
+    context->endpoints_ = strategy_.trainers_endpoints_;
+    context->trainer_id_ = strategy_.trainer_id_;
+    PADDLE_ENFORCE(strategy_.trainer_id_ >= 0, "trainer_id_ >= 0");
+    if (strategy_.trainer_id_ > 0) {
+      PADDLE_ENFORCE((unsigned)(strategy_.trainer_id_) <
+                         strategy_.trainers_endpoints_.size(),
+                     "trainer_id_ < endpoints_ size");
+    }
+    VLOG(1) << "CollectiveContext:" << context->String();
+
    // Convert graph to run on multi-devices.
    auto multi_devices_pass = AppendPass("multi_devices_pass");
    multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy",
                                                         &strategy_);
+    multi_devices_pass->Set<int>("num_trainers",
+                                 new int(strategy_.num_trainers_));

    // Add a graph print pass to record a graph with device info.
    if (!strategy_.debug_graphviz_path_.empty()) {
@@ -133,16 +146,16 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
      pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
 #endif
    } else if (pass->Type() == "sequential_execution_pass") {
-      VLOG(1) << "set enable_sequential_execution:"
-              << enable_sequential_execution_;
+      LOG(INFO) << "set enable_sequential_execution:"
+                << enable_sequential_execution_;

      pass->Erase(kAllOpDescs);
      pass->Set<const std::vector<OpDesc *>>(
          kAllOpDescs,
          new std::vector<OpDesc *>(main_program.Block(0).AllOps()));
    } else if (pass->Type() == "all_reduce_deps_pass") {
-      VLOG(1) << "SeqOnlyAllReduceOps:" << SeqOnlyAllReduceOps(*this)
-              << ", num_trainers:" << num_trainers_;
+      LOG(INFO) << "SeqOnlyAllReduceOps:" << SeqOnlyAllReduceOps(*this)
+                << ", num_trainers:" << num_trainers_;

      pass->Erase(kAllOpDescs);
      pass->Set<const std::vector<OpDesc *>>(

--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -74,6 +74,8 @@ struct BuildStrategy {
  bool fuse_broadcast_op_{false};

  int num_trainers_{1};
+  int trainer_id_{0};
+  std::vector<std::string> trainers_endpoints_;
  bool remove_unnecessary_lock_{false};

  // NOTE:

--- a/paddle/fluid/framework/details/computation_op_handle.cc
+++ b/paddle/fluid/framework/details/computation_op_handle.cc
@@ -20,11 +20,13 @@ namespace paddle {
 namespace framework {
 namespace details {
 ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope,
-                                         platform::Place place)
+                                         platform::Place place,
+                                         size_t scope_idx)
    : OpHandleBase(node),
      op_(framework::OpRegistry::CreateOp(*node->Op())),
      scope_(scope),
-      place_(place) {}
+      place_(place),
+      scope_idx_(scope_idx) {}

 void ComputationOpHandle::RunImpl() {
  WaitInputVarGenerated(place_);

--- a/paddle/fluid/framework/details/computation_op_handle.h
+++ b/paddle/fluid/framework/details/computation_op_handle.h
@@ -28,7 +28,8 @@ namespace framework {
 namespace details {
 struct ComputationOpHandle : public OpHandleBase {
 public:
-  ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place);
+  ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place,
+                      size_t scope_idx);

  std::string Name() const override;

@@ -38,6 +39,8 @@ struct ComputationOpHandle : public OpHandleBase {

  void SetLockAndRecordEventFree(bool b) { is_lock_and_record_event_free_ = b; }

+  size_t GetScopeIdx() const { return scope_idx_; }
+
 protected:
  void RunImpl() override;

@@ -47,6 +50,7 @@ struct ComputationOpHandle : public OpHandleBase {
  std::unique_ptr<OperatorBase> op_;
  Scope *scope_;
  platform::Place place_;
+  size_t scope_idx_;
  bool is_lock_and_record_event_free_{false};
 };
 }  // namespace details

--- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc
+++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
+#include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/cuda_device_guard.h"
+#endif
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+// Stores the scope, the variable names to release, the garbage collector and
+// the reference count map. In CUDA builds with a GPU `place`, the device
+// context of that place is looked up as well, and a timing-disabled CUDA
+// event is created when `gc` is a StreamGarbageCollector.
+EagerDeletionOpHandle::EagerDeletionOpHandle(
+    ir::Node *node, const Scope *scope, const platform::Place &place,
+    const std::unordered_set<std::string> &var_names, GarbageCollector *gc,
+    AtomicReferenceCountMap *ref_cnts)
+    : OpHandleBase(node),
+      scope_(scope),
+      var_names_(var_names),
+      gc_(gc),
+      ref_cnts_(ref_cnts) {
+#ifdef PADDLE_WITH_CUDA
+  if (!platform::is_gpu_place(place)) {
+    return;
+  }
+  dev_ctx_ = reinterpret_cast<platform::CUDADeviceContext *>(
+      platform::DeviceContextPool::Instance().Get(place));
+  const bool uses_stream_gc =
+      dynamic_cast<StreamGarbageCollector *>(gc_) != nullptr;
+  if (uses_stream_gc) {
+    // Create the event while the device of `place` is the current device.
+    platform::CUDADeviceGuard guard(
+        boost::get<platform::CUDAPlace>(place).device);
+    PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
+    PADDLE_ENFORCE_NOT_NULL(event_);
+  }
+#endif
+}
+
+// Destroys the CUDA event, if the constructor created one, with the device
+// of the stored device context selected.
+EagerDeletionOpHandle::~EagerDeletionOpHandle() {
+#ifdef PADDLE_WITH_CUDA
+  if (!event_) {
+    return;
+  }
+  const auto owning_place =
+      boost::get<platform::CUDAPlace>(dev_ctx_->GetPlace());
+  platform::CUDADeviceGuard guard(owning_place.device);
+  PADDLE_ENFORCE(cudaEventDestroy(event_));
+#endif
+}
+
+// Name under which this op handle is reported.
+std::string EagerDeletionOpHandle::Name() const {
+  return std::string("eager_deletion");
+}
+
+// Decrements the reference count of every variable in var_names_ and hands
+// the memory of those whose count drops to zero to the garbage collector.
+void EagerDeletionOpHandle::RunImpl() {
+  auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
+  std::deque<std::shared_ptr<memory::Allocation>> garbages;
+  for (auto &name : var_names_) {
+    auto ref_cnt_it = ref_cnts_->find(name);
+    // Skip variables that have no reference count entry.
+    if (ref_cnt_it == ref_cnts_->end()) {
+      continue;
+    }
+    // fetch_sub returns the previous value: anything other than 1 means the
+    // reference count has not decreased to 0 yet.
+    if (ref_cnt_it->second.fetch_sub(1) != 1) {
+      continue;
+    }
+
+    auto *var = exec_scope->FindVar(name);
+    if (var == nullptr) {
+      continue;
+    }
+
+    VLOG(2) << "Erase variable " << name;
+
+    // Take the memory holder(s) out of the variable, depending on its type.
+    if (var->IsType<LoDTensor>()) {
+      auto *tensor = var->GetMutable<LoDTensor>();
+      garbages.emplace_back(tensor->MoveMemoryHolder());
+    } else if (var->IsType<SelectedRows>()) {
+      auto *rows_value = var->GetMutable<SelectedRows>()->mutable_value();
+      garbages.emplace_back(rows_value->MoveMemoryHolder());
+    } else if (var->IsType<LoDTensorArray>()) {
+      for (auto &t : *var->GetMutable<LoDTensorArray>()) {
+        garbages.emplace_back(t.MoveMemoryHolder());
+      }
+    } else {
+      PADDLE_THROW("Type %s of %s is not supported eager deletion",
+                   var->Type().name(), name);
+    }
+  }
+
+  if (garbages.empty()) {
+    return;
+  }
+  ClearGarbages(&garbages);
+}
+
+// Hands `garbages` over to the garbage collector; the deque is moved from.
+//
+// When a CUDA event exists (the constructor creates one only for a
+// StreamGarbageCollector on a GPU place), a callback is passed along that
+// records the event on the compute stream and makes the collector's stream
+// wait for it. Otherwise the garbage is added without a callback.
+void EagerDeletionOpHandle::ClearGarbages(
+    std::deque<std::shared_ptr<memory::Allocation>> *garbages) {
+#ifdef PADDLE_WITH_CUDA
+  if (event_) {
+    auto compute_stream = dev_ctx_->stream();
+    // event_ is only set after the constructor's dynamic_cast confirmed the
+    // collector type, so a static downcast is the appropriate cast here;
+    // reinterpret_cast would not apply a base-to-derived pointer adjustment.
+    auto callback_stream =
+        static_cast<StreamGarbageCollector *>(gc_)->stream();
+    // Copy the event handle so the callback does not need to capture `this`.
+    auto event = event_;
+    auto callback_func = [compute_stream, callback_stream, event]() {
+      PADDLE_ENFORCE(cudaEventRecord(event, compute_stream));
+      PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event, 0));
+    };
+    gc_->Add(std::move(*garbages), callback_func);
+  } else {
+#endif
+    gc_->Add(std::move(*garbages));
+#ifdef PADDLE_WITH_CUDA
+  }
+#endif
+}
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/eager_deletion_op_handle.h
+++ b/paddle/fluid/framework/details/eager_deletion_op_handle.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <deque>
+#include <string>
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/details/reference_count_pass_helper.h"
+
+namespace paddle {
+namespace framework {
+class Scope;
+
+namespace details {
+
+// Op handle that, when run, decrements the reference counts of a set of
+// variables and passes the memory of those whose count reaches zero to a
+// garbage collector.
+class EagerDeletionOpHandle : public OpHandleBase {
+ public:
+  // `scope` is used to find the local execution scope at run time,
+  // `var_names` are the variables this handle is responsible for, and
+  // `gc` / `ref_cnts` are borrowed pointers.
+  EagerDeletionOpHandle(ir::Node *node, const Scope *scope,
+                        const platform::Place &place,
+                        const std::unordered_set<std::string> &var_names,
+                        GarbageCollector *gc,
+                        AtomicReferenceCountMap *ref_cnts);
+
+  // In CUDA builds, destroys the CUDA event if one was created.
+  ~EagerDeletionOpHandle();
+
+  // Returns "eager_deletion".
+  std::string Name() const override;
+
+ protected:
+  // Collects the memory of variables whose reference count dropped to zero.
+  void RunImpl() override;
+
+ private:
+  // Moves `garbages` into the garbage collector.
+  void ClearGarbages(std::deque<std::shared_ptr<memory::Allocation>> *garbages);
+
+  const Scope *scope_;
+  std::unordered_set<std::string> var_names_;
+  GarbageCollector *gc_;               // not own
+  AtomicReferenceCountMap *ref_cnts_;  // not own
+#ifdef PADDLE_WITH_CUDA
+  // Set only when the handle is created for a GPU place.
+  platform::CUDADeviceContext *dev_ctx_{nullptr};
+  // Created only for a GPU place combined with a StreamGarbageCollector.
+  cudaEvent_t event_{nullptr};
+#endif
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/eager_deletion_pass.cc
+++ b/paddle/fluid/framework/details/eager_deletion_pass.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <queue>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/details/computation_op_handle.h"
+#include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
+#include "paddle/fluid/framework/details/eager_deletion_pass.h"
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+// Creates one EagerDeletionOpHandle per last-live op and wires it behind that
+// op through a control-dependency variable. Returns the graph passed in.
+std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  // The runtime reference counts must still be empty: this pass sizes them,
+  // one map per entry of the graph's variable list.
+  auto &runtime_ref_cnts =
+      Get<std::vector<AtomicReferenceCountMap>>(kRuntimeReferenceCount);
+  PADDLE_ENFORCE(runtime_ref_cnts.empty(),
+                 "kRuntimeReferenceCount should be initialized here!");
+  runtime_ref_cnts.resize(graph->Get<GraphVars>(kGraphVars).size());
+
+  const auto &last_live_ops =
+      Get<std::vector<LastLiveOpsOfVars>>(kLastLiveOpsOfVars);
+  const auto &gc_map = Get<GarbageCollectorMap>(kGarbageCollector);
+  const auto &all_places = Get<std::vector<platform::Place>>(kAllPlaces);
+
+  // Invert last_live_ops: for every last-live op, collect the names of the
+  // variables that can be deleted after it.
+  std::unordered_map<ComputationOpHandle *, std::unordered_set<std::string>>
+      vars_of_op;
+  for (const auto &ops_of_vars : last_live_ops) {
+    for (const auto &name_and_ops : ops_of_vars) {
+      for (auto *last_op : name_and_ops.second) {
+        vars_of_op[last_op].insert(name_and_ops.first);
+      }
+    }
+  }
+
+  for (auto &op_and_vars : vars_of_op) {
+    auto *op = op_and_vars.first;
+    const auto scope_idx = op->GetScopeIdx();
+
+    auto *deletion_node =
+        graph->CreateEmptyNode("eager_deletion", ir::Node::Type::kOperation);
+    auto *deletion_op = new EagerDeletionOpHandle(
+        deletion_node, op->GetScope(), op->GetPlace(), op_and_vars.second,
+        gc_map.at(all_places[scope_idx]).get(), &(runtime_ref_cnts[scope_idx]));
+
+    // Reuse the first dummy (control-dependency) output of `op` if it has
+    // one; otherwise create a new one and attach it to `op`.
+    VarHandleBase *ctrl_dep = nullptr;
+    for (auto *out_var : op->Outputs()) {
+      if (dynamic_cast<DummyVarHandle *>(out_var) != nullptr) {
+        ctrl_dep = out_var;
+        break;
+      }
+    }
+    if (ctrl_dep == nullptr) {
+      auto *new_dep = new DummyVarHandle(graph->CreateControlDepVar());
+      graph->Get<GraphDepVars>(kGraphDepVars).emplace(new_dep);
+      op->AddOutput(new_dep);
+      ctrl_dep = new_dep;
+    }
+    deletion_op->AddInput(ctrl_dep);
+
+    // Give the deletion op a dummy output of its own.
+    auto *leaf = new DummyVarHandle(graph->CreateControlDepVar());
+    graph->Get<GraphDepVars>(kGraphDepVars).emplace(leaf);
+    deletion_op->AddOutput(leaf);
+  }
+
+  VLOG(10) << "Create " << vars_of_op.size() << " EagerDeletionOpHandle(s)";
+  return graph;
+}
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
+
+// Registers the pass as "eager_deletion_pass". The four attributes required
+// below are the ones ApplyImpl() reads through Get<>().
+REGISTER_PASS(eager_deletion_pass,
+              paddle::framework::details::EagerDeletionPass)
+    .RequirePassAttr(paddle::framework::details::kRuntimeReferenceCount)
+    .RequirePassAttr(paddle::framework::details::kLastLiveOpsOfVars)
+    .RequirePassAttr(paddle::framework::details::kAllPlaces)
+    .RequirePassAttr(paddle::framework::details::kGarbageCollector);
--- a/paddle/fluid/framework/details/eager_deletion_pass.h
+++ b/paddle/fluid/framework/details/eager_deletion_pass.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+// Graph pass that creates one EagerDeletionOpHandle per last-live op recorded
+// in kLastLiveOpsOfVars (implemented in eager_deletion_pass.cc).
+class EagerDeletionPass : public ir::Pass {
+ protected:
+  // Adds the eager deletion ops to `graph` and returns that same graph.
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -133,6 +133,7 @@ static const char kPlaces[] = "places";
 static const char kParams[] = "params";
 static const char kLocalScopes[] = "local_scopes";
 static const char kStrategy[] = "strategy";
+static const char kNumTrainers[] = "num_trainers";

 void MultiDevSSAGraphBuilder::Init() const {
  all_vars_.clear();
@@ -299,6 +300,8 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
  auto nodes = graph->ReleaseNodes();
  ir::Graph &result = *graph;

+  int num_trainers = Get<int>(kNumTrainers);
+
  for (auto &node : nodes) {
    if (node->IsVar() && node->Var()) {
      all_vars_.emplace(node->Name(), node->Var());
@@ -383,7 +386,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
          CreateComputationalOps(&result, node, places_.size());
        }

-        if (!is_forwarding && places_.size() > 1) {
+        if (!is_forwarding && (places_.size() > 1 || num_trainers > 1)) {
          // Currently, we assume that once gradient is generated, it can be
          // broadcast, and each gradient is only broadcast once.
          if (static_cast<bool>(boost::get<int>(node->Op()->GetAttr(
@@ -562,7 +565,7 @@ void MultiDevSSAGraphBuilder::CreateComputationalOp(ir::Graph *result,
                                                    int dev_id) const {
  result->Get<GraphOps>(kGraphOps).emplace_back(
      new ComputationOpHandle(result->CreateOpNode(node->Op()),
-                              local_scopes_[dev_id], places_[dev_id]));
+                              local_scopes_[dev_id], places_[dev_id], dev_id));
  CreateOpHandleIOs(result, node, dev_id);
 }

@@ -685,8 +688,8 @@ void MultiDevSSAGraphBuilder::CreateComputationalOps(ir::Graph *result,
  for (size_t scope_idx = 0; scope_idx < num_places; ++scope_idx) {
    auto p = places_[scope_idx];
    auto s = local_scopes_[scope_idx];
-    result->Get<GraphOps>(kGraphOps).emplace_back(
-        new ComputationOpHandle(result->CreateOpNode(node->Op()), s, p));
+    result->Get<GraphOps>(kGraphOps).emplace_back(new ComputationOpHandle(
+        result->CreateOpNode(node->Op()), s, p, scope_idx));
    CreateOpHandleIOs(result, node, scope_idx);
  }
 }
@@ -862,7 +865,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp(
      if (node->Op()->Type() == "fetch_barrier") {
        outvar_dev_id =
            GetVarDeviceID(*result, output->Name(), *sharded_var_device);
-        PADDLE_ENFORCE_NE(outvar_dev_id, -1);
+        PADDLE_ENFORCE_NE(outvar_dev_id, -1, "output name %s", output->Name());
      }
      p = places_[outvar_dev_id];
      ir::Node *new_node = nullptr;
@@ -895,4 +898,5 @@ REGISTER_PASS(multi_devices_pass,
    .RequirePassAttr(paddle::framework::details::kPlaces)
    .RequirePassAttr(paddle::framework::details::kParams)
    .RequirePassAttr(paddle::framework::details::kLocalScopes)
-    .RequirePassAttr(paddle::framework::details::kStrategy);
+    .RequirePassAttr(paddle::framework::details::kStrategy)
+    .RequirePassAttr(paddle::framework::details::kNumTrainers);
--- a/paddle/fluid/framework/details/op_graph_view.cc
+++ b/paddle/fluid/framework/details/op_graph_view.cc
@@ -23,6 +23,8 @@ namespace details {
 OpGraphView::OpGraphView(const std::vector<OpHandleBase *> &ops) { Build(ops); }

 void OpGraphView::Build(const std::vector<OpHandleBase *> &ops) {
+  preceding_ops_.clear();
+  pending_ops_.clear();
  for (auto &op : ops) {
    preceding_ops_[op];
    pending_ops_[op];
@@ -40,6 +42,7 @@ void OpGraphView::Build(const std::vector<OpHandleBase *> &ops) {

 std::unordered_set<OpHandleBase *> OpGraphView::AllOps() const {
  std::unordered_set<OpHandleBase *> ret;
+  ret.reserve(preceding_ops_.size());
  for (auto &pair : preceding_ops_) {
    ret.insert(pair.first);
  }

--- a/paddle/fluid/framework/details/op_graph_view.h
+++ b/paddle/fluid/framework/details/op_graph_view.h
@@ -14,7 +14,7 @@

 #pragma once

-#include <memory>
+#include <queue>
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
@@ -34,6 +34,11 @@ class OpGraphView {

  bool HasOp(OpHandleBase *op) const;

+  // Use a visitor to visit all pending ops of op
+  // Stop when callback returns false
+  template <typename Callback>
+  bool VisitAllPendingOps(OpHandleBase *op, Callback &&callback) const;
+
 private:
  void Build(const std::vector<OpHandleBase *> &ops);
  void EnforceHasOp(OpHandleBase *op) const;
@@ -44,6 +49,28 @@ class OpGraphView {
      pending_ops_;
 };

+// Breadth-first walk over every op reachable from `op` via pending_ops_.
+// `callback` is invoked once per reached op; the walk stops and returns false
+// as soon as a callback returns false, otherwise it returns true.
+template <typename Callback>
+bool OpGraphView::VisitAllPendingOps(OpHandleBase *op,
+                                     Callback &&callback) const {
+  EnforceHasOp(op);
+  std::unordered_set<OpHandleBase *> visited;
+  std::queue<OpHandleBase *> q;
+  q.push(op);
+  do {
+    op = q.front();
+    q.pop();
+    for (auto &pending_op : pending_ops_.at(op)) {
+      // insert() reports whether pending_op is seen for the first time.
+      if (!visited.insert(pending_op).second) continue;
+      if (!callback(pending_op)) return false;
+      // Enqueue it so that indirect successors are visited as well; without
+      // this push the walk stops at the direct successors of `op`.
+      q.push(pending_op);
+    }
+  } while (!q.empty());
+  return true;
+}
+
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/details/reduce_and_gather.h
+++ b/paddle/fluid/framework/details/reduce_and_gather.h
@@ -53,7 +53,7 @@ struct ReduceLoDTensor {
  }
 };

-inline void GatherSelectedRows(
+inline void GatherLocalSelectedRows(
    const std::vector<const SelectedRows *> &src_selecte_rows_,
    const std::vector<platform::Place> &in_places,
    const std::map<platform::Place, platform::DeviceContext *> &dev_ctxes,

--- a/paddle/fluid/framework/details/reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle.cc
@@ -16,6 +16,12 @@
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/reduce_and_gather.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
+#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE
+#include "paddle/fluid/operators/distributed/collective_client.h"
+#include "paddle/fluid/operators/distributed/collective_server.h"
+#include "paddle/fluid/operators/distributed/request_handler.h"
+#endif
+#include "paddle/fluid/operators/math/selected_rows_functor.h"
 #include "paddle/fluid/platform/profiler.h"

 DEFINE_bool(
@@ -26,6 +32,112 @@ namespace paddle {
 namespace framework {
 namespace details {

+// Definitions of CollectiveContext's static members (declared in
+// reduce_op_handle.h).
+std::once_flag CollectiveContext::init_flag_;
+std::unique_ptr<CollectiveContext> CollectiveContext::context_;
+
+// Builds the name of the temporary variable that holds trainer
+// `trainer_id`'s merged copy of `var_name`.
+static inline std::string GetRemoteVarName(const std::string &var_name,
+                                           int trainer_id) {
+  std::string remote_name =
+      string::Sprintf("%s_merged_tmp@trainer_%d", var_name, trainer_id);
+  return remote_name;
+}
+
+// Calls Wait() on every device context in `dev_ctxes`, one after another.
+void ReduceOpHandle::Wait(
+    const std::map<platform::Place, platform::DeviceContext *> &dev_ctxes) {
+  // TODO(gongwb): use event wait?
+  for (auto it = dev_ctxes.begin(); it != dev_ctxes.end(); ++it) {
+    it->second->Wait();
+  }
+}
+
+#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE
+// Gathers SelectedRows across local devices and across trainers.
+//
+// Steps, as implemented below:
+//   1. gather the local inputs into a temporary variable and merge them into
+//      this trainer's "<name>_merged_tmp@trainer_<id>" variable;
+//   2. register that variable with the collective RPC server;
+//   3. fetch the corresponding variable from every other endpoint;
+//   4. merge all of them into dst_selected_rows;
+//   5. erase the temporary variables from the scope.
+//
+// DevCtx is the concrete type expected for the device context of out_place;
+// DataType is the element type handed to MergeAdd. Fails with PADDLE_ENFORCE
+// when the context has another type, when trainer_id_ does not index
+// endpoints_, or when a gathered variable is missing from the scope.
+template <typename DevCtx, typename DataType>
+void ReduceOpHandle::GatherSelectedRows(
+    const std::vector<const SelectedRows *> &src_selected_rows,
+    const std::vector<platform::Place> &in_places,
+    const std::map<platform::Place, platform::DeviceContext *> &dev_ctxes,
+    VarHandle *out_var_handle, const platform::Place &out_place,
+    SelectedRows *dst_selected_rows) {
+  const CollectiveContext &collective_context =
+      *CollectiveContext::GetInstance();
+  // trainer_id_ is used as an index into endpoints_ below.
+  PADDLE_ENFORCE(collective_context.trainer_id_ >= 0 &&
+                     static_cast<size_t>(collective_context.trainer_id_) <
+                         collective_context.endpoints_.size(),
+                 "trainer_id %d is out of the range of endpoints",
+                 collective_context.trainer_id_);
+
+  // 1. gather local selected rows, merge them
+  std::string gathered_var_name = out_var_handle->name_ + "_gathered_tmp";
+  auto scope = local_scopes_.at(out_var_handle->scope_idx_);
+  auto gathered_var_mid = scope->Var(gathered_var_name);
+  auto gathered_select_rows =
+      gathered_var_mid->GetMutable<framework::SelectedRows>();
+  GatherLocalSelectedRows(src_selected_rows, in_places, dev_ctxes, out_place,
+                          gathered_select_rows);
+  // FIXME(gongwb): remove this Wait.
+  Wait(dev_ctxes);
+
+  // merge them
+  auto merged_dev_ctx = dynamic_cast<DevCtx *>(dev_ctxes.at(out_place));
+  // dynamic_cast yields nullptr when the context is not a DevCtx.
+  PADDLE_ENFORCE(merged_dev_ctx != nullptr,
+                 "Device context of out_place has an unexpected type");
+  std::string merged_var_name =
+      GetRemoteVarName(out_var_handle->name_, collective_context.trainer_id_);
+  auto merged_select_rows =
+      scope->Var(merged_var_name)->GetMutable<SelectedRows>();
+  operators::math::scatter::MergeAdd<DevCtx, DataType> merge_func;
+  merge_func(*merged_dev_ctx, *gathered_select_rows, merged_select_rows);
+
+  // 2. start collective server if it doesn't exist
+  operators::distributed::CollectiveServer *server =
+      operators::distributed::CollectiveServer::GetInstance(
+          collective_context.endpoints_[collective_context.trainer_id_],
+          collective_context.endpoints_.size() - 1);
+
+  auto rpc_server = server->GetRPCServer();
+  rpc_server->RegisterVar(merged_var_name,
+                          operators::distributed::kRequestGetMonomerVariable,
+                          scope, merged_dev_ctx);
+
+  // 3. gather them from all remote nodes.
+  std::vector<const SelectedRows *> remote;
+  operators::distributed::CollectiveClient *client =
+      operators::distributed::CollectiveClient::GetInstance();
+
+  // One RemoteVar per endpoint other than this trainer's own.
+  std::vector<operators::distributed::RemoteVar> vars;
+  for (unsigned int i = 0; i < collective_context.endpoints_.size(); i++) {
+    if (i == static_cast<unsigned int>(collective_context.trainer_id_)) {
+      continue;
+    }
+
+    operators::distributed::RemoteVar var;
+    var.trainer_id_ = i;
+    var.var_name_ = GetRemoteVarName(out_var_handle->name_, i);
+    var.ep_ = collective_context.endpoints_[i];
+
+    vars.push_back(var);
+    VLOG(4) << "gather from:" << var.String();
+  }
+
+  // erase gathered vars
+  merged_dev_ctx->Wait();
+  scope->EraseVars(std::vector<std::string>{gathered_var_name});
+
+  PADDLE_ENFORCE(client->Gather(vars, &remote, *merged_dev_ctx, scope));
+  PADDLE_ENFORCE(remote.size() == vars.size());
+
+  // 4. merged local selected rows.
+  std::vector<const SelectedRows *> all;
+  all.resize(collective_context.endpoints_.size());
+  for (const auto &v : vars) {
+    auto *remote_var = scope->FindVar(v.var_name_);
+    PADDLE_ENFORCE(remote_var != nullptr, "Cannot find gathered variable %s",
+                   v.var_name_);
+    all[v.trainer_id_] = remote_var->GetMutable<SelectedRows>();
+  }
+  all[collective_context.trainer_id_] = merged_select_rows;
+
+  merge_func(*merged_dev_ctx, all, dst_selected_rows);
+
+  rpc_server->WaitVarBarrier(merged_var_name);
+  rpc_server->ClearVar(merged_var_name);
+
+  // 5. clear mid vars
+  std::vector<std::string> tmp_vars{merged_var_name};
+  for (const auto &r : vars) {
+    tmp_vars.push_back(r.var_name_);
+  }
+  scope->EraseVars(tmp_vars);
+}
+#endif
+
 void ReduceOpHandle::RunImpl() {
  platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second);

@@ -90,8 +202,36 @@ void ReduceOpHandle::RunImpl() {
    this->RunAndRecordEvent([&] {
      std::vector<const SelectedRows *> in_selected_rows =
          GetInputValues<SelectedRows>(in_var_handles, var_scopes);
-      GatherSelectedRows(in_selected_rows, in_places, dev_ctxes_, t_out_p,
-                         out_var->GetMutable<framework::SelectedRows>());
+
+      const CollectiveContext &collective_context =
+          *CollectiveContext::GetInstance();
+      VLOG(10) << "GatherSelectedRows CollectiveContext:"
+               << collective_context.String();
+
+      // TODO(gongwb): add cpu support
+      if (collective_context.endpoints_.size() <= 1 ||
+          is_cpu_place(in_places[0]) || is_cpu_place(t_out_p)) {
+        GatherLocalSelectedRows(in_selected_rows, in_places, dev_ctxes_,
+                                t_out_p,
+                                out_var->GetMutable<framework::SelectedRows>());
+        return;
+      }
+
+#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE
+      if (framework::IsType<const float>(in_selected_rows[0]->value().type())) {
+        GatherSelectedRows<platform::CUDADeviceContext, float>(
+            in_selected_rows, in_places, dev_ctxes_, out_var_handle, t_out_p,
+            out_var->GetMutable<framework::SelectedRows>());
+      } else if (framework::IsType<const double>(
+                     in_selected_rows[0]->value().type())) {
+        GatherSelectedRows<platform::CUDADeviceContext, double>(
+            in_selected_rows, in_places, dev_ctxes_, out_var_handle, t_out_p,
+            out_var->GetMutable<framework::SelectedRows>());
+      } else {
+        PADDLE_ENFORCE(false,
+                       "only support double or float when gahter SelectedRows");
+      }
+#endif
    });
  } else {
    std::vector<const LoDTensor *> lod_tensors =

--- a/paddle/fluid/framework/details/reduce_op_handle.h
+++ b/paddle/fluid/framework/details/reduce_op_handle.h
@@ -30,6 +30,32 @@
 namespace paddle {
 namespace framework {
 namespace details {
+// Process-wide description of the collective setup: the list of endpoints and
+// this process's trainer id. Obtain the shared instance via GetInstance().
+struct CollectiveContext {
+  std::vector<std::string> endpoints_;
+  int trainer_id_{0};
+
+  // Renders the context as "endpoints_:<e0>,<e1>,...,trainer_id_:<id>".
+  std::string String() const {
+    std::stringstream out;
+    out << "endpoints_:";
+    for (const auto &endpoint : endpoints_) {
+      out << endpoint << ",";
+    }
+    out << "trainer_id_:" << trainer_id_;
+    return out.str();
+  }
+
+  // Returns the singleton, default-constructing it on the first call.
+  static CollectiveContext *GetInstance() {
+    std::call_once(init_flag_,
+                   [] { context_.reset(new CollectiveContext()); });
+    return context_.get();
+  }
+
+ private:
+  static std::once_flag init_flag_;
+  static std::unique_ptr<CollectiveContext> context_;
+};

 struct ReduceOpHandle : public OpHandleBase {
  std::vector<Scope *> local_scopes_;
@@ -64,6 +90,19 @@ struct ReduceOpHandle : public OpHandleBase {
 protected:
  void RunImpl() override;

+#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE
+  template <typename DevCtx, typename DataType>
+  void GatherSelectedRows(
+      const std::vector<const SelectedRows *> &src_selecte_rows_,
+      const std::vector<platform::Place> &in_places,
+      const std::map<platform::Place, platform::DeviceContext *> &dev_ctxes,
+      VarHandle *out_var_handle, const platform::Place &out_place,
+      SelectedRows *dst_selecte_rows);
+#endif
+
+  void Wait(
+      const std::map<platform::Place, platform::DeviceContext *> &dev_ctxes);
+
  template <typename T>
  std::vector<const T *> GetInputValues(
      const std::vector<VarHandle *> &in_var_handles,

--- a/paddle/fluid/framework/details/reference_count_op_handle.h
+++ b/paddle/fluid/framework/details/reference_count_op_handle.h
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <atomic>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "paddle/fluid/framework/details/op_handle_base.h"
-#include "paddle/fluid/framework/garbage_collector.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/framework/tensor.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-using ReferenceCountMap = std::unordered_map<std::string, int>;
-using AtomicReferenceCountMap =
-    std::unordered_map<std::string, std::atomic<int>>;
-using DeviceReferenceCountMap =
-    std::unordered_map<int, std::unique_ptr<ReferenceCountMap>>;
-using AtomicDeviceReferenceCountMap =
-    std::unordered_map<int, std::unique_ptr<AtomicReferenceCountMap>>;
-using DeviceGarbageCollectorMap =
-    std::unordered_map<int,
-                       std::unique_ptr<GarbageCollector<framework::Tensor>>>;
-
-class ReferenceCountOpHandle : public OpHandleBase {
- public:
-  ReferenceCountOpHandle(ir::Node *node, const Scope *scope,
-                         const platform::CUDAPlace &place,
-                         const std::vector<std::string> &var_names,
-                         GarbageCollector<Tensor> *gc,
-                         AtomicReferenceCountMap *ref_cnts)
-      : OpHandleBase(node), scope_(scope), gc_(gc), ref_cnts_(ref_cnts) {
-    dev_ctx_ = static_cast<platform::CUDADeviceContext *>(
-        platform::DeviceContextPool::Instance().Get(place));
-    if (IsStreamGarabageCollector()) {
-      platform::SetDeviceId(place.device);
-      PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
-    }
-
-    for (auto &name : var_names) AddVar(name);
-  }
-
-  ~ReferenceCountOpHandle() {
-    if (IsStreamGarabageCollector()) {
-      auto gpu_place = boost::get<platform::CUDAPlace>(dev_ctx_->GetPlace());
-      platform::SetDeviceId(gpu_place.device);
-      PADDLE_ENFORCE(cudaEventDestroy(event_));
-    }
-  }
-
-  std::string Name() const override { return "reference_count"; }
-
-  void AddVar(const std::string &name) {
-    auto it = var_names_.find(name);
-    if (it != var_names_.end())
-      ++(it->second);
-    else
-      var_names_[name] = 1;
-  }
-
- protected:
-  void RunImpl() override {
-    auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
-    std::vector<Tensor *> tensors;
-    for (auto &pair : var_names_) {
-      auto &name = pair.first;
-      auto it = ref_cnts_->find(name);
-      if (it == ref_cnts_->end()) continue;
-
-      auto *var = exec_scope->FindVar(name);
-      if (var == nullptr) continue;
-
-      if (var->IsType<LoDTensor>()) {
-        if (it->second.fetch_sub(pair.second) <= pair.second) {
-          tensors.emplace_back(var->GetMutable<LoDTensor>());
-        }
-      } else if (var->IsType<SelectedRows>()) {
-        if (it->second.fetch_sub(pair.second) <= pair.second) {
-          tensors.emplace_back(
-              var->GetMutable<SelectedRows>()->mutable_value());
-        }
-      }
-    }
-
-    if (!tensors.empty()) {
-      ClearTensors(tensors);
-    }
-  }
-
- private:
-  void ClearTensors(const std::vector<Tensor *> &tensors) {
-    auto *gc = dynamic_cast<StreamGarbageCollector<Tensor> *>(gc_);
-    if (gc != nullptr) {
-      auto compute_stream = dev_ctx_->stream();
-      auto callback_stream = gc->stream();
-      auto callback_func = [=]() {
-        PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream));
-        PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0));
-      };
-      gc_->Add(tensors, callback_func);
-    } else {
-      gc_->Add(tensors);
-    }
-  }
-
-  bool IsStreamGarabageCollector() const {
-    return dynamic_cast<const StreamGarbageCollector<Tensor> *>(gc_) != nullptr;
-  }
-
-  const Scope *scope_;
-  platform::CUDADeviceContext *dev_ctx_;
-  std::unordered_map<std::string, int> var_names_;
-  GarbageCollector<Tensor> *gc_;       // not own
-  AtomicReferenceCountMap *ref_cnts_;  // not own
-  cudaEvent_t event_;
-};
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
--- a/paddle/fluid/framework/details/reference_count_pass.cc
+++ b/paddle/fluid/framework/details/reference_count_pass.cc
@@ -14,187 +14,240 @@

 #include <queue>
 #include <string>
+#include <type_traits>
 #include <vector>

 #include "paddle/fluid/framework/details/computation_op_handle.h"
+#include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
+#include "paddle/fluid/framework/details/op_graph_view.h"
 #include "paddle/fluid/framework/details/reference_count_pass.h"
+#include "paddle/fluid/framework/details/reference_count_pass_helper.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"

 namespace paddle {
 namespace framework {
 namespace details {

-static ComputationOpHandle *FindNextComputationOpHandle(VarHandle *var_in) {
-  std::queue<VarHandleBase *> queue;
-  queue.push(var_in);
-  do {
-    auto *var = queue.front();
-    queue.pop();
-    for (auto *op : var->PendingOps()) {
-      auto *compute_op = dynamic_cast<ComputationOpHandle *>(op);
-      if (compute_op != nullptr && compute_op->GetPlace() == var_in->place_) {
-        return compute_op;
+// A functor that shrinks a set of ops to those that are not "before" any
+// other op of the set, i.e. it drops every op from which another op of the
+// set is reached when visiting its pending ops in the graph.
+class ShrinkDepsOpFunctor {
+ private:
+  // Relation of ops[i] to ops[j], as stored in rels[i][j]:
+  //   kSame   - initial value; after GetRelations() only on the diagonal
+  //   kNoDeps - neither op was reached from the other
+  //   kBefore - ops[j] was reached while visiting the pending ops of ops[i]
+  //   kAfter  - the converse of kBefore
+  enum RelationShip { kSame = 0, kNoDeps = 1, kBefore = 2, kAfter = 3 };
+
+ public:
+  // Builds an OpGraphView over all_ops; relations are looked up in that view.
+  explicit ShrinkDepsOpFunctor(const std::vector<OpHandleBase *> &all_ops)
+      : graph_(all_ops) {}
+
+  // Returns the ops of op_set that are not kBefore any other op of op_set.
+  // OpSet must be a set-like container of pointers to OpHandleBase or to a
+  // type derived from it. Sets with at most one element are returned as is.
+  template <typename OpSet>
+  OpSet operator()(const OpSet &op_set) const {
+    using KeyType = typename OpSet::key_type;
+    static_assert(
+        std::is_base_of<OpHandleBase,
+                        typename std::remove_pointer<KeyType>::type>::value,
+        "Key type of OpSet must be OpHandleBase, or derived of OpHandleBase");
+
+    if (op_set.size() <= 1) return op_set;
+    std::vector<OpHandleBase *> ops(op_set.begin(), op_set.end());
+    OpSet ret;
+    auto rels = GetRelations(ops);
+    auto not_before = [](RelationShip r) { return r != kBefore; };
+    for (size_t i = 0; i < rels.size(); ++i) {
+      // Keep ops[i] only if it is not before any op of the set.
+      if (std::all_of(rels[i].begin(), rels[i].end(), not_before)) {
+        ret.emplace(static_cast<KeyType>(ops[i]));
       }
-      for (auto *out_var : op->Outputs()) {
-        queue.push(out_var);
+    }
+    return ret;
+  }
+
+ private:
+  // Computes the pairwise relation matrix of `ops`. Enforces that every op
+  // exists in the graph and that `ops` contains no duplicates.
+  std::vector<std::vector<RelationShip>> GetRelations(
+      const std::vector<OpHandleBase *> &ops) const {
+    std::unordered_map<OpHandleBase *, size_t> op_to_idx;
+    for (size_t i = 0; i < ops.size(); ++i) {
+      PADDLE_ENFORCE(graph_.HasOp(ops[i]), "Op does not exist in graph");
+      op_to_idx[ops[i]] = i;
+    }
+
+    PADDLE_ENFORCE(op_to_idx.size() == ops.size(), "Duplicate ops");
+
+    std::vector<std::vector<RelationShip>> ret(ops.size());
+    for (auto &e : ret) {
+      e.assign(ops.size(), kSame);
+    }
+
+    // found_num counts the matrix entries that are already settled; the
+    // diagonal (ops.size() entries) is settled from the start.
+    size_t found_num = ops.size();
+    size_t total_num = ops.size() * ops.size();
+    // Called for each op reached from ops[i]; returns false once the whole
+    // matrix is settled so that the traversal can stop early.
+    auto visitor = [&](OpHandleBase *op, size_t i) {
+      auto it = op_to_idx.find(op);
+      if (it != op_to_idx.end()) {
+        size_t j = it->second;
+        if (i != j && ret[i][j] == kSame) {
+          ret[i][j] = kBefore;
+          ret[j][i] = kAfter;
+          found_num += 2;
+          if (found_num == total_num) {
+            return false;
+          }
+        }
+      }
+      return true;
+    };
+
+    for (size_t i = 0; i < ops.size(); ++i) {
+      auto sub_visitor = [&, i](OpHandleBase *op) { return visitor(op, i); };
+      if (!graph_.VisitAllPendingOps(ops[i], sub_visitor)) {
+        break;
+      }
+    }
+
+    // Every off-diagonal pair that is still kSame has no dependency.
+    for (size_t i = 0; i < ops.size(); ++i) {
+      for (size_t j = i + 1; j < ops.size(); ++j) {
+        if (ret[i][j] != kSame) continue;
+        ret[i][j] = kNoDeps;
+        ret[j][i] = kNoDeps;
+      }
+    }
+
+    return ret;
+  }
+
+  const OpGraphView graph_;
+};
+
+/**
+ * Find the nearest downstream computation op handle whose scope index is
+ * scope_idx, searching breadth-first through the pending ops of each op's
+ * outputs. If the op itself is such a computation op, just return itself.
+ * Returns nullptr when no such op is reachable.
+ */
+static ComputationOpHandle *FindNextComputationOpHandleOrReturnItself(
+    OpHandleBase *op, size_t scope_idx) {
+  std::queue<OpHandleBase *> q;
+  std::unordered_set<OpHandleBase *> visited;
+  q.push(op);
+  do {
+    auto *cur_op = q.front();
+    q.pop();
+    auto *compute_op = dynamic_cast<ComputationOpHandle *>(cur_op);
+    if (compute_op != nullptr && compute_op->GetScopeIdx() == scope_idx) {
+      return compute_op;
+    }
+    for (auto *out_var : cur_op->Outputs()) {
+      for (auto *pending_op : out_var->PendingOps()) {
+        // insert() reports whether pending_op is seen for the first time.
+        if (!visited.insert(pending_op).second) continue;
+        // Enqueue it; without this push only `op` itself is ever examined.
+        q.push(pending_op);
       }
     }
-  } while (!queue.empty());
+  } while (!q.empty());
   return nullptr;
 }

-static void AddDependencyBetween(OpHandleBase *in, OpHandleBase *out,
-                                 ir::Graph *graph) {
-  auto it = std::find_if(
-      in->Outputs().begin(), in->Outputs().end(), [](VarHandleBase *var) {
-        return dynamic_cast<DummyVarHandle *>(var) != nullptr;
-      });
-
-  if (it != in->Outputs().end()) {
-    out->AddInput(*it);
-  } else {
-    auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar());
-    graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
-    in->AddOutput(dep_var);
-    out->AddInput(dep_var);
+// Determines the computation ops (with scope index scope_idx) after which
+// `var` is no longer needed. On success *ok is set to true and the shrunk op
+// set is returned; on failure *ok is set to false and an empty set is
+// returned.
+static std::unordered_set<ComputationOpHandle *>
+ExtractComputationOpFromLastLivedVar(VarHandle *var, size_t scope_idx,
+                                     const ShrinkDepsOpFunctor &shrink_func,
+                                     bool *ok) {
+  // Stage one: collect the candidate last ops of the variable.
+  std::unordered_set<OpHandleBase *> candidates;
+  {
+    if (var->PendingOps().empty() && var->GeneratedOp()) {
+      // No operator depends on this variable, so the last operator is the op
+      // that generates it.
+      candidates.emplace(var->GeneratedOp());
+    } else {
+      candidates = var->PendingOps();
+    }
+
+    // Neither a pending op nor a generating op exists: give up.
+    if (candidates.empty()) {
+      *ok = false;
+      return {};
+    }
+  }
+
+  // Stage two: map every candidate to a computation op.
+  // Returns with *ok = false when that fails for any candidate.
+  //
+  // The reason why an arbitrary op handle cannot serve as the last-lived op:
+  //    some op handles operate on several DeviceContexts, but the garbage
+  //    collector can currently wait on only one DeviceContext. So for now we
+  //    wait on the nearest computation op instead.
+  std::unordered_set<ComputationOpHandle *> computation_op;
+  {
+    for (auto *op : candidates) {
+      auto *compute_op =
+          FindNextComputationOpHandleOrReturnItself(op, scope_idx);
+      if (compute_op == nullptr) {
+        *ok = false;
+        return {};
+      }
+      computation_op.emplace(compute_op);
+    }
   }
+
+  // Stage three: shrink the set if its ops depend on each other, dropping
+  // every op that is before another op of the set.
+  *ok = true;
+  return shrink_func(computation_op);
+}
+
+// Scans `vars` from back to front and returns the first non-null VarDesc
+// found on a handle's node, or nullptr when there is none.
+static VarDesc *TryGetLatestVarDesc(const std::vector<VarHandle *> &vars) {
+  for (auto it = vars.rbegin(); it != vars.rend(); ++it) {
+    VarDesc *desc = (*it)->Node()->Var();
+    if (desc != nullptr) return desc;
+  }
+  return nullptr;
 }

 std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
-  auto &ref_cnts = Get<DeviceReferenceCountMap>(kGlobalReferenceCount);
-  auto &cur_ref_cnts = Get<AtomicDeviceReferenceCountMap>(kCurReferenceCount);
-  auto &gcs = Get<DeviceGarbageCollectorMap>(kGarbageCollector);
-
-  // It is not easy to find the right reference counts of varaibles in graph
-  // Step 1: Find all variables in computation ops
-  // Step 2: Find all variables in non-computation ops which refers to variables
-  // in computation ops
-  std::unordered_set<std::string> names;
-  std::unordered_map<OpHandleBase *, ReferenceCountOpHandle *>
-      compute_ref_cnt_map;
-
-  auto get_ref_cnts_from_compute_op = [&](
-      OpHandleBase *op, const std::vector<VarHandleBase *> &vars) {
-    std::vector<std::string> var_names_in_op;
-    auto *compute_op = dynamic_cast<ComputationOpHandle *>(op);
-    if (compute_op == nullptr ||
-        !platform::is_gpu_place(compute_op->GetPlace()))
-      return var_names_in_op;
-    auto place = boost::get<platform::CUDAPlace>(compute_op->GetPlace());
-    for (VarHandleBase *var_handle_base : vars) {
-      auto *var_handle = dynamic_cast<VarHandle *>(var_handle_base);
-      if (var_handle == nullptr || !var_handle->Node()->IsVar()) continue;
-
-      if (!platform::is_gpu_place(var_handle->place_) ||
-          boost::get<platform::CUDAPlace>(var_handle->place_) != place)
-        continue;
+  auto &ref_cnts = Get<std::vector<ReferenceCountMap>>(kGlobalReferenceCount);
+  auto &last_live_ops_of_vars =
+      Get<std::vector<LastLiveOpsOfVars>>(kLastLiveOpsOfVars);
+
+  PADDLE_ENFORCE(last_live_ops_of_vars.empty() && ref_cnts.empty(),
+                 "Last Live Ops and Reference Counts of vars should be "
+                 "initialized at here.");

-      VarDesc *var_desc = var_handle->Node()->Var();
-      auto var_name = var_handle->Node()->Name();
+  const auto &vars = graph->Get<GraphVars>(kGraphVars);

-      // This is weird but there is really some variables without var_desc
-      // in computation_op
-      if (var_desc == nullptr) {
-        var_desc = compute_op->Node()->Op()->Block()->FindVar(var_name);
-        if (var_desc == nullptr) continue;
+  last_live_ops_of_vars.resize(vars.size());
+  ref_cnts.resize(vars.size());
+
+  ShrinkDepsOpFunctor shrink_func(
+      ir::FilterByNodeWrapper<OpHandleBase>(*graph));
+
+  for (size_t i = 0; i < vars.size(); ++i) {
+    for (auto &name_var_pair : vars[i]) {
+      // Whether this variable can be reused or deleted? If not, we do not
+      // compute reference counts and dependencies.
+      VarDesc *var_desc = TryGetLatestVarDesc(name_var_pair.second);
+
+      if (var_desc == nullptr || var_desc->Persistable()) {
+        continue;
      }

-      if (var_desc->Persistable()) continue;
      auto var_type = var_desc->Proto()->type().type();
      if (var_type != proto::VarType::LOD_TENSOR &&
-          var_type != proto::VarType::SELECTED_ROWS) {
+          var_type != proto::VarType::SELECTED_ROWS &&
+          var_type != proto::VarType::LOD_TENSOR_ARRAY) {
+        // Var type cannot be deleted
        continue;
      }

-      // compute op only runs in one device
-      if (ref_cnts[place.device]->count(var_name))
-        ++(*ref_cnts[place.device])[var_name];
-      else
-        (*ref_cnts[place.device])[var_name] = 1;
+      bool ok;
+      auto result = ExtractComputationOpFromLastLivedVar(
+          name_var_pair.second.back(), i, shrink_func, &ok);

-      names.insert(var_name);
-      var_names_in_op.push_back(var_name);
-    }
-    return var_names_in_op;
-  };
-
-  auto update_ref_cnts_from_non_compute_op = [&](
-      OpHandleBase *op, const std::vector<VarHandleBase *> &vars) {
-    if (dynamic_cast<ComputationOpHandle *>(op) != nullptr) return;
-    for (VarHandleBase *var_handle_base : vars) {
-      auto *var_handle = dynamic_cast<VarHandle *>(var_handle_base);
-      if (var_handle == nullptr || !var_handle->Node()->IsVar()) continue;
-
-      auto var_name = var_handle->Node()->Name();
-      auto var_place = var_handle->place_;
-      if (!platform::is_gpu_place(var_place)) continue;
-      auto place = boost::get<platform::CUDAPlace>(var_place);
-      if (names.count(var_name) == 0) continue;
-      if (ref_cnts.count(place.device) &&
-          ref_cnts[place.device]->count(var_name)) {
-        ++(*ref_cnts[place.device])[var_name];
-
-        auto *next_compute_op = FindNextComputationOpHandle(var_handle);
-        if (next_compute_op != nullptr) {
-          if (compute_ref_cnt_map.count(next_compute_op)) {
-            compute_ref_cnt_map[next_compute_op]->AddVar(var_name);
-            VLOG(5) << "Add reference count of " << var_name << " to Operator "
-                    << next_compute_op->Name();
-          } else {
-            // Create new reference_count_op_handle
-            ir::Node *ref_cnt_node = graph->CreateEmptyNode(
-                "reference_count", ir::Node::Type::kOperation);
-            auto *ref_cnt_handle = new ReferenceCountOpHandle(
-                ref_cnt_node, next_compute_op->GetScope(), place, {var_name},
-                gcs[place.device].get(), cur_ref_cnts[place.device].get());
-            AddDependencyBetween(next_compute_op, ref_cnt_handle, graph.get());
-            compute_ref_cnt_map[next_compute_op] = ref_cnt_handle;
-          }
-        }
+      if (ok) {
+        auto &var_name = name_var_pair.first;
+        PADDLE_ENFORCE(!result.empty(), "Last living ops of %s cannot be empty",
+                       var_name);
+        ref_cnts[i].emplace(var_name, result.size());
+        last_live_ops_of_vars[i].emplace(var_name, std::move(result));
      }
    }
-  };
-
-  auto all_ops = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
-  for (auto &op : all_ops) {
-    auto in_var_names = get_ref_cnts_from_compute_op(op, op->Inputs());
-    auto out_var_names = get_ref_cnts_from_compute_op(op, op->Outputs());
-    if (in_var_names.empty() && out_var_names.empty()) continue;
-    in_var_names.insert(in_var_names.end(), out_var_names.begin(),
-                        out_var_names.end());
-    auto *compute_op = dynamic_cast<ComputationOpHandle *>(op);
-    auto place = boost::get<platform::CUDAPlace>(compute_op->GetPlace());
-    ir::Node *ref_cnt_node =
-        graph->CreateEmptyNode("reference_count", ir::Node::Type::kOperation);
-    auto *ref_cnt_handle = new ReferenceCountOpHandle(
-        ref_cnt_node, compute_op->GetScope(), place, in_var_names,
-        gcs[place.device].get(), cur_ref_cnts[place.device].get());
-    AddDependencyBetween(compute_op, ref_cnt_handle, graph.get());
-    compute_ref_cnt_map[compute_op] = ref_cnt_handle;
-  }
-
-  for (auto &op : all_ops) {
-    update_ref_cnts_from_non_compute_op(op, op->Inputs());
-    update_ref_cnts_from_non_compute_op(op, op->Outputs());
-  }
-
-  std::vector<OpHandleBase *> new_all_ops;
-  new_all_ops.reserve(compute_ref_cnt_map.size() + all_ops.size());
-  for (auto &op : all_ops) {
-    new_all_ops.emplace_back(std::move(op));
-    auto it = compute_ref_cnt_map.find(new_all_ops.back());
-    if (it != compute_ref_cnt_map.end()) {
-      // Add LeafNode to ReferenceCountOpHandle
-      auto *dummy_leaf = new DummyVarHandle(graph->CreateControlDepVar());
-      graph->Get<GraphDepVars>(kGraphDepVars).emplace(dummy_leaf);
-      it->second->AddOutput(dummy_leaf);
-      new_all_ops.emplace_back(std::move(it->second));
-    }
  }

-  all_ops.swap(new_all_ops);
  return graph;
 }

@@ -205,5 +258,4 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
 REGISTER_PASS(reference_count_pass,
              paddle::framework::details::ReferenceCountPass)
    .RequirePassAttr(paddle::framework::details::kGlobalReferenceCount)
-    .RequirePassAttr(paddle::framework::details::kCurReferenceCount)
-    .RequirePassAttr(paddle::framework::details::kGarbageCollector);
+    .RequirePassAttr(paddle::framework::details::kLastLiveOpsOfVars);
--- a/paddle/fluid/framework/details/reference_count_pass.h
+++ b/paddle/fluid/framework/details/reference_count_pass.h
@@ -14,7 +14,6 @@

 #pragma once

-#include "paddle/fluid/framework/details/reference_count_op_handle.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/pass.h"

@@ -22,10 +21,6 @@ namespace paddle {
 namespace framework {
 namespace details {

-constexpr char kGlobalReferenceCount[] = "reference_count";
-constexpr char kCurReferenceCount[] = "current_reference_count";
-constexpr char kGarbageCollector[] = "garbage_collector";
-
 class ReferenceCountPass : public ir::Pass {
 protected:
  std::unique_ptr<ir::Graph> ApplyImpl(

--- a/paddle/fluid/framework/details/reference_count_pass_helper.cc
+++ b/paddle/fluid/framework/details/reference_count_pass_helper.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/reference_count_pass_helper.h"
+
+namespace paddle {
+namespace framework {
+namespace details {}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/reference_count_pass_helper.h
+++ b/paddle/fluid/framework/details/reference_count_pass_helper.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <atomic>
+#include <map>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "paddle/fluid/framework/garbage_collector.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+class ComputationOpHandle;
+
+// Variable name -> reference count. ReferenceCountPass fills one such map per
+// entry of the graph's variable list.
+using ReferenceCountMap = std::unordered_map<std::string, size_t>;
+
+// Same key/value shape as ReferenceCountMap, but every counter is a
+// std::atomic<size_t>.
+using AtomicReferenceCountMap =
+    std::unordered_map<std::string, std::atomic<size_t>>;
+
+// Owns one GarbageCollector per platform::Place.
+using GarbageCollectorMap =
+    std::map<platform::Place, std::unique_ptr<GarbageCollector>>;
+
+// Attribute names. ReferenceCountPass requires kGlobalReferenceCount (and
+// kLastLiveOpsOfVars below) as pass attributes.
+// NOTE(review): the consumers of the remaining keys are not visible from this
+// header -- confirm against the passes/executors that read them.
+const char kGlobalReferenceCount[] = "global_reference_count";
+const char kRuntimeReferenceCount[] = "runtime_reference_count";
+const char kGarbageCollector[] = "garbage_collector";
+const char kAllPlaces[] = "all_places";
+
+// Variable name -> set of computation ops recorded by ReferenceCountPass as
+// the last live ops of that variable. The size of each set is what the pass
+// stores as the variable's reference count.
+using LastLiveOpsOfVars =
+    std::unordered_map<std::string, std::unordered_set<ComputationOpHandle*>>;
+const char kLastLiveOpsOfVars[] = "last_live_ops_of_var";
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
@@ -16,11 +16,8 @@
 #include <stdexcept>
 #include <string>
 #include <vector>
-#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/variable_helper.h"
 #include "paddle/fluid/platform/profiler.h"
-#ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/framework/details/reference_count_op_handle.h"
-#endif

 namespace paddle {
 namespace framework {
@@ -69,27 +66,12 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
  platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr);
  drop_scope_counter_ += 1;

-#ifdef PADDLE_WITH_CUDA
-  const std::string gc_name = "garbage_collector";
-  DeviceGarbageCollectorMap *gc =
-      Graph().Has(gc_name) ? &(Graph().Get<DeviceGarbageCollectorMap>(gc_name))
-                           : nullptr;
-#endif
-
  if (!fetch_tensors.empty() ||
      drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
    drop_scope_counter_ = 0;
    // Wait All computational streams
    for (auto p : places_) {
      platform::DeviceContextPool::Instance().Get(p)->Wait();
-#ifdef PADDLE_WITH_CUDA
-      if (gc != nullptr && platform::is_gpu_place(p)) {
-        auto gpu_place = boost::get<platform::CUDAPlace>(p);
-        auto &gc_at_place = gc->at(gpu_place.device);
-        gc_at_place->Wait();
-        gc_at_place->Reset();
-      }
-#endif
    }
    for (auto &scope : local_scopes_) {
      auto &local_scope =

--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -13,18 +13,23 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/framework/executor.h"
+#include <deque>

 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/framework/lod_rank_table.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
-#include "paddle/fluid/framework/ngraph_operator.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/reader.h"
 #include "paddle/fluid/framework/transfer_scope_cache.h"
+#include "paddle/fluid/framework/variable_helper.h"
 #include "paddle/fluid/operators/detail/macros.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"

+#ifdef PADDLE_WITH_NGRAPH
+#include "paddle/fluid/framework/ngraph_operator.h"
+#endif
+
 DECLARE_bool(benchmark);
 DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run");
 DEFINE_bool(use_ngraph, false, "Use NGRAPH to run");
@@ -37,11 +42,43 @@ namespace {
 int kProgramId = -1;
 }  // namespace

+static std::unordered_map<std::string, size_t> GetNonPersistableReferenceCounts(
+    const BlockDesc& block, const std::vector<std::string>& skip_var_list) {
+  const std::unordered_set<std::string> excluded(skip_var_list.begin(),
+                                                 skip_var_list.end());
+  std::unordered_map<std::string, size_t> counts;
+  // Countable: not skipped, non-persistable, LoDTensor/SelectedRows/Array.
+  auto is_countable = [&](const std::string& name) -> bool {
+    if (excluded.count(name) != 0) return false;
+    auto* desc = block.FindVar(name);
+    if (desc == nullptr || desc->Persistable()) return false;
+    const auto type = desc->Proto()->type().type();
+    return type == proto::VarType::LOD_TENSOR ||
+           type == proto::VarType::SELECTED_ROWS ||
+           type == proto::VarType::LOD_TENSOR_ARRAY;
+  };
+  // Each occurrence of a countable name in any slot adds one reference.
+  auto accumulate = [&](const VariableNameMap& slots) {
+    for (auto& slot : slots) {
+      for (auto& name : slot.second) {
+        if (is_countable(name)) ++counts[name];
+      }
+    }
+  };
+  for (auto op_desc : block.AllOps()) {
+    accumulate(op_desc->Inputs());
+    accumulate(op_desc->Outputs());
+  }
+  return counts;
+}
+
 ExecutorPrepareContext::ExecutorPrepareContext(
-    const framework::ProgramDesc& prog, size_t block_id)
+    const framework::ProgramDesc& prog, size_t block_id,
+    const std::vector<std::string>& skip_ref_cnt_vars)
    : prog_(prog), block_id_(block_id) {
  if (GetEagerDeletionThreshold() >= 0) {
-    ref_cnts_ = GetNonPersistableReferenceCount<int>(prog_, block_id_);
+    global_ref_cnts_ = GetNonPersistableReferenceCounts(prog.Block(block_id),
+                                                        skip_ref_cnt_vars);
  }
 }

@@ -49,28 +86,40 @@ ExecutorPrepareContext::~ExecutorPrepareContext() {
  VLOG(5) << "destroy ExecutorPrepareContext";
 }

-template <typename RefCntMap>
-static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op,
-                                GarbageCollector<Tensor>* gc,
-                                RefCntMap* ref_cnts) {
-  std::unordered_set<Tensor*> erase_tensors;
+// Decrements the count in |ref_cnts| for every input/output name of |op|. For
+// each count that reaches zero and whose variable is found in |scope|, the
+// memory holder(s) are queued and passed to |gc|; unsupported types throw.
+static void DeleteUnusedTensors(
+    const Scope& scope, const OperatorBase* op, GarbageCollector* gc,
+    std::unordered_map<std::string, size_t>* ref_cnts) {
+  std::deque<std::shared_ptr<memory::Allocation>> garbages;

  auto handler = [&](const VariableNameMap& name_map) {
    for (auto& name_pair : name_map) {
      for (auto& name : name_pair.second) {
        auto it = ref_cnts->find(name);
        if (it == ref_cnts->end()) continue;
-        if ((it->second)-- == 1) {
-          auto* var = scope.FindVar(name);
-          if (var != nullptr) {
-            VLOG(10) << "Erase tensor \'" << name << "\'";
-            if (var->IsType<LoDTensor>()) {
-              erase_tensors.insert(var->GetMutable<LoDTensor>());
-            } else if (var->IsType<SelectedRows>()) {
-              erase_tensors.insert(
-                  var->GetMutable<SelectedRows>()->mutable_value());
-            }
+        if (--(it->second) != 0) continue;  // other counted uses remain
+        auto* var = scope.FindVar(name);
+        if (var == nullptr) continue;  // variable not found: nothing to free
+
+        // Last counted use: collect the memory holder(s) for |gc|.
+        VLOG(2) << "Erase variable " << name;
+        if (var->IsType<LoDTensor>()) {
+          garbages.emplace_back(
+              var->GetMutable<LoDTensor>()->MoveMemoryHolder());
+        } else if (var->IsType<SelectedRows>()) {
+          garbages.emplace_back(var->GetMutable<SelectedRows>()
+                                    ->mutable_value()
+                                    ->MoveMemoryHolder());
+        } else if (var->IsType<LoDTensorArray>()) {
+          auto* lod_tensor_arr = var->GetMutable<LoDTensorArray>();
+          for (auto& t : *lod_tensor_arr) {
+            garbages.emplace_back(t.MoveMemoryHolder());
          }
+        } else {
+          PADDLE_THROW("Type %s of %s is not supported eager deletion",
+                       var->Type().name(), name);
        }
      }
    }
@@ -79,19 +128,19 @@ static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op,
  handler(op->Inputs());
  handler(op->Outputs());

-  if (!erase_tensors.empty()) {
-    gc->Add(erase_tensors);
+  if (!garbages.empty()) {
+    gc->Add(std::move(garbages));
  }
 }

 static void EnableFusedOp(ExecutorPrepareContext* ctx) {
 #ifdef PADDLE_WITH_NGRAPH
  VLOG(3) << "use_ngraph=True";
-  auto intervals = FusedOperator::FusedOpIntervals(&ctx->ops_);
+  auto intervals = NgraphOperator::NgraphOpIntervals(&ctx->ops_);
  for (auto& interval : intervals) {
-    auto* fused_op = new FusedOperator(ctx->prog_, ctx->block_id_,
-                                       interval.at(0), interval.at(1));
-    *interval[0] = std::unique_ptr<OperatorBase>(fused_op);
+    auto* ng_op = new NgraphOperator(ctx->prog_, ctx->block_id_, interval.at(0),
+                                     interval.at(1));
+    *interval[0] = std::unique_ptr<OperatorBase>(ng_op);
  }
  for (auto it = intervals.rbegin(); it != intervals.rend(); ++it) {
    ctx->ops_.erase(it->at(0) + 1, it->at(1));
@@ -114,36 +163,6 @@ void Executor::Close() {
 #endif
 }

-void InitializeVariable(Variable* var, proto::VarType::Type var_type) {
-  if (var_type == proto::VarType::LOD_TENSOR) {
-    var->GetMutable<LoDTensor>();
-  } else if (var_type == proto::VarType::SELECTED_ROWS) {
-    var->GetMutable<SelectedRows>();
-  } else if (var_type == proto::VarType::FEED_MINIBATCH) {
-    var->GetMutable<FeedFetchList>();
-  } else if (var_type == proto::VarType::FETCH_LIST) {
-    var->GetMutable<FeedFetchList>();
-  } else if (var_type == proto::VarType::STEP_SCOPES) {
-    var->GetMutable<std::vector<framework::Scope*>>();
-  } else if (var_type == proto::VarType::LOD_RANK_TABLE) {
-    var->GetMutable<LoDRankTable>();
-  } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) {
-    var->GetMutable<LoDTensorArray>();
-  } else if (var_type == proto::VarType::PLACE_LIST) {
-    var->GetMutable<platform::PlaceList>();
-  } else if (var_type == proto::VarType::READER) {
-    var->GetMutable<ReaderHolder>();
-  } else if (var_type == proto::VarType::RAW) {
-    // GetMutable will be called in operator
-  } else {
-    PADDLE_THROW(
-        "Variable type %d is not in "
-        "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, "
-        "LOD_RANK_TABLE, PLACE_LIST, READER, RAW]",
-        var_type);
-  }
-}
-
 void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope,
                               int block_id) {
  auto& global_block = pdesc.Block(block_id);
@@ -351,9 +370,10 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
 }

 std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
-    const ProgramDesc& program, int block_id) {
+    const ProgramDesc& program, int block_id,
+    const std::vector<std::string>& skip_ref_cnt_vars) {
  std::unique_ptr<ExecutorPrepareContext> ctx(
-      new ExecutorPrepareContext(program, block_id));
+      new ExecutorPrepareContext(program, block_id, skip_ref_cnt_vars));
  PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), program.Size());
  auto& block = program.Block(block_id);
  for (auto& op_desc : block.AllOps()) {
@@ -364,16 +384,28 @@ std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
 }

 std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
-    const ProgramDesc& program, const std::vector<int>& block_ids) {
+    const ProgramDesc& program, const std::vector<int>& block_ids,
+    const std::vector<std::vector<std::string>>& skip_ref_cnt_vars) {
+  PADDLE_ENFORCE(
+      skip_ref_cnt_vars.empty() || skip_ref_cnt_vars.size() == block_ids.size(),
+      "skip_ref_cnt_vars should be either empty or equals to block number %d",
+      block_ids.size());
  std::vector<std::shared_ptr<ExecutorPrepareContext>> result;
+  size_t idx = 0;
  for (auto& bid : block_ids) {
-    auto* ctx = new ExecutorPrepareContext(program, bid);
+    ExecutorPrepareContext* ctx;
+    if (skip_ref_cnt_vars.empty()) {
+      ctx = new ExecutorPrepareContext(program, bid);
+    } else {
+      ctx = new ExecutorPrepareContext(program, bid, skip_ref_cnt_vars[idx]);
+    }
    PADDLE_ENFORCE_LT(static_cast<size_t>(bid), program.Size());
    auto& block = program.Block(bid);
    for (auto& op_desc : block.AllOps()) {
      ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
    }
    result.push_back(std::shared_ptr<ExecutorPrepareContext>(ctx));
+    ++idx;
  }
  return result;
 }
@@ -391,22 +423,23 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
  }

  int64_t max_memory_size = GetEagerDeletionThreshold();
-  std::unique_ptr<GarbageCollector<Tensor>> gc;
-  // WhileOp would set keep_kids to true,
-  // because WhileGradOp needs the scopes created in WhileOp.
-  // Perhaps, we should not perform eager deletion in WhileOp
-  // The scopes and variables created by WhileOp would be deleted
-  // in WhileGradOp.
+  std::unique_ptr<GarbageCollector> gc;
+  // skip while_op and while_grad_op temporarily
  if (max_memory_size >= 0 && !keep_kids) {
    ctx->ResetReferenceCount();
 #ifdef PADDLE_WITH_CUDA
    if (platform::is_gpu_place(place_)) {
-      gc.reset(new DefaultStreamGarbageCollector<Tensor>(
-          boost::get<platform::CUDAPlace>(place_), max_memory_size));
-    } else {
+      if (IsFastEagerDeletionModeEnabled()) {
+        gc.reset(new UnsafeFastGPUGarbageCollector(
+            boost::get<platform::CUDAPlace>(place_), max_memory_size));
+      } else {
+        gc.reset(new DefaultStreamGarbageCollector(
+            boost::get<platform::CUDAPlace>(place_), max_memory_size));
+      }
+    } else if (platform::is_cpu_place(place_)) {
 #endif
-      gc.reset(new CPUGarbageCollector<Tensor>(
-          boost::get<platform::CPUPlace>(place_), max_memory_size));
+      gc.reset(new CPUGarbageCollector(boost::get<platform::CPUPlace>(place_),
+                                       max_memory_size));
 #ifdef PADDLE_WITH_CUDA
    }
 #endif
@@ -415,17 +448,13 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
  for (auto& op : ctx->ops_) {
    op->Run(*local_scope, place_);

-    if (gc != nullptr) {
+    if (gc) {
      DeleteUnusedTensors(*local_scope, op.get(), gc.get(),
-                          &(ctx->cur_ref_cnts_));
+                          &(ctx->runtime_ref_cnts_));
    }
  }

-  if (gc != nullptr) {
-    gc->Wait();
-  } else {
-    platform::DeviceContextPool::Instance().Get(place_)->Wait();
-  }
+  platform::DeviceContextPool::Instance().Get(place_)->Wait();

  if (local_scope != scope) {
    scope->DeleteScope(local_scope);

--- a/paddle/fluid/framework/executor.h
+++ b/paddle/fluid/framework/executor.h
@@ -26,54 +26,22 @@ limitations under the License. */

 namespace paddle {
 namespace framework {
-extern void InitializeVariable(Variable* var, proto::VarType::Type var_type);
-
-template <typename T>
-std::unordered_map<std::string, T> GetNonPersistableReferenceCount(
-    const ProgramDesc& prog, size_t block_id) {
-  auto& block = prog.Block(block_id);
-  std::unordered_map<std::string, T> ref_cnts;
-
-  auto update_ref_cnts = [&](OpDesc* op_desc, const VariableNameMap& name_map) {
-    for (auto& name_pair : name_map) {
-      for (auto& name : name_pair.second) {
-        auto* var_desc = block.FindVar(name);
-        if (var_desc == nullptr || var_desc->Persistable()) continue;
-        auto type = var_desc->Proto()->type().type();
-        if (type != proto::VarType::LOD_TENSOR &&
-            type != proto::VarType::SELECTED_ROWS) {
-          continue;
-        }
-
-        auto it = ref_cnts.find(name);
-        if (it != ref_cnts.end()) {
-          ++it->second;
-        } else {
-          ref_cnts[name] = 1;
-        }
-      }
-    }
-  };
-
-  for (auto op_desc : block.AllOps()) {
-    update_ref_cnts(op_desc, op_desc->Inputs());
-    update_ref_cnts(op_desc, op_desc->Outputs());
-  }
-  return ref_cnts;
-}

 struct ExecutorPrepareContext {
-  ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id);
+  ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id,
+                         const std::vector<std::string>& skip_ref_cnt_vars =
+                             std::vector<std::string>());
+
  ~ExecutorPrepareContext();

-  void ResetReferenceCount() { cur_ref_cnts_ = ref_cnts_; }
+  void ResetReferenceCount() { runtime_ref_cnts_ = global_ref_cnts_; }

  const framework::ProgramDesc& prog_;
  size_t block_id_;
  std::vector<std::unique_ptr<OperatorBase>> ops_;

-  std::unordered_map<std::string, int> ref_cnts_;
-  std::unordered_map<std::string, int> cur_ref_cnts_;
+  std::unordered_map<std::string, size_t> global_ref_cnts_;
+  std::unordered_map<std::string, size_t> runtime_ref_cnts_;
 };

 class Executor {
@@ -109,10 +77,14 @@ class Executor {
           const std::string& fetch_holder_name = "fetch");

  static std::unique_ptr<ExecutorPrepareContext> Prepare(
-      const ProgramDesc& program, int block_id);
+      const ProgramDesc& program, int block_id,
+      const std::vector<std::string>& skip_ref_cnt_vars =
+          std::vector<std::string>());

  static std::vector<std::shared_ptr<ExecutorPrepareContext>> Prepare(
-      const ProgramDesc& program, const std::vector<int>& block_ids);
+      const ProgramDesc& program, const std::vector<int>& block_ids,
+      const std::vector<std::vector<std::string>>& skip_ref_cnt_vars =
+          std::vector<std::vector<std::string>>());

  void CreateVariables(const ProgramDesc& pdesc, Scope* scope, int block_id);


--- a/paddle/fluid/framework/executor_thread_worker.cc
+++ b/paddle/fluid/framework/executor_thread_worker.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/executor_thread_worker.h"
+#include "google/protobuf/io/zero_copy_stream_impl.h"
+#include "google/protobuf/message.h"
+#include "google/protobuf/text_format.h"
+
+#include "gflags/gflags.h"
+#include "paddle/fluid/framework/feed_fetch_method.h"
+#include "paddle/fluid/framework/feed_fetch_type.h"
+#include "paddle/fluid/framework/lod_rank_table.h"
+#include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/reader.h"
+#include "paddle/fluid/framework/variable_helper.h"
+#include "paddle/fluid/inference/io.h"
+#include "paddle/fluid/platform/cpu_helper.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/pybind/pybind.h"
+namespace paddle {
+namespace framework {
+
+// Builds one operator per OpDesc found in block 0 of `program` and records
+// each operator's type name at the matching index of op_names_.
+//
+// ops_ stores owning raw pointers, so operators left over from an earlier call
+// are destroyed before the lists are rebuilt. Without that step a second call
+// would leak the old operators and leave ops_ longer than op_names_.
+void ExecutorThreadWorker::CreateThreadOperators(const ProgramDesc& program) {
+  auto& block = program.Block(0);
+  for (OperatorBase* stale_op : ops_) {
+    delete stale_op;
+  }
+  ops_.clear();
+  op_names_.clear();
+  for (auto& op_desc : block.AllOps()) {
+    std::unique_ptr<OperatorBase> local_op = OpRegistry::CreateOp(*op_desc);
+    op_names_.push_back(op_desc->Type());
+    // Give up ownership only after push_back succeeded, so a failed
+    // allocation inside the vector cannot leak the operator.
+    ops_.push_back(local_op.get());
+    local_op.release();
+  }
+}
+
+// Prepares everything this worker needs for `program`: creates the thread
+// scope and its variables, instantiates the operators, stores a copy of the
+// program and remembers the execution place.
+// CreateThreadScope enforces that the root scope was set beforehand.
+void ExecutorThreadWorker::CreateThreadResource(
+    const framework::ProgramDesc& program,
+    const paddle::platform::Place& place) {
+  CreateThreadScope(program);
+  CreateThreadOperators(program);
+  SetMainProgram(program);
+  SetPlace(place);
+}
+
+// Creates a child scope of root_scope_ for this thread and initializes every
+// variable declared in block 0 of `program`: persistable variables are placed
+// in the root scope, all others in the new thread scope.
+void ExecutorThreadWorker::CreateThreadScope(const ProgramDesc& program) {
+  auto& block = program.Block(0);
+
+  PADDLE_ENFORCE_NOT_NULL(
+      root_scope_, "root_scope should be set before creating thread scope");
+
+  thread_scope_ = &root_scope_->NewScope();
+  for (auto& var_desc : block.AllVars()) {
+    // Pick the owning scope first; the initialization itself is identical.
+    Scope* owner = var_desc->Persistable() ? root_scope_ : thread_scope_;
+    InitializeVariable(owner->Var(var_desc->Name()), var_desc->GetType());
+  }
+}
+
+// Stores a shared reference to the data feed that TrainFiles and
+// BindingDataFeedMemory read from.
+void ExecutorThreadWorker::SetDataFeed(
+    const std::shared_ptr<DataFeed>& datafeed) {
+  thread_reader_ = datafeed;
+}
+
+// Registers, for every slot alias reported by the data feed, the variable of
+// the same name in the thread scope as that slot's feed target.
+// Requires the data feed and the thread scope to be set already.
+void ExecutorThreadWorker::BindingDataFeedMemory() {
+  const std::vector<std::string>& input_feed =
+      thread_reader_->GetUseSlotAlias();
+  // Iterate by const reference: the names are only read, so copying each
+  // string per iteration is unnecessary.
+  for (const auto& name : input_feed) {
+    thread_reader_->AddFeedVar(thread_scope_->Var(name), name);
+  }
+}
+
+// Replaces the list of variable names printed by TrainFiles in debug mode
+// with a copy of `fetch_var_names`.
+void ExecutorThreadWorker::SetFetchVarNames(
+    const std::vector<std::string>& fetch_var_names) {
+  fetch_var_names_ = fetch_var_names;
+}
+
+// Tries to pin the calling thread to the CPU whose index equals thread_id_.
+// Compiled to a no-op on Windows and macOS. Every failure path only logs a
+// warning; nothing is thrown or returned.
+void ExecutorThreadWorker::SetDevice() {
+#if defined _WIN32 || defined __APPLE__
+  return;
+#else
+  static unsigned concurrency_cap = std::thread::hardware_concurrency();
+  int thread_id = this->thread_id_;
+
+  // A thread id with no matching CPU index (including the default -1, which
+  // becomes a huge unsigned value) is left unpinned.
+  if (static_cast<unsigned>(thread_id) >= concurrency_cap) {
+    VLOG(1) << "WARNING: Failed to set thread affinity for thread "
+            << thread_id;
+    return;
+  }
+
+  unsigned proc = thread_id;
+  cpu_set_t mask;
+  CPU_ZERO(&mask);
+  CPU_SET(proc, &mask);
+
+  if (sched_setaffinity(0, sizeof(mask), &mask) == -1) {
+    VLOG(1) << "WARNING: Failed to set thread affinity for thread "
+            << thread_id;
+    return;
+  }
+
+  // Read the mask back to confirm the requested CPU is really in it.
+  CPU_ZERO(&mask);
+  const bool pinned = (sched_getaffinity(0, sizeof(mask), &mask) == 0) &&
+                      (CPU_ISSET(proc, &mask) != 0);
+  if (!pinned) {
+    VLOG(3) << "WARNING: Failed to set thread affinity for thread "
+            << thread_id;
+  }
+#endif
+}
+
+// Writes all elements of `lod_tensor`, read as type T, to stdout on a single
+// line of the form "<var_name> (element num N): [e0 e1 ...]".
+// An empty tensor prints "[]".
+template <typename T>
+void print_lod_tensor(std::string var_name, const LoDTensor& lod_tensor) {
+  auto inspect = lod_tensor.data<T>();
+  auto element_num = lod_tensor.numel();
+
+  std::ostringstream sstream;
+  sstream << var_name << " (element num " << element_num << "): [";
+  // Only touch element 0 when it exists; reading it unconditionally is an
+  // out-of-bounds access for an empty tensor.
+  if (element_num > 0) {
+    sstream << inspect[0];
+  }
+  // The index uses numel()'s own type so it cannot be narrower than the
+  // element count it is compared against.
+  for (decltype(element_num) j = 1; j < element_num; ++j) {
+    sstream << " " << inspect[j];
+  }
+  sstream << "]";
+
+  std::cout << sstream.str() << std::endl;
+}
+
+// Prints the LoDTensor stored under `var_name` in `scope` to stdout,
+// dispatching on the tensor's element type. If the variable is missing, is
+// not a LoDTensor, or has an element type not listed below, a VLOG(1) message
+// is emitted and nothing is printed.
+void print_fetch_var(Scope* scope, std::string var_name) {
+  // FindVar may return nullptr; check before dereferencing so a misspelled
+  // fetch name does not crash the training thread.
+  Variable* var = scope->FindVar(var_name);
+  if (var == nullptr || !var->IsType<LoDTensor>()) {
+    VLOG(1) << "print_fetch_var: variable " << var_name
+            << " is not in scope or is not a LoDTensor";
+    return;
+  }
+  const LoDTensor& tensor = var->Get<LoDTensor>();
+
+  // Resolve the element type once instead of once per comparison.
+  const std::type_index dtype(tensor.type());
+
+  if (dtype == std::type_index(typeid(platform::float16))) {
+    print_lod_tensor<platform::float16>(var_name, tensor);
+  } else if (dtype == std::type_index(typeid(float))) {
+    print_lod_tensor<float>(var_name, tensor);
+  } else if (dtype == std::type_index(typeid(double))) {
+    print_lod_tensor<double>(var_name, tensor);
+  } else if (dtype == std::type_index(typeid(int))) {
+    print_lod_tensor<int>(var_name, tensor);
+  } else if (dtype == std::type_index(typeid(int64_t))) {
+    print_lod_tensor<int64_t>(var_name, tensor);
+  } else if (dtype == std::type_index(typeid(bool))) {
+    print_lod_tensor<bool>(var_name, tensor);
+  } else if (dtype == std::type_index(typeid(uint8_t))) {
+    print_lod_tensor<uint8_t>(var_name, tensor);
+  } else if (dtype == std::type_index(typeid(int16_t))) {
+    print_lod_tensor<int16_t>(var_name, tensor);
+  } else if (dtype == std::type_index(typeid(int8_t))) {
+    print_lod_tensor<int8_t>(var_name, tensor);
+  } else {
+    VLOG(1) << "print_fetch_var: unrecognized data type:" << dtype.name();
+  }
+}
+
+// Per-thread training loop: pulls batches from the data feed until Next()
+// returns a non-positive value and runs every operator in ops_ on the thread
+// scope for each batch. When debug mode is on and this is thread 0, the fetch
+// variables are printed after each batch.
+// The data feed must have been set with SetDataFeed before calling this.
+void ExecutorThreadWorker::TrainFiles() {
+  PADDLE_ENFORCE_NOT_NULL(thread_reader_.get(),
+                          "data feed should be set before training");
+
+  platform::SetNumThreads(1);
+
+  // todo: configurable
+  SetDevice();
+
+  fetch_values_.clear();
+  fetch_values_.resize(fetch_var_names_.size());
+
+  thread_reader_->Start();
+
+  while (thread_reader_->Next() > 0) {
+    // executor run here
+    for (auto& op : ops_) {
+      op->Run(*thread_scope_, place_);
+    }
+
+    // Release the child scopes created under the thread scope for this batch.
+    thread_scope_->DropKids();
+
+    if (!debug_ || thread_id_ != 0) {
+      continue;
+    }
+
+    for (const auto& fetch_name : fetch_var_names_) {
+      print_fetch_var(thread_scope_, fetch_name);
+    }
+  }
+}
+
+// Sets the worker's index; SetDevice uses it as the CPU index to pin to and
+// TrainFiles prints fetch variables only for index 0.
+void ExecutorThreadWorker::SetThreadId(int tid) { thread_id_ = tid; }
+
+// Stores the place that is passed to every operator's Run call in TrainFiles.
+void ExecutorThreadWorker::SetPlace(const platform::Place& place) {
+  place_ = place;
+}
+
+// Keeps a private copy of `main_program_desc`, replacing any previously
+// stored copy.
+void ExecutorThreadWorker::SetMainProgram(
+    const ProgramDesc& main_program_desc) {
+  main_program_.reset(new ProgramDesc(main_program_desc));
+}
+
+// Stores a non-owning pointer to the root scope. It must be set before
+// CreateThreadResource, because CreateThreadScope enforces it is non-null.
+void ExecutorThreadWorker::SetRootScope(Scope* g_scope) {
+  root_scope_ = g_scope;
+}
+
+}  // end namespace framework
+}  // end namespace paddle
--- a/paddle/fluid/framework/executor_thread_worker.h
+++ b/paddle/fluid/framework/executor_thread_worker.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <mutex>  // NOLINT
+#include <set>
+#include <string>
+#include <thread>  // NOLINT
+#include <vector>
+#include "paddle/fluid/framework/data_feed.h"
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/scope.h"
+
+namespace paddle {
+namespace framework {
+// NOTE(review): executor_thread_worker.cc initializes variables through
+// InitializeVariable and does not define or call CreateTensor — confirm this
+// declaration is still needed and where its definition lives.
+void CreateTensor(Variable* var, proto::VarType::Type var_type);
+
+// One training worker: owns a thread-local scope and a private list of
+// operators, pulls batches from a DataFeed and runs the operators on them.
+// Typical setup order, as required by the checks and dereferences in the
+// implementation: SetRootScope -> CreateThreadResource -> SetDataFeed ->
+// BindingDataFeedMemory -> TrainFiles.
+class ExecutorThreadWorker {
+ public:
+  ExecutorThreadWorker()
+      : thread_id_(-1), root_scope_(NULL), thread_scope_(NULL), debug_(false) {}
+  // NOTE(review): ops_ holds raw pointers released from unique_ptr in
+  // CreateThreadOperators, and this destructor does not delete them —
+  // confirm who is responsible for freeing the operators.
+  ~ExecutorThreadWorker() {}
+
+  // create the thread scope, the operators, a copy of the program and store
+  // the execution place; the root scope must be set first
+  void CreateThreadResource(const framework::ProgramDesc& program,
+                            const paddle::platform::Place& place);
+  // set the index of this worker
+  void SetThreadId(int tid);
+  // enable or disable printing of fetch variables during TrainFiles
+  void SetDebug(const bool debug) { debug_ = debug; }
+  // set the scope shared by all workers (not owned)
+  void SetRootScope(Scope* g_scope);
+  // set cpu device in this function
+  // cpu binding is used by default
+  void SetDevice();
+  // since we read data into memory that can not be accessed by program
+  // we need to bind memory of data with corresponding variables in program
+  // this function should be called after data feed is set
+  void BindingDataFeedMemory();
+  // set data feed declared in executor
+  void SetDataFeed(const std::shared_ptr<DataFeed>& datafeed);
+  // A multi-thread training function
+  void TrainFiles();
+  // set fetch variable names from python interface assigned by users
+  void SetFetchVarNames(const std::vector<std::string>& fetch_var_names);
+
+ private:
+  // helpers called by CreateThreadResource
+  void CreateThreadScope(const framework::ProgramDesc& program);
+  void CreateThreadOperators(const framework::ProgramDesc& program);
+  void SetMainProgram(const ProgramDesc& main_program_desc);
+  void SetPlace(const paddle::platform::Place& place);
+
+ protected:
+  // data feed this worker reads batches from
+  std::shared_ptr<DataFeed> thread_reader_;  // shared queue, thread buffer
+  // thread index
+  int thread_id_;
+  // operator type names, one per entry of ops_
+  std::vector<std::string> op_names_;
+  // thread level, local operators for forward and backward
+  std::vector<OperatorBase*> ops_;
+  // main program for training
+  std::unique_ptr<framework::ProgramDesc> main_program_;
+  // execution place
+  platform::Place place_;
+  // root scope for model parameters
+  Scope* root_scope_;
+  // a thread scope; its parent scope is the root scope, which is shared
+  Scope* thread_scope_;
+
+ private:
+  // names of the variables printed in debug mode
+  std::vector<std::string> fetch_var_names_;
+  // resized to one entry per fetch variable in TrainFiles; not otherwise
+  // written by the code in executor_thread_worker.cc
+  std::vector<std::vector<float>> fetch_values_;
+  // when true, thread 0 prints the fetch variables after every batch
+  bool debug_;
+};
+
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/feed_fetch_method.cc
+++ b/paddle/fluid/framework/feed_fetch_method.cc
@@ -16,7 +16,9 @@ limitations under the License. */
 #include <string>
 #include <vector>
 #include "glog/logging.h"
+#include "paddle/fluid/framework/var_type.h"
 #include "paddle/fluid/framework/variable.h"
+#include "paddle/fluid/platform/place.h"

 namespace paddle {
 namespace framework {
@@ -53,5 +55,12 @@ LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
  return tensor;
 }

+// Returns a mutable reference to the LoDTensor held by variable `var_name`
+// in `scope`. Fails a PADDLE_ENFORCE check when the variable cannot be found
+// or does not hold a LoDTensor.
+LoDTensor& GetVariableTensor(const Scope& scope, const std::string& var_name) {
+  Variable* var = scope.FindVar(var_name);
+  PADDLE_ENFORCE(var, "%s is not in scope", var_name);
+  PADDLE_ENFORCE(var->IsType<LoDTensor>(), "Only support lod tensor now.");
+  return *var->GetMutable<LoDTensor>();
+}
+
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/feed_fetch_method.h
+++ b/paddle/fluid/framework/feed_fetch_method.h
@@ -27,5 +27,7 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input,
 LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
                            size_t index);

+LoDTensor& GetVariableTensor(const Scope& scope, const std::string& var_name);
+
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/garbage_collector.cc
+++ b/paddle/fluid/framework/garbage_collector.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/cuda_device_guard.h"
+#endif
+#include "paddle/fluid/framework/garbage_collector.h"
+
+namespace paddle {
+namespace framework {
+
+// Sets up an empty garbage queue for `place`. The flush threshold is clamped
+// to at least one byte, and the device context is fetched from the global
+// DeviceContextPool.
+GarbageCollector::GarbageCollector(const platform::Place &place,
+                                   size_t max_memory_size)
+    : dev_ctx_(platform::DeviceContextPool::Instance().Get(place)),
+      garbages_(new GarbageQueue()),
+      max_memory_size_((std::max)(max_memory_size, static_cast<size_t>(1))) {}
+
+// Forwards to the base-class constructor; no CPU-specific state.
+CPUGarbageCollector::CPUGarbageCollector(const platform::CPUPlace &place,
+                                         size_t max_memory_size)
+    : GarbageCollector(place, max_memory_size) {}
+
+// Runs the release callback immediately on the calling thread.
+void CPUGarbageCollector::ClearCallback(const std::function<void()> &callback) {
+  callback();
+}
+
+#ifdef PADDLE_WITH_CUDA
+// Forwards to the base-class constructor; no extra state.
+UnsafeFastGPUGarbageCollector::UnsafeFastGPUGarbageCollector(
+    const platform::CUDAPlace &place, size_t max_memory_size)
+    : GarbageCollector(place, max_memory_size) {}
+
+// Runs the release callback immediately on the calling thread, without
+// going through any CUDA stream.
+// NOTE(review): presumably "unsafe" because nothing here waits for pending
+// device work that may still use the memory — confirm with the callers.
+void UnsafeFastGPUGarbageCollector::ClearCallback(
+    const std::function<void()> &callback) {
+  callback();
+}
+
+// Forwards to the base-class constructor; callbacks go through the device
+// context obtained there.
+DefaultStreamGarbageCollector::DefaultStreamGarbageCollector(
+    const platform::CUDAPlace &place, size_t max_memory_size)
+    : GarbageCollector(place, max_memory_size) {}
+
+// Delegates to CUDADeviceContext::WaitStreamCallback on this collector's
+// device context.
+void DefaultStreamGarbageCollector::Wait() const {
+  static_cast<platform::CUDADeviceContext *>(this->dev_ctx_)
+      ->WaitStreamCallback();
+}
+
+// Hands the release callback to the device context via AddStreamCallback
+// instead of running it directly.
+void DefaultStreamGarbageCollector::ClearCallback(
+    const std::function<void()> &callback) {
+  static_cast<platform::CUDADeviceContext *>(this->dev_ctx_)
+      ->AddStreamCallback(callback);
+}
+
+// Creates a dedicated CUDA stream and a callback manager bound to it.
+// The CUDADeviceGuard is constructed with the target device id before the
+// stream is created.
+StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place,
+                                               size_t max_memory_size)
+    : GarbageCollector(place, max_memory_size) {
+  platform::CUDADeviceGuard guard(place.device);
+  PADDLE_ENFORCE(cudaStreamCreate(&stream_));
+  callback_manager_.reset(new platform::StreamCallbackManager(stream_));
+}
+
+// Synchronizes the private stream and then destroys it, with a
+// CUDADeviceGuard for the collector's device in place.
+// NOTE(review): if PADDLE_ENFORCE throws on failure, a CUDA error here would
+// escape a destructor and terminate the process — confirm this is intended.
+StreamGarbageCollector::~StreamGarbageCollector() {
+  auto place = boost::get<platform::CUDAPlace>(this->dev_ctx_->GetPlace());
+  platform::CUDADeviceGuard guard(place.device);
+  PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
+  PADDLE_ENFORCE(cudaStreamDestroy(stream_));
+}
+
+// Returns the private stream created in the constructor.
+cudaStream_t StreamGarbageCollector::stream() const { return stream_; }
+
+// Delegates to the callback manager's Wait().
+void StreamGarbageCollector::Wait() const { callback_manager_->Wait(); }
+
+// Queues the release callback on the callback manager bound to the private
+// stream.
+void StreamGarbageCollector::ClearCallback(
+    const std::function<void()> &callback) {
+  callback_manager_->AddCallback(callback);
+}
+#endif
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/garbage_collector.h
+++ b/paddle/fluid/framework/garbage_collector.h
@@ -14,7 +14,6 @@

 #pragma once

-#include <algorithm>
 #include <deque>
 #include <functional>
 #include <memory>
@@ -24,134 +23,74 @@
 namespace paddle {
 namespace framework {

-// T should have memory_size() and clear() method
-template <typename T>
 class GarbageCollector {
 public:
-  GarbageCollector(const platform::Place &place, size_t max_memory_size)
-      : max_memory_size_((std::max)(max_memory_size, static_cast<size_t>(1))) {
-    garbages_.reset(new std::deque<T *>());
-    dev_ctx_ = platform::DeviceContextPool::Instance().Get(place);
-  }
+  using GarbageQueue = std::deque<std::shared_ptr<memory::Allocation>>;

-  virtual ~GarbageCollector() {}
+  GarbageCollector(const platform::Place &place, size_t max_memory_size);

-  void Reset() {
-    std::lock_guard<std::mutex> guard(mutex_);
-    garbages_.reset(new std::deque<T *>());
-    cur_memory_size_ = 0;
-  }
+  virtual ~GarbageCollector() = default;
+
+  virtual void Wait() const {}

  template <typename Container>
-  void Add(const Container &objs) {
-    Add(objs, []() {});
-  }
+  void Add(Container &&objs);

  template <typename Container, typename Callback>
-  void Add(const Container &objs, Callback &&callback) {
-    std::shared_ptr<std::deque<T *>> clear_deque;
-    {
-      std::lock_guard<std::mutex> guard(mutex_);
-      for (auto *obj : objs) {
-        garbages_->push_back(obj);
-        cur_memory_size_ += obj->memory_size();
-      }
-      if (cur_memory_size_ >= max_memory_size_) {
-        cur_memory_size_ = 0;
-        clear_deque = garbages_;
-        garbages_.reset(new std::deque<T *>());
-      }
-    }
-
-    if (clear_deque != nullptr) {
-      callback();
-      ClearCallback([=]() {
-        for (auto *obj : *clear_deque) obj->clear();
-      });
-    }
-  }
-
-  virtual void Wait() const {}
+  void Add(Container &&objs, Callback &&callback);

 protected:
  virtual void ClearCallback(const std::function<void()> &callback) = 0;

  platform::DeviceContext *dev_ctx_;
-  std::shared_ptr<std::deque<T *>> garbages_;
+  std::unique_ptr<GarbageQueue> garbages_;
  mutable std::mutex mutex_;
  const size_t max_memory_size_;
-  size_t cur_memory_size_ = 0;
+  size_t cur_memory_size_{0};
 };

-template <typename T>
-class CPUGarbageCollector : public GarbageCollector<T> {
+class CPUGarbageCollector : public GarbageCollector {
 public:
-  CPUGarbageCollector(const platform::CPUPlace &place, size_t max_memory_size)
-      : GarbageCollector<T>(place, max_memory_size) {}
+  CPUGarbageCollector(const platform::CPUPlace &place, size_t max_memory_size);

 protected:
-  void ClearCallback(const std::function<void()> &callback) override {
-    callback();
-  }
+  void ClearCallback(const std::function<void()> &callback) override;
 };

 #ifdef PADDLE_WITH_CUDA
-template <typename T>
-class DefaultStreamGarbageCollector : public GarbageCollector<T> {
+class UnsafeFastGPUGarbageCollector : public GarbageCollector {
 public:
-  DefaultStreamGarbageCollector(const platform::CUDAPlace &place,
-                                size_t max_memory_size)
-      : GarbageCollector<T>(place, max_memory_size) {}
+  UnsafeFastGPUGarbageCollector(const platform::CUDAPlace &place,
+                                size_t max_memory_size);

-  cudaStream_t stream() const {
-    return static_cast<const platform::CUDADeviceContext *>(this->dev_ctx_)
-        ->stream();
-  }
+ protected:
+  void ClearCallback(const std::function<void()> &callback) override;
+};

-  void Wait() const override {
-    this->dev_ctx_->Wait();
-    static_cast<const platform::CUDADeviceContext *>(this->dev_ctx_)
-        ->WaitStreamCallback();
-  }
+class DefaultStreamGarbageCollector : public GarbageCollector {
+ public:
+  DefaultStreamGarbageCollector(const platform::CUDAPlace &place,
+                                size_t max_memory_size);
+
+  void Wait() const override;

 protected:
-  void ClearCallback(const std::function<void()> &callback) override {
-    static_cast<platform::CUDADeviceContext *>(this->dev_ctx_)
-        ->AddStreamCallback(callback);
-  }
+  void ClearCallback(const std::function<void()> &callback) override;
 };

-template <typename T>
-class StreamGarbageCollector : public GarbageCollector<T> {
+class StreamGarbageCollector : public GarbageCollector {
 public:
  StreamGarbageCollector(const platform::CUDAPlace &place,
-                         size_t max_memory_size)
-      : GarbageCollector<T>(place, max_memory_size) {
-    PADDLE_ENFORCE(cudaSetDevice(place.device));
-    PADDLE_ENFORCE(cudaStreamCreate(&stream_));
-    callback_manager_.reset(new platform::StreamCallbackManager(stream_));
-  }
+                         size_t max_memory_size);

-  ~StreamGarbageCollector() {
-    auto place = boost::get<platform::CUDAPlace>(this->dev_ctx_->GetPlace());
-    PADDLE_ENFORCE(cudaSetDevice(place.device));
-    PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
-    PADDLE_ENFORCE(cudaStreamDestroy(stream_));
-  }
+  ~StreamGarbageCollector();

-  void Wait() const override {
-    PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
-    std::lock_guard<std::mutex> guard(this->mutex_);
-    callback_manager_->Wait();
-  }
+  void Wait() const override;

-  cudaStream_t stream() const { return stream_; }
+  cudaStream_t stream() const;

 protected:
-  void ClearCallback(const std::function<void()> &callback) override {
-    std::lock_guard<std::mutex> guard(this->mutex_);
-    callback_manager_->AddCallback(callback);
-  }
+  void ClearCallback(const std::function<void()> &callback) override;

 private:
  cudaStream_t stream_;
@@ -159,5 +98,33 @@ class StreamGarbageCollector : public GarbageCollector<T> {
 };
 #endif

+// Convenience overload: adds `objs` with a no-op callback.
+template <typename Container>
+void GarbageCollector::Add(Container &&objs) {
+  Add(std::forward<Container>(objs), []() {});
+}
+
+// Moves every non-null element of `objs` into the pending queue and adds its
+// size() to the running total. Once the total reaches max_memory_size_, the
+// whole queue is detached and replaced with a fresh one, `callback` is
+// invoked, and deletion of the detached queue is handed to ClearCallback.
+// Only the queue bookkeeping runs under mutex_; `callback` and ClearCallback
+// run after the lock is released.
+template <typename Container, typename Callback>
+void GarbageCollector::Add(Container &&objs, Callback &&callback) {
+  GarbageQueue *garbage_queue = nullptr;
+  {
+    std::lock_guard<std::mutex> guard(mutex_);
+    for (auto &obj : objs) {
+      if (!obj) continue;
+      // Read the size before the element is moved out of the container.
+      cur_memory_size_ += obj->size();
+      garbages_->push_back(std::move(obj));
+    }
+    if (cur_memory_size_ >= max_memory_size_) {
+      cur_memory_size_ = 0;
+      // NOTE(review): from here until ClearCallback's lambda runs the queue
+      // is held by a raw pointer; an exception from `new` or from `callback`
+      // would leak it — confirm that is acceptable.
+      garbage_queue = garbages_.release();
+      garbages_.reset(new GarbageQueue());
+    }
+  }
+
+  if (garbage_queue) {
+    callback();
+    ClearCallback([garbage_queue]() { delete garbage_queue; });
+  }
+}
+
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc
@@ -46,14 +46,16 @@ std::unique_ptr<ir::Graph> ConvBiasFusePass::ApplyImpl(
  auto* scope = param_scope();
  PADDLE_ENFORCE(scope);

+  std::string type = is_conv3d() ? "conv3d" : "conv2d";
+
  GraphPatternDetector gpd;
  auto* conv_input =
      gpd.mutable_pattern()
          ->NewNode(patterns::PDNodeName(name_scope_, "conv_input"))
          ->AsInput()
-          ->assert_is_op_input("conv2d", "Input");
+          ->assert_is_op_input(type, "Input");
  patterns::ConvBias conv_bias_pattern(gpd.mutable_pattern(), name_scope_);
-  conv_bias_pattern(conv_input);
+  conv_bias_pattern(conv_input, is_conv3d());
  int found_conv_bias_count = 0;
  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                     Graph* g) {
@@ -109,7 +111,7 @@ std::unique_ptr<ir::Graph> ConvBiasFusePass::ApplyImpl(
      desc.SetInput("Filter", std::vector<std::string>({conv_weight->Name()}));
      desc.SetInput("Bias", std::vector<std::string>({eltwise_bias->Name()}));
      desc.SetOutput("Output", std::vector<std::string>({eltwise_out->Name()}));
-      desc.SetType("conv2d");
+      desc.SetType(type);

      for (auto& attr : conv->Op()->GetAttrMap()) {
        desc.SetAttr(attr.first, attr.second);
@@ -135,3 +137,5 @@ std::unique_ptr<ir::Graph> ConvBiasFusePass::ApplyImpl(
 }  // namespace paddle
 REGISTER_PASS(conv_bias_mkldnn_fuse_pass,
              paddle::framework::ir::ConvBiasFusePass);
+REGISTER_PASS(conv3d_bias_mkldnn_fuse_pass,
+              paddle::framework::ir::Conv3DBiasFusePass);
--- a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h
@@ -26,11 +26,19 @@ namespace ir {
 class ConvBiasFusePass : public FusePassBase {
 public:
  virtual ~ConvBiasFusePass() {}
+  virtual bool is_conv3d() const { return false; }

 protected:
  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
  const std::string name_scope_{"conv_bias_mkldnn_fuse"};
 };
+/*
+* Fuse the Conv3D and Elementwise_add to a Conv3DBiasOp.
+* Reuses ConvBiasFusePass and only overrides is_conv3d() to return true,
+* which switches the matched and emitted op type from "conv2d" to "conv3d".
+*/
+class Conv3DBiasFusePass : public ConvBiasFusePass {
+ public:
+  bool is_conv3d() const override { return true; }
+};
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ir/graph.cc
+++ b/paddle/fluid/framework/ir/graph.cc
@@ -38,9 +38,8 @@ void CheckProgram(const ProgramDesc &program) {
    switch (role_id) {
      case _INT(OpRole::kForward):
        if (visit.find(_INT(OpRole::kBackward)) != visit.end()) {
-          LOG(ERROR)
-              << "Cannot add backward operator before forward operator %s."
-              << op->Type();
+          LOG(ERROR) << "Cannot add backward operator before forward operator "
+                     << op->Type();
        }
        break;
      case _INT(OpRole::kBackward):

--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@@ -73,14 +73,21 @@ class Graph {
  }

  bool Has(const std::string &attr_name) const {
-    return attrs_.find(attr_name) != attrs_.end();
+    return attrs_.count(attr_name) > 0;
  }

  template <typename AttrType>
  AttrType &Get(const std::string &attr_name) const {
    PADDLE_ENFORCE(Has(attr_name), "%s attr not registered for graph.",
                   attr_name);
-    return *boost::any_cast<AttrType *>(attrs_.at(attr_name));
+    try {
+      return *boost::any_cast<AttrType *>(attrs_.at(attr_name));
+    } catch (boost::bad_any_cast &) {
+      PADDLE_THROW(
+          "Invalid attribute type of %s error, expected: %s, actual: %s",
+          attr_name, typeid(AttrType *).name(),
+          attrs_.at(attr_name).type().name());
+    }
  }

  template <typename AttrType>
@@ -177,14 +184,13 @@ class Graph {
    return nullptr;
  }

-  const ProgramDesc &program() const { return program_; }
-  std::map<std::string, std::vector<ir::Node *>> InitFromProgram(
-      const ProgramDesc &program);
-
  void ResolveHazard(
      const std::map<std::string, std::vector<ir::Node *>> &var_nodes);

 private:
+  std::map<std::string, std::vector<ir::Node *>> InitFromProgram(
+      const ProgramDesc &program);
+
  // This method takes ownership of `node`.
  ir::Node *AddNode(ir::Node *node) {
    PADDLE_ENFORCE(node_set_.find(node) == node_set_.end());

--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -1030,10 +1030,11 @@ PDNode *patterns::ElewiseAddActInplaceGrad::operator()(
 }

 PDNode *patterns::ConvBias::operator()(
-    paddle::framework::ir::PDNode *conv_input) {
+    paddle::framework::ir::PDNode *conv_input, bool is_conv3d) {
+  std::string type = is_conv3d ? "conv3d" : "conv2d";
  // Create Operators
-  conv_input->assert_is_op_input("conv2d", "Input");
-  auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op("conv2d");
+  conv_input->assert_is_op_input(type, "Input");
+  auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op(type);
  auto *eltiwse_op =
      pattern->NewNode(eltwise_repr())->assert_is_op("elementwise_add");
  // Create variables
@@ -1041,11 +1042,11 @@ PDNode *patterns::ConvBias::operator()(
  auto *conv_weight_var = pattern->NewNode(conv_weight_repr())
                              ->AsInput()
                              ->assert_is_persistable_var()
-                              ->assert_is_op_input("conv2d", "Filter");
+                              ->assert_is_op_input(type, "Filter");
  // intermediate variable, will be removed in the IR after fuse.
  auto *conv_out_var = pattern->NewNode(conv_out_repr())
                           ->AsIntermediate()
-                           ->assert_is_only_output_of_op("conv2d")
+                           ->assert_is_only_output_of_op(type)
                           ->assert_is_op_input("elementwise_add");
  // Bias stored in elementwise_add
  auto *eltwise_bias_var = pattern->NewNode(eltwise_bias_repr())

--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -623,7 +623,7 @@ struct ElewiseAddActInplaceGrad : public PatternBase {
 struct ConvBias : public PatternBase {
  ConvBias(PDPattern* pattern, const std::string& name_scope)
      : PatternBase(pattern, name_scope, "conv_bias") {}
-  PDNode* operator()(PDNode* conv_input);
+  PDNode* operator()(PDNode* conv_input, bool is_conv3d = false);
  // declare operator node's name
  PATTERN_DECL_NODE(conv);
  PATTERN_DECL_NODE(eltwise);

--- a/paddle/fluid/framework/ir/is_test_pass.cc
+++ b/paddle/fluid/framework/ir/is_test_pass.cc
@@ -38,7 +38,7 @@ std::unique_ptr<ir::Graph> IsTestPass::ApplyImpl(
  for (const Node* n : graph->Nodes()) {
    if (n->IsOp()) {
      auto* op = n->Op();
-      if (op->HasAttr("is_test")) {
+      if (op->HasAttr("is_test") || op->HasProtoAttr("is_test")) {
        op->SetAttr("is_test", true);
      } else if (std::find(begin(op_list), end(op_list), op->Type()) !=
                 end(op_list)) {

--- a/paddle/fluid/framework/ir/mkldnn_placement_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn_placement_pass.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/framework/ir/mkldnn_placement_pass.h"
+#include <string>

 namespace paddle {
 namespace framework {
@@ -21,9 +22,19 @@ namespace ir {
 std::unique_ptr<ir::Graph> MKLDNNPlacementPass::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  VLOG(3) << "Aplies MKL-DNN placement strategy.";
+  const auto& op_types_list =
+      Get<std::unordered_set<std::string>>("mkldnn_enabled_op_types");
  for (const Node* n : graph->Nodes()) {
-    if (n->IsOp() && n->Op()->HasAttr("use_mkldnn")) {
-      n->Op()->SetAttr("use_mkldnn", true);
+    if (n->IsOp()) {
+      auto* op = n->Op();
+      if (op->HasAttr("use_mkldnn") || op->HasProtoAttr("use_mkldnn")) {
+        if (op_types_list.empty()) {
+          op->SetAttr("use_mkldnn", true);
+        } else if (std::find(op_types_list.begin(), op_types_list.end(),
+                             n->Name()) != op_types_list.end()) {
+          op->SetAttr("use_mkldnn", true);
+        }
+      }
    }
  }
  return graph;
@@ -33,5 +44,5 @@ std::unique_ptr<ir::Graph> MKLDNNPlacementPass::ApplyImpl(
 }  // namespace framework
 }  // namespace paddle

-REGISTER_PASS(mkldnn_placement_pass,
-              paddle::framework::ir::MKLDNNPlacementPass);
+REGISTER_PASS(mkldnn_placement_pass, paddle::framework::ir::MKLDNNPlacementPass)
+    .RequirePassAttr("mkldnn_enabled_op_types");
--- a/paddle/fluid/framework/ir/node.cc
+++ b/paddle/fluid/framework/ir/node.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/framework/ir/node.h"
+#include "paddle/fluid/framework/op_info.h"

 namespace paddle {
 namespace framework {
@@ -24,10 +25,11 @@ constexpr char Node::kControlDepVarName[];
 const char Node::kControlDepVarName[] = "__control_var";
 #endif

-std::unique_ptr<Node> CreateNodeForTest(const std::string& name,
+std::unique_ptr<Node> CreateNodeForTest(const std::string &name,
                                        Node::Type type) {
  return std::unique_ptr<Node>(new Node(name, type));
 }
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ir/pass.h
+++ b/paddle/fluid/framework/ir/pass.h
@@ -51,11 +51,18 @@ class Pass {
  AttrType &Get(const std::string &attr_name) const {
    PADDLE_ENFORCE(attrs_.find(attr_name) != attrs_.end(),
                   "%s attr not registered for pass.", attr_name);
-    return *boost::any_cast<AttrType *>(attrs_.at(attr_name));
+    try {
+      return *boost::any_cast<AttrType *>(attrs_.at(attr_name));
+    } catch (boost::bad_any_cast &) {
+      PADDLE_THROW(
+          "Invalid attribute type of %s error, expected: %s, actual: %s",
+          attr_name, typeid(AttrType *).name(),
+          attrs_.at(attr_name).type().name());
+    }
  }

  bool Has(const std::string &attr_name) const {
-    return attrs_.find(attr_name) != attrs_.end();
+    return attrs_.count(attr_name) > 0;
  }

  void Erase(const std::string &attr_name) {

--- a/paddle/fluid/framework/naive_executor.cc
+++ b/paddle/fluid/framework/naive_executor.cc
@@ -21,42 +21,11 @@
 #include "paddle/fluid/framework/naive_executor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/reader.h"
+#include "paddle/fluid/framework/variable_helper.h"
 #include "paddle/fluid/string/pretty_log.h"

 namespace paddle {
 namespace framework {
-
-// These code can be shared with Executor.
-static void InitializeVariable(Variable *var, proto::VarType::Type var_type) {
-  if (var_type == proto::VarType::LOD_TENSOR) {
-    var->GetMutable<LoDTensor>();
-  } else if (var_type == proto::VarType::SELECTED_ROWS) {
-    var->GetMutable<SelectedRows>();
-  } else if (var_type == proto::VarType::FEED_MINIBATCH) {
-    var->GetMutable<FeedFetchList>();
-  } else if (var_type == proto::VarType::FETCH_LIST) {
-    var->GetMutable<FeedFetchList>();
-  } else if (var_type == proto::VarType::STEP_SCOPES) {
-    var->GetMutable<std::vector<framework::Scope *>>();
-  } else if (var_type == proto::VarType::LOD_RANK_TABLE) {
-    var->GetMutable<LoDRankTable>();
-  } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) {
-    var->GetMutable<LoDTensorArray>();
-  } else if (var_type == proto::VarType::PLACE_LIST) {
-    var->GetMutable<platform::PlaceList>();
-  } else if (var_type == proto::VarType::READER) {
-    var->GetMutable<ReaderHolder>();
-  } else if (var_type == proto::VarType::RAW) {
-    // GetMutable will be called in operator
-  } else {
-    PADDLE_THROW(
-        "Variable type %d is not in "
-        "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, "
-        "LOD_RANK_TABLE, PLACE_LIST, READER, CHANNEL, RAW]",
-        var_type);
-  }
-}
-
 void NaiveExecutor::Prepare(Scope *scope, const ProgramDesc &program_desc,
                            int block_id, bool with_feed_fetch_ops) {
  if (!scope) {

--- a/paddle/fluid/framework/ngraph_bridge.cc
+++ b/paddle/fluid/framework/ngraph_bridge.cc
@@ -12,28 +12,109 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#ifdef PADDLE_WITH_NGRAPH
 #include <algorithm>
 #include <functional>
+#include <vector>

 #include "paddle/fluid/framework/ngraph_bridge.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/platform/enforce.h"

 #include "ngraph/ngraph.hpp"

 namespace paddle {
 namespace framework {

+static std::shared_ptr<ngraph::Node> GetNode(
+    const std::shared_ptr<OperatorBase>& op, const std::string name,
+    const VariableNameMap& var_map,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  // Resolves slot `name` of `var_map` (which must hold exactly one variable)
+  // to the node stored for that variable; nullptr when none is stored.
+  auto& var_names = var_map.at(name);
+  PADDLE_ENFORCE_EQ(var_names.size(), 1,
+                    "op %s name %s expects one associated var", op->Type(),
+                    name);
+  auto found = ngb_node_map->find(var_names[0]);
+  if (found == ngb_node_map->end()) return nullptr;
+  return found->second;
+}
+
+static std::shared_ptr<ngraph::Node> GetInputNode(
+    const std::shared_ptr<OperatorBase>& op, const std::string name,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  return GetNode(op, name, op->Inputs(), ngb_node_map);
+}
+
+static std::shared_ptr<ngraph::Node> GetOutputNode(
+    const std::shared_ptr<OperatorBase>& op, const std::string name,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  return GetNode(op, name, op->Outputs(), ngb_node_map);
+}
+
+static void SetOutputNode(
+    const std::shared_ptr<OperatorBase>& op, const std::string name,
+    std::shared_ptr<ngraph::Node> node,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto& var_names = op->Outputs().at(name);
+  if (var_names.size() == 1) {
+    (*ngb_node_map)[var_names[0]] = node;
+  } else if (var_names.size() == 0) {
+    (*ngb_node_map)[""] = node;
+  } else {
+    PADDLE_THROW("name %s has more than 1 var_names.", name);
+  }
+}
+
+static bool HasOutput(const std::shared_ptr<OperatorBase>& op,
+                      const std::string name) {
+  // True only if output slot `name` exists and lists at least one variable.
+  auto slot = op->Outputs().find(name);
+  return slot != op->Outputs().end() && !slot->second.empty();
+}
+
+template <typename T>
+static void BuildBinaryNode(
+    const std::shared_ptr<OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto x = GetInputNode(op, "X", ngb_node_map);
+  auto y = GetInputNode(op, "Y", ngb_node_map);
+  auto out = std::make_shared<T>(x, y);
+  SetOutputNode(op, "Out", out, ngb_node_map);
+}
+
+template <typename T>
+static void BuildUnaryNode(
+    const std::shared_ptr<OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto input = GetInputNode(op, "X", ngb_node_map);
+  auto out = std::make_shared<T>(input);
+  SetOutputNode(op, "Out", out, ngb_node_map);
+}
+
 std::map<std::string,
         std::function<void(const std::shared_ptr<OperatorBase>&,
                            std::shared_ptr<std::unordered_map<
                                std::string, std::shared_ptr<ngraph::Node>>>)>>
-    NgraphBridge::NG_NODE_MAP = {};
+    NgraphBridge::NG_NODE_MAP = {{"relu", BuildUnaryNode<ngraph::op::Relu>},
+                                 {"tanh", BuildUnaryNode<ngraph::op::Tanh>}};

-void NgraphBridge::build_graph(const std::shared_ptr<OperatorBase>& op) {
+void NgraphBridge::BuildNgNode(const std::shared_ptr<OperatorBase>& op) {
  auto& op_type = op->Type();
-  NG_NODE_MAP[op_type](op, ngb_node_map);
+  NG_NODE_MAP.at(op_type)(op, ngb_node_map_);  // at() throws, never inserts
 }

 }  // namespace framework
 }  // namespace paddle
-#endif
--- a/paddle/fluid/framework/ngraph_bridge.h
+++ b/paddle/fluid/framework/ngraph_bridge.h
@@ -14,22 +14,18 @@ limitations under the License. */

 #pragma once

-#ifdef PADDLE_WITH_NGRAPH
-
 #include <algorithm>
 #include <map>
 #include <string>
 #include <unordered_map>
-#include <vector>
-
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/platform/enforce.h"

-#include "ngraph/ngraph.hpp"
+#include "ngraph/node.hpp"

 namespace paddle {
 namespace framework {

+class OperatorBase;
+
 class NgraphBridge {
 public:
  static std::map<
@@ -43,16 +39,15 @@ class NgraphBridge {
      std::shared_ptr<
          std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
          var_node_map)
-      : ngb_node_map(var_node_map) {}
+      : ngb_node_map_(var_node_map) {}

-  void build_graph(const std::shared_ptr<OperatorBase>& op);
+  void BuildNgNode(const std::shared_ptr<OperatorBase>& op);

 private:
  std::shared_ptr<
      std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-      ngb_node_map;
+      ngb_node_map_;
 };

 }  // namespace framework
 }  // namespace paddle
-#endif
--- a/paddle/fluid/framework/ngraph_operator.cc
+++ b/paddle/fluid/framework/ngraph_operator.cc
--- a/paddle/fluid/framework/ngraph_operator.h
+++ b/paddle/fluid/framework/ngraph_operator.h
@@ -14,39 +14,32 @@ limitations under the License. */

 #pragma once

-#ifdef PADDLE_WITH_NGRAPH
-
 #include <algorithm>
-#include <atomic>
 #include <string>
 #include <unordered_map>
 #include <vector>

 #include "paddle/fluid/framework/attribute.h"
-#include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/ngraph_bridge.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/op_kernel_type.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/variant.h"

-#include "ngraph/ngraph.hpp"
+#include "ngraph/type/element_type.hpp"

 namespace paddle {
 namespace framework {

-class FusedOperator : public OperatorBase {
+class NgraphOperator : public OperatorBase {
 public:
  static std::vector<
      std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>>
-  FusedOpIntervals(
+  NgraphOpIntervals(
      std::vector<std::unique_ptr<paddle::framework::OperatorBase>>* ops);

-  explicit FusedOperator(
+  explicit NgraphOperator(
      const ProgramDesc& prog, size_t block_id,
      std::vector<std::unique_ptr<OperatorBase>>::iterator start,
      std::vector<std::unique_ptr<OperatorBase>>::iterator end,
@@ -69,4 +62,3 @@ class FusedOperator : public OperatorBase {
 };
 }  // namespace framework
 }  // namespace paddle
-#endif
--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@@ -237,6 +237,23 @@ void OpDesc::SetOutput(const std::string &param_name,
  this->outputs_[param_name] = args;
 }

+bool OpDesc::HasProtoAttr(const std::string &name) const {
+  // Reports whether the registered OpProto of this op type declares `name`.
+  auto &op_info = OpInfoMap::Instance();
+  if (!op_info.Has(desc_.type())) {
+    return false;
+  }
+  // Bind by reference; a plain `auto` would copy the whole OpInfo record.
+  const auto &info = op_info.Get(desc_.type());
+  if (!info.HasOpProtoAndChecker()) {
+    return false;
+  }
+  for (const auto &attr : info.Proto().attrs()) {
+    if (attr.name() == name) return true;
+  }
+  return false;
+}
+
 proto::AttrType OpDesc::GetAttrType(const std::string &name) const {
  auto it = attrs_.find(name);
  PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);

--- a/paddle/fluid/framework/op_desc.h
+++ b/paddle/fluid/framework/op_desc.h
@@ -65,6 +65,8 @@ class OpDesc {
    return attrs_.find(name) != attrs_.end();
  }

+  bool HasProtoAttr(const std::string &name) const;
+
  proto::AttrType GetAttrType(const std::string &name) const;

  std::vector<std::string> AttrNames() const;

--- a/paddle/fluid/framework/op_kernel_type.cc
+++ b/paddle/fluid/framework/op_kernel_type.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/op_kernel_type.h"
+
+namespace paddle {
+namespace framework {
+
+size_t OpKernelType::Hash::operator()(const OpKernelType& key) const {
+  // Packs every field into its own bit range of one int, then hashes that
+  // int. cur_loc tracks the bit offset at which the next field starts.
+  int cur_loc = 0;
+
+  int place = key.place_.which();
+  cur_loc += OpKernelType::kPlaceBits;
+
+  int data_type = static_cast<int>(key.data_type_) << cur_loc;
+  cur_loc += OpKernelType::kPrimaryDTypeBits;
+
+  int data_layout = static_cast<int>(key.data_layout_) << cur_loc;
+  cur_loc += OpKernelType::kLayoutBits;
+
+  int library_type = static_cast<int>(key.library_type_) << cur_loc;
+  cur_loc += OpKernelType::kLibBits;
+
+  int customized_value = key.customized_type_value_;
+  // Enforce the lower bound too: a negative value would satisfy an
+  // upper-bound-only check and then be left-shifted.
+  PADDLE_ENFORCE(customized_value >= 0 &&
+                 customized_value < (1 << OpKernelType::kCustomizeBits));
+  customized_value = customized_value << cur_loc;
+  cur_loc += OpKernelType::kCustomizeBits;
+  // The fields live in an int, so the bit budget is the width of int (sign
+  // bit excluded), not 64.
+  PADDLE_ENFORCE(cur_loc < static_cast<int>(sizeof(int) * 8));
+
+  std::hash<int> hasher;
+  return hasher(place + data_type + data_layout + library_type +
+                customized_value);
+}
+
+bool OpKernelType::operator==(const OpKernelType& o) const {
+  // Equal when every scalar field agrees and places_are_same_class accepts
+  // the two places.
+  if (data_type_ != o.data_type_ || data_layout_ != o.data_layout_) {
+    return false;
+  }
+  if (library_type_ != o.library_type_ ||
+      customized_type_value_ != o.customized_type_value_) {
+    return false;
+  }
+  return platform::places_are_same_class(place_, o.place_);
+}
+
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/op_kernel_type.h
+++ b/paddle/fluid/framework/op_kernel_type.h
@@ -24,54 +24,55 @@ limitations under the License. */
 namespace paddle {
 namespace framework {

-struct OpKernelType {
-  struct Hash {
-    size_t operator()(const OpKernelType& key) const {
-      int place = key.place_.which();
-      int data_type = static_cast<int>(key.data_type_) << LEFT_SHIFT;
-      int data_layout = static_cast<int>(key.data_layout_) << (LEFT_SHIFT * 2);
-      int library_type = static_cast<int>(key.library_type_)
-                         << (LEFT_SHIFT * 3);
-
-      std::hash<int> hasher;
-      return hasher(place + data_type + data_layout + library_type);
-    }
-  };
+class OpKernelType {
+ public:
+  constexpr static int kDefaultCustomizedTypeValue = 0;

-  // place, data_type, library_type kinds less than 2^8
-  constexpr static int LEFT_SHIFT = 8;
-
-  proto::VarType::Type data_type_;
-  DataLayout data_layout_;
-  platform::Place place_;
-  LibraryType library_type_;
+  // In total should be smaller than 64.
+  constexpr static int kPlaceBits = 4;
+  constexpr static int kPrimaryDTypeBits = 8;
+  constexpr static int kLayoutBits = 4;
+  constexpr static int kLibBits = 4;
+  constexpr static int kCustomizeBits = 4;

  OpKernelType(proto::VarType::Type data_type, platform::Place place,
               DataLayout data_layout = DataLayout::kAnyLayout,
-               LibraryType library_type = LibraryType::kPlain)
+               LibraryType library_type = LibraryType::kPlain,
+               int customized_type_value = kDefaultCustomizedTypeValue)
      : data_type_(data_type),
        data_layout_(data_layout),
        place_(place),
-        library_type_(library_type) {}
+        library_type_(library_type),
+        customized_type_value_(customized_type_value) {}

  OpKernelType(proto::VarType::Type data_type,
               const platform::DeviceContext& dev_ctx,
               DataLayout data_layout = DataLayout::kAnyLayout,
-               LibraryType library_type = LibraryType::kPlain)
+               LibraryType library_type = LibraryType::kPlain,
+               int customized_type_value = kDefaultCustomizedTypeValue)
      : data_type_(data_type),
        data_layout_(data_layout),
        place_(dev_ctx.GetPlace()),
-        library_type_(library_type) {}
+        library_type_(library_type),
+        customized_type_value_(customized_type_value) {}
+
+  virtual ~OpKernelType() {}
+
+  struct Hash {
+    size_t operator()(const OpKernelType& key) const;
+  };

  size_t hash_key() const { return Hash()(*this); }

-  bool operator==(const OpKernelType& o) const {
-    return platform::places_are_same_class(place_, o.place_) &&
-           data_type_ == o.data_type_ && data_layout_ == o.data_layout_ &&
-           library_type_ == o.library_type_;
-  }
+  bool operator==(const OpKernelType& o) const;

  bool operator!=(const OpKernelType& o) const { return !(*this == o); }
+
+  proto::VarType::Type data_type_;
+  DataLayout data_layout_;
+  platform::Place place_;
+  LibraryType library_type_;
+  int customized_type_value_;
 };

 inline std::ostream& operator<<(std::ostream& os,

--- a/paddle/fluid/framework/op_registry.h
+++ b/paddle/fluid/framework/op_registry.h
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -695,6 +695,12 @@ static void CheckTensorNANOrInf(const std::string& name,
                 "Tensor %s contains NAN", name);
 }

+void OperatorWithKernel::RuntimeInferShape(const Scope& scope,
+                                           const platform::Place& place) const {
+  RuntimeInferShapeContext infer_shape_ctx(*this, scope);
+  this->InferShape(&infer_shape_ctx);
+}
+
 void OperatorWithKernel::RunImpl(const Scope& scope,
                                 const platform::Place& place) const {
  RuntimeInferShapeContext infer_shape_ctx(*this, scope);
@@ -873,6 +879,8 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
          t = &(var->Get<SelectedRows>().value());
        }
        if (t != nullptr) {
+          PADDLE_ENFORCE(t->IsInitialized(), "Input %s is not initialized: %s",
+                         ipt_name, DebugString());
          int tmp = static_cast<int>(ToDataType(t->type()));
          PADDLE_ENFORCE(
              tmp == data_type || data_type == -1,

--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -128,6 +128,8 @@ class OperatorBase {
  virtual std::vector<std::string> OutputVars(bool has_intermediate) const;

  void SetIsCalledByExecutor(bool x) { run_by_executor_ = x; }
+  virtual void RuntimeInferShape(const Scope& scope,
+                                 const platform::Place& place) const {}

 protected:
  std::string type_;
@@ -348,6 +350,9 @@ class OperatorWithKernel : public OperatorBase {
    OpInfoMap::Instance().Get(Type()).infer_shape_(ctx);
  }

+  void RuntimeInferShape(const Scope& scope,
+                         const platform::Place& place) const override;
+
 protected:
  virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const;
  virtual OpKernelType GetKernelTypeForVar(

--- a/paddle/fluid/framework/operator_test.cc
+++ b/paddle/fluid/framework/operator_test.cc
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -14,7 +14,6 @@ limitations under the License. */

 #pragma once

-#include <atomic>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
@@ -29,10 +28,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device_context.h"

-#ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/framework/details/reference_count_pass.h"
-#endif
-
 namespace paddle {
 namespace framework {

@@ -75,24 +70,7 @@ class ParallelExecutor {
 private:
  void BCastParamsToDevices(const std::unordered_set<std::string> &vars) const;

-  std::unique_ptr<ParallelExecutorPrivate> member_;
-
-#ifdef PADDLE_WITH_CUDA
-  // ref_cnts_ is only initialized when ParallelExecutor constructs, and then
-  // keeps unchanged
-  // Before each iteration, cur_ref_cnts_ is reset to ref_cnts_
-  details::DeviceReferenceCountMap ref_cnts_;
-  details::AtomicDeviceReferenceCountMap cur_ref_cnts_;
-  details::DeviceGarbageCollectorMap gcs_;
-
-  void ResetReferenceCount() {
-    for (auto &pair1 : ref_cnts_) {
-      for (auto &pair2 : *(pair1.second)) {
-        (*(cur_ref_cnts_[pair1.first]))[pair2.first] = pair2.second;
-      }
-    }
-  }
-#endif
+  ParallelExecutorPrivate *member_;
 };

 }  // namespace framework

--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -38,6 +38,10 @@ DEFINE_double(
    "Memory size threshold (GB) when the garbage collector clear tensors."
    "Disabled when this value is less than 0");

+DEFINE_bool(fast_eager_deletion_mode, false,
+            "Fast eager deletion mode. If enabled, memory would release "
+            "immediately without waiting GPU kernel ends.");
+
 // When in inference scenario, the scopes will not be written by two threads in
 // a mean time, but a scope may be read by multiple threads concurrently, and
 // the mutex will cause serious performance issue.
@@ -58,6 +62,8 @@ int64_t GetEagerDeletionThreshold() {
                                    (static_cast<int64_t>(1) << 30));
 }

+bool IsFastEagerDeletionModeEnabled() { return FLAGS_fast_eager_deletion_mode; }
+
 Scope::~Scope() { DropKids(); }

 Scope& Scope::NewScope() const {

--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@@ -27,6 +27,7 @@ namespace paddle {
 namespace framework {

 int64_t GetEagerDeletionThreshold();
+bool IsFastEagerDeletionModeEnabled();

 class Scope;


--- a/paddle/fluid/framework/selected_rows.h
+++ b/paddle/fluid/framework/selected_rows.h
@@ -32,8 +32,7 @@ namespace framework {
 class SelectedRows {
  /*
   * @brief We can use the SelectedRows structure to reproduce a sparse table.
-   *  A sparse table is a key-value structure that the key is an `int64_t`
-   * number,
+   *  A sparse table is a key-value structure whose key is an `int64_t`,
   *  and the value is a Tensor which the first dimension is 0.
   *  You can use the following interface to operate the sparse table, and you
   * can find

--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -158,6 +158,10 @@ class Tensor {
  const std::shared_ptr<memory::Allocation>& Holder() const { return holder_; }
  size_t offset() const { return offset_; }

+  std::shared_ptr<memory::Allocation> MoveMemoryHolder() {
+    return std::move(holder_);
+  }
+
 private:
  /*! holds the memory block if allocated. */
  std::shared_ptr<memory::Allocation> holder_;

--- a/paddle/fluid/framework/variable_helper.cc
+++ b/paddle/fluid/framework/variable_helper.cc
--- a/paddle/fluid/framework/variable_helper.h
+++ b/paddle/fluid/framework/variable_helper.h
--- a/paddle/fluid/imperative/CMakeLists.txt
+++ b/paddle/fluid/imperative/CMakeLists.txt
+cc_library(layer SRCS layer.cc DEPS proto_desc operator)
+cc_library(tracer SRCS tracer.cc DEPS proto_desc)
+cc_library(engine SRCS engine.cc)
--- a/paddle/fluid/imperative/engine.cc
+++ b/paddle/fluid/imperative/engine.cc
--- a/paddle/fluid/imperative/engine.h
+++ b/paddle/fluid/imperative/engine.h
--- a/paddle/fluid/imperative/layer.cc
+++ b/paddle/fluid/imperative/layer.cc
--- a/paddle/fluid/imperative/layer.h
+++ b/paddle/fluid/imperative/layer.h
--- a/paddle/fluid/imperative/tracer.cc
+++ b/paddle/fluid/imperative/tracer.cc
--- a/paddle/fluid/imperative/tracer.h
+++ b/paddle/fluid/imperative/tracer.h
--- a/paddle/fluid/inference/analysis/analysis_pass.h
+++ b/paddle/fluid/inference/analysis/analysis_pass.h
@@ -46,8 +46,6 @@ class AnalysisPass {
 protected:
  // User should implement these.
  virtual void RunImpl(Argument* argument) = 0;
-
-  Argument* argument_{nullptr};
 };

 }  // namespace analysis

--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
--- a/paddle/fluid/inference/analysis/passes/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/passes/CMakeLists.txt
--- a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc
--- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc
--- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h
+++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h
--- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
--- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h
+++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h
--- a/paddle/fluid/inference/analysis/passes/passes.cc
+++ b/paddle/fluid/inference/analysis/passes/passes.cc
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
--- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
+++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
--- a/paddle/fluid/inference/api/demo_ci/run.sh
+++ b/paddle/fluid/inference/api/demo_ci/run.sh
--- a/paddle/fluid/inference/api/details/reset_tensor_array.cc
+++ b/paddle/fluid/inference/api/details/reset_tensor_array.cc
--- a/paddle/fluid/inference/api/details/reset_tensor_array.h
+++ b/paddle/fluid/inference/api/details/reset_tensor_array.h
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
--- a/paddle/fluid/inference/api/paddle_pass_builder.h
+++ b/paddle/fluid/inference/api/paddle_pass_builder.h
--- a/paddle/fluid/inference/io.cc
+++ b/paddle/fluid/inference/io.cc
--- a/paddle/fluid/inference/io.h
+++ b/paddle/fluid/inference/io.h
--- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
--- a/paddle/fluid/inference/tensorrt/convert/test_prelu_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_prelu_op.cc
--- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
--- a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
--- a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
--- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
--- a/paddle/fluid/inference/tests/api/config_printer.h
+++ b/paddle/fluid/inference/tests/api/config_printer.h
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
--- a/paddle/fluid/inference/tests/api/trt_models_tester.cc
+++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc
--- a/paddle/fluid/inference/utils/CMakeLists.txt
+++ b/paddle/fluid/inference/utils/CMakeLists.txt
--- a/paddle/fluid/inference/utils/benchmark.cc
+++ b/paddle/fluid/inference/utils/benchmark.cc
--- a/paddle/fluid/inference/utils/benchmark.h
+++ b/paddle/fluid/inference/utils/benchmark.h
--- a/paddle/fluid/inference/utils/visualizer.cc
+++ b/paddle/fluid/inference/utils/visualizer.cc
--- a/paddle/fluid/inference/utils/visualizer.h
+++ b/paddle/fluid/inference/utils/visualizer.h
--- a/paddle/fluid/memory/allocation/legacy_allocator.cc
+++ b/paddle/fluid/memory/allocation/legacy_allocator.cc
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
--- a/paddle/fluid/operators/activation_mkldnn_op.cc
+++ b/paddle/fluid/operators/activation_mkldnn_op.cc
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
--- a/paddle/fluid/operators/attention_lstm_op.cc
+++ b/paddle/fluid/operators/attention_lstm_op.cc
--- a/paddle/fluid/operators/batch_norm_mkldnn_op.cc
+++ b/paddle/fluid/operators/batch_norm_mkldnn_op.cc
--- a/paddle/fluid/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
--- a/paddle/fluid/operators/batch_norm_op.cu.cc
+++ b/paddle/fluid/operators/batch_norm_op.cu.cc
--- a/paddle/fluid/operators/bilinear_tensor_product_op.cu
+++ b/paddle/fluid/operators/bilinear_tensor_product_op.cu
--- a/paddle/fluid/operators/bpr_loss_op.cc
+++ b/paddle/fluid/operators/bpr_loss_op.cc
--- a/paddle/fluid/operators/bpr_loss_op.h
+++ b/paddle/fluid/operators/bpr_loss_op.h
--- a/paddle/fluid/operators/concat_mkldnn_op.cc
+++ b/paddle/fluid/operators/concat_mkldnn_op.cc
--- a/paddle/fluid/operators/concat_op.cc
+++ b/paddle/fluid/operators/concat_op.cc
--- a/paddle/fluid/operators/controlflow/while_op.cc
+++ b/paddle/fluid/operators/controlflow/while_op.cc
--- a/paddle/fluid/operators/conv_fusion_op.cu.cc
+++ b/paddle/fluid/operators/conv_fusion_op.cu.cc
--- a/paddle/fluid/operators/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_mkldnn_op.cc
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
--- a/paddle/fluid/operators/conv_op.h
+++ b/paddle/fluid/operators/conv_op.h
--- a/paddle/fluid/operators/conv_transpose_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_transpose_mkldnn_op.cc
--- a/paddle/fluid/operators/conv_transpose_op.cc
+++ b/paddle/fluid/operators/conv_transpose_op.cc
--- a/paddle/fluid/operators/cos_sim_op.cu
+++ b/paddle/fluid/operators/cos_sim_op.cu
--- a/paddle/fluid/operators/crop_op.cu
+++ b/paddle/fluid/operators/crop_op.cu
--- a/paddle/fluid/operators/cudnn_lstm_op.cc
+++ b/paddle/fluid/operators/cudnn_lstm_op.cc
--- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc
+++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc
--- a/paddle/fluid/operators/detection/box_coder_op.h
+++ b/paddle/fluid/operators/detection/box_coder_op.h
--- a/paddle/fluid/operators/distributed/CMakeLists.txt
+++ b/paddle/fluid/operators/distributed/CMakeLists.txt
--- a/paddle/fluid/operators/distributed/brpc_client.cc
+++ b/paddle/fluid/operators/distributed/brpc_client.cc
--- a/paddle/fluid/operators/distributed/collective_client.cc
+++ b/paddle/fluid/operators/distributed/collective_client.cc
--- a/paddle/fluid/operators/distributed/collective_client.h
+++ b/paddle/fluid/operators/distributed/collective_client.h
--- a/paddle/fluid/operators/distributed/collective_server.cc
+++ b/paddle/fluid/operators/distributed/collective_server.cc
--- a/paddle/fluid/operators/distributed/collective_server.h
+++ b/paddle/fluid/operators/distributed/collective_server.h
--- a/paddle/fluid/operators/distributed/collective_server_test.cc
+++ b/paddle/fluid/operators/distributed/collective_server_test.cc
--- a/paddle/fluid/operators/distributed/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc_client.cc
--- a/paddle/fluid/operators/distributed/grpc_client.h
+++ b/paddle/fluid/operators/distributed/grpc_client.h
--- a/paddle/fluid/operators/distributed/grpc_serde.cc
+++ b/paddle/fluid/operators/distributed/grpc_serde.cc
--- a/paddle/fluid/operators/distributed/grpc_serde.h
+++ b/paddle/fluid/operators/distributed/grpc_serde.h
--- a/paddle/fluid/operators/distributed/grpc_serde_test.cc
+++ b/paddle/fluid/operators/distributed/grpc_serde_test.cc
--- a/paddle/fluid/operators/distributed/grpc_server.cc
+++ b/paddle/fluid/operators/distributed/grpc_server.cc
--- a/paddle/fluid/operators/distributed/grpc_service.h
+++ b/paddle/fluid/operators/distributed/grpc_service.h
--- a/paddle/fluid/operators/distributed/grpc_variable_response.cc
+++ b/paddle/fluid/operators/distributed/grpc_variable_response.cc
--- a/paddle/fluid/operators/distributed/parameter_prefetch.cc
+++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc
--- a/paddle/fluid/operators/distributed/parameter_prefetch.h
+++ b/paddle/fluid/operators/distributed/parameter_prefetch.h
--- a/paddle/fluid/operators/distributed/request_handler.h
+++ b/paddle/fluid/operators/distributed/request_handler.h
--- a/paddle/fluid/operators/distributed/request_handler_impl.cc
+++ b/paddle/fluid/operators/distributed/request_handler_impl.cc
--- a/paddle/fluid/operators/distributed/request_handler_impl.h
+++ b/paddle/fluid/operators/distributed/request_handler_impl.h
--- a/paddle/fluid/operators/distributed/rpc_client.h
+++ b/paddle/fluid/operators/distributed/rpc_client.h
--- a/paddle/fluid/operators/distributed/rpc_server.cc
+++ b/paddle/fluid/operators/distributed/rpc_server.cc
--- a/paddle/fluid/operators/distributed/rpc_server.h
+++ b/paddle/fluid/operators/distributed/rpc_server.h
--- a/paddle/fluid/operators/distributed/send_recv.proto.in
+++ b/paddle/fluid/operators/distributed/send_recv.proto.in
--- a/paddle/fluid/operators/distributed/variable_response.h
+++ b/paddle/fluid/operators/distributed/variable_response.h
--- a/paddle/fluid/operators/dropout_op.cu
+++ b/paddle/fluid/operators/dropout_op.cu
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cu
--- a/paddle/fluid/operators/elementwise/elementwise_div_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cu
--- a/paddle/fluid/operators/elementwise/elementwise_max_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_max_op.cu
--- a/paddle/fluid/operators/elementwise/elementwise_min_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_min_op.cu
--- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu
--- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h
--- a/paddle/fluid/operators/elementwise/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op.h
--- a/paddle/fluid/operators/elementwise/elementwise_pow_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_pow_op.cu
--- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu
--- a/paddle/fluid/operators/expand_op.cu
+++ b/paddle/fluid/operators/expand_op.cu
--- a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc
+++ b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc
--- a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc
+++ b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc
--- a/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc
+++ b/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc
--- a/paddle/fluid/operators/gru_unit_op.cu
+++ b/paddle/fluid/operators/gru_unit_op.cu
--- a/paddle/fluid/operators/hierarchical_sigmoid_op.cc
+++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc
--- a/paddle/fluid/operators/hierarchical_sigmoid_op.h
+++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h
--- a/paddle/fluid/operators/hinge_loss_op.cu
+++ b/paddle/fluid/operators/hinge_loss_op.cu
--- a/paddle/fluid/operators/huber_loss_op.cu
+++ b/paddle/fluid/operators/huber_loss_op.cu
--- a/paddle/fluid/operators/im2sequence_op.cu
+++ b/paddle/fluid/operators/im2sequence_op.cu
--- a/paddle/fluid/operators/isfinite_op.cu
+++ b/paddle/fluid/operators/isfinite_op.cu
--- a/paddle/fluid/operators/l1_norm_op.cu
+++ b/paddle/fluid/operators/l1_norm_op.cu
--- a/paddle/fluid/operators/load_combine_op.cc
+++ b/paddle/fluid/operators/load_combine_op.cc
--- a/paddle/fluid/operators/log_loss_op.cu
+++ b/paddle/fluid/operators/log_loss_op.cu
--- a/paddle/fluid/operators/lookup_table_op.cc
+++ b/paddle/fluid/operators/lookup_table_op.cc
--- a/paddle/fluid/operators/lookup_table_op.cu
+++ b/paddle/fluid/operators/lookup_table_op.cu
--- a/paddle/fluid/operators/lookup_table_op.h
+++ b/paddle/fluid/operators/lookup_table_op.h
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
--- a/paddle/fluid/operators/math/blas.h
+++ b/paddle/fluid/operators/math/blas.h
--- a/paddle/fluid/operators/math/blas_impl.h
+++ b/paddle/fluid/operators/math/blas_impl.h
--- a/paddle/fluid/operators/math/context_project.cu
+++ b/paddle/fluid/operators/math/context_project.cu
--- a/paddle/fluid/operators/math/cpu_vec.h
+++ b/paddle/fluid/operators/math/cpu_vec.h
--- a/paddle/fluid/operators/math/cpu_vec_test.cc
+++ b/paddle/fluid/operators/math/cpu_vec_test.cc
--- a/paddle/fluid/operators/math/jit_code.cc
+++ b/paddle/fluid/operators/math/jit_code.cc
--- a/paddle/fluid/operators/math/jit_code.h
+++ b/paddle/fluid/operators/math/jit_code.h
--- a/paddle/fluid/operators/math/jit_gen.cc
+++ b/paddle/fluid/operators/math/jit_gen.cc
--- a/paddle/fluid/operators/math/jit_kernel.cc
+++ b/paddle/fluid/operators/math/jit_kernel.cc
--- a/paddle/fluid/operators/math/jit_kernel_blas.cc
+++ b/paddle/fluid/operators/math/jit_kernel_blas.cc
--- a/paddle/fluid/operators/math/jit_kernel_crf_decode.cc
+++ b/paddle/fluid/operators/math/jit_kernel_crf_decode.cc
--- a/paddle/fluid/operators/math/jit_kernel_exp.cc
+++ b/paddle/fluid/operators/math/jit_kernel_exp.cc
--- a/paddle/fluid/operators/math/jit_kernel_layer_norm.cc
+++ b/paddle/fluid/operators/math/jit_kernel_layer_norm.cc
--- a/paddle/fluid/operators/math/jit_kernel_macro.h
+++ b/paddle/fluid/operators/math/jit_kernel_macro.h
--- a/paddle/fluid/operators/math/jit_kernel_test.cc
+++ b/paddle/fluid/operators/math/jit_kernel_test.cc
--- a/paddle/fluid/operators/math/math_function.cu
+++ b/paddle/fluid/operators/math/math_function.cu
--- a/paddle/fluid/operators/math/matrix_bit_code.cc
+++ b/paddle/fluid/operators/math/matrix_bit_code.cc
--- a/paddle/fluid/operators/math/matrix_bit_code.h
+++ b/paddle/fluid/operators/math/matrix_bit_code.h
--- a/paddle/fluid/operators/math/prelu.cu
+++ b/paddle/fluid/operators/math/prelu.cu
--- a/paddle/fluid/operators/math/prelu.h
+++ b/paddle/fluid/operators/math/prelu.h
--- a/paddle/fluid/operators/math/sequence2batch.cu
+++ b/paddle/fluid/operators/math/sequence2batch.cu
--- a/paddle/fluid/operators/math/softmax.cu
+++ b/paddle/fluid/operators/math/softmax.cu
--- a/paddle/fluid/operators/math/softmax_impl.h
+++ b/paddle/fluid/operators/math/softmax_impl.h
--- a/paddle/fluid/operators/mean_op.cu
+++ b/paddle/fluid/operators/mean_op.cu
--- a/paddle/fluid/operators/merge_selected_rows_op.cc
+++ b/paddle/fluid/operators/merge_selected_rows_op.cc
--- a/paddle/fluid/operators/merge_selected_rows_op.cu.cc
+++ b/paddle/fluid/operators/merge_selected_rows_op.cu.cc
--- a/paddle/fluid/operators/merge_selected_rows_op.h
+++ b/paddle/fluid/operators/merge_selected_rows_op.h
--- a/paddle/fluid/operators/metrics/auc_op.h
+++ b/paddle/fluid/operators/metrics/auc_op.h
--- a/paddle/fluid/operators/optimizers/adadelta_op.cu
+++ b/paddle/fluid/operators/optimizers/adadelta_op.cu
--- a/paddle/fluid/operators/optimizers/adagrad_op.cu
+++ b/paddle/fluid/operators/optimizers/adagrad_op.cu
--- a/paddle/fluid/operators/optimizers/adam_op.cu
+++ b/paddle/fluid/operators/optimizers/adam_op.cu
--- a/paddle/fluid/operators/optimizers/adamax_op.cu
+++ b/paddle/fluid/operators/optimizers/adamax_op.cu
--- a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cu
+++ b/paddle/fluid/operators/optimizers/decayed_adagrad_op.cu
--- a/paddle/fluid/operators/optimizers/ftrl_op.cu
+++ b/paddle/fluid/operators/optimizers/ftrl_op.cu
--- a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cu
+++ b/paddle/fluid/operators/optimizers/proximal_adagrad_op.cu
--- a/paddle/fluid/operators/optimizers/proximal_gd_op.cu
+++ b/paddle/fluid/operators/optimizers/proximal_gd_op.cu
--- a/paddle/fluid/operators/optimizers/rmsprop_op.cu
+++ b/paddle/fluid/operators/optimizers/rmsprop_op.cu
--- a/paddle/fluid/operators/pad2d_op.cc
+++ b/paddle/fluid/operators/pad2d_op.cc
--- a/paddle/fluid/operators/pad2d_op.cu
+++ b/paddle/fluid/operators/pad2d_op.cu
--- a/paddle/fluid/operators/pad_constant_like_op.cu
+++ b/paddle/fluid/operators/pad_constant_like_op.cu
--- a/paddle/fluid/operators/pad_op.cu
+++ b/paddle/fluid/operators/pad_op.cu
--- a/paddle/fluid/operators/prelu_op.cc
+++ b/paddle/fluid/operators/prelu_op.cc
--- a/paddle/fluid/operators/prelu_op.cu
+++ b/paddle/fluid/operators/prelu_op.cu
--- a/paddle/fluid/operators/psroi_pool_op.cc
+++ b/paddle/fluid/operators/psroi_pool_op.cc
--- a/paddle/fluid/operators/psroi_pool_op.cu
+++ b/paddle/fluid/operators/psroi_pool_op.cu
--- a/paddle/fluid/operators/psroi_pool_op.h
+++ b/paddle/fluid/operators/psroi_pool_op.h
--- a/paddle/fluid/operators/reader/ctr_reader.h
+++ b/paddle/fluid/operators/reader/ctr_reader.h
--- a/paddle/fluid/operators/sequence_ops/sequence_mask_op.h
+++ b/paddle/fluid/operators/sequence_ops/sequence_mask_op.h
--- a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cu
+++ b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cu
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h
--- a/paddle/fluid/operators/smooth_l1_loss_op.cu
+++ b/paddle/fluid/operators/smooth_l1_loss_op.cu
--- a/paddle/fluid/operators/softmax_mkldnn_op.cc
+++ b/paddle/fluid/operators/softmax_mkldnn_op.cc
--- a/paddle/fluid/operators/softmax_op.h
+++ b/paddle/fluid/operators/softmax_op.h
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
--- a/paddle/fluid/operators/split_selected_rows_op.h
+++ b/paddle/fluid/operators/split_selected_rows_op.h
--- a/paddle/fluid/operators/squared_l2_distance_op.cu
+++ b/paddle/fluid/operators/squared_l2_distance_op.cu
--- a/paddle/fluid/operators/squared_l2_norm_op.cu
+++ b/paddle/fluid/operators/squared_l2_norm_op.cu
--- a/paddle/fluid/operators/sum_op.cu
+++ b/paddle/fluid/operators/sum_op.cu
--- a/paddle/fluid/operators/yolov3_loss_op.cc
+++ b/paddle/fluid/operators/yolov3_loss_op.cc
--- a/paddle/fluid/operators/yolov3_loss_op.h
+++ b/paddle/fluid/operators/yolov3_loss_op.h
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
--- a/paddle/fluid/platform/assert.h
+++ b/paddle/fluid/platform/assert.h
--- a/paddle/fluid/platform/cpu_info.cc
+++ b/paddle/fluid/platform/cpu_info.cc
--- a/paddle/fluid/platform/cpu_info.h
+++ b/paddle/fluid/platform/cpu_info.h
--- a/paddle/fluid/platform/cuda_helper_test.cu
+++ b/paddle/fluid/platform/cuda_helper_test.cu
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
--- a/paddle/fluid/platform/device_tracer.cc
+++ b/paddle/fluid/platform/device_tracer.cc
--- a/paddle/fluid/platform/device_tracer.h
+++ b/paddle/fluid/platform/device_tracer.h
--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
--- a/paddle/fluid/platform/dynload/mklml.h
+++ b/paddle/fluid/platform/dynload/mklml.h
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
--- a/paddle/fluid/platform/float16.h
+++ b/paddle/fluid/platform/float16.h
--- a/paddle/fluid/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
--- a/paddle/fluid/platform/gpu_info.h
+++ b/paddle/fluid/platform/gpu_info.h
--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
--- a/paddle/fluid/platform/mkldnn_helper.h
+++ b/paddle/fluid/platform/mkldnn_helper.h
--- a/paddle/fluid/platform/mkldnn_reuse.h
+++ b/paddle/fluid/platform/mkldnn_reuse.h
--- a/paddle/fluid/platform/nccl_helper.h
+++ b/paddle/fluid/platform/nccl_helper.h
--- a/paddle/fluid/platform/stream_callback_manager.cc
+++ b/paddle/fluid/platform/stream_callback_manager.cc
--- a/paddle/fluid/platform/stream_callback_manager.h
+++ b/paddle/fluid/platform/stream_callback_manager.h
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
--- a/paddle/fluid/pybind/async_executor_py.cc
+++ b/paddle/fluid/pybind/async_executor_py.cc
--- a/paddle/fluid/pybind/async_executor_py.h
+++ b/paddle/fluid/pybind/async_executor_py.h
--- a/paddle/fluid/pybind/imperative.cc
+++ b/paddle/fluid/pybind/imperative.cc
--- a/paddle/fluid/pybind/imperative.h
+++ b/paddle/fluid/pybind/imperative.h
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
--- a/paddle/fluid/pybind/tensor_py.h
+++ b/paddle/fluid/pybind/tensor_py.h
--- a/paddle/fluid/string/CMakeLists.txt
+++ b/paddle/fluid/string/CMakeLists.txt
--- a/paddle/fluid/string/split.h
+++ b/paddle/fluid/string/split.h
--- a/paddle/fluid/string/split_test.cc
+++ b/paddle/fluid/string/split_test.cc
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
--- a/python/paddle/dataset/image.py
+++ b/python/paddle/dataset/image.py
--- a/python/paddle/dataset/wmt16.py
+++ b/python/paddle/dataset/wmt16.py
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
--- a/python/paddle/fluid/async_executor.py
+++ b/python/paddle/fluid/async_executor.py
--- a/python/paddle/fluid/average.py
+++ b/python/paddle/fluid/average.py
--- a/python/paddle/fluid/clip.py
+++ b/python/paddle/fluid/clip.py
--- a/python/paddle/fluid/data_feed_desc.py
+++ b/python/paddle/fluid/data_feed_desc.py
--- a/python/paddle/fluid/data_feeder.py
+++ b/python/paddle/fluid/data_feeder.py
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
--- a/python/paddle/fluid/imperative/__init__.py
+++ b/python/paddle/fluid/imperative/__init__.py
--- a/python/paddle/fluid/imperative/base.py
+++ b/python/paddle/fluid/imperative/base.py
--- a/python/paddle/fluid/imperative/layers.py
+++ b/python/paddle/fluid/imperative/layers.py
--- a/python/paddle/fluid/initializer.py
+++ b/python/paddle/fluid/initializer.py
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
--- a/python/paddle/fluid/layer_helper.py
+++ b/python/paddle/fluid/layer_helper.py
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
--- a/python/paddle/fluid/layers/layer_function_generator.py
+++ b/python/paddle/fluid/layers/layer_function_generator.py
--- a/python/paddle/fluid/layers/learning_rate_scheduler.py
+++ b/python/paddle/fluid/layers/learning_rate_scheduler.py
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
--- a/python/paddle/fluid/metrics.py
+++ b/python/paddle/fluid/metrics.py
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
--- a/python/paddle/fluid/param_attr.py
+++ b/python/paddle/fluid/param_attr.py
--- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/CMakeLists.txt
+++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/CMakeLists.txt
--- a/python/paddle/fluid/tests/demo/async_executor.py
+++ b/python/paddle/fluid/tests/demo/async_executor.py
--- a/python/paddle/fluid/tests/test_detection.py
+++ b/python/paddle/fluid/tests/test_detection.py
--- a/python/paddle/fluid/tests/test_gradient_clip.py
+++ b/python/paddle/fluid/tests/test_gradient_clip.py
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
--- a/python/paddle/fluid/tests/unittests/dist_ctr.py
+++ b/python/paddle/fluid/tests/unittests/dist_ctr.py
--- a/python/paddle/fluid/tests/unittests/dist_mnist.py
+++ b/python/paddle/fluid/tests/unittests/dist_mnist.py
--- a/python/paddle/fluid/tests/unittests/dist_save_load.py
+++ b/python/paddle/fluid/tests/unittests/dist_save_load.py
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
--- a/python/paddle/fluid/tests/unittests/test_async_executor.py
+++ b/python/paddle/fluid/tests/unittests/test_async_executor.py
--- a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
--- a/python/paddle/fluid/tests/unittests/test_bpr_loss_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bpr_loss_op.py
--- a/python/paddle/fluid/tests/unittests/test_concat_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_concat_mkldnn_op.py
--- a/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py
--- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_mkldnn_op.py
--- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
--- a/python/paddle/fluid/tests/unittests/test_conv3d_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv3d_mkldnn_op.py
--- a/python/paddle/fluid/tests/unittests/test_conv3d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv3d_op.py
--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist.py
--- a/python/paddle/fluid/tests/unittests/test_dist_save_load.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_save_load.py
--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py
--- a/python/paddle/fluid/tests/unittests/test_get_tensor_from_selected_rows_op.py
+++ b/python/paddle/fluid/tests/unittests/test_get_tensor_from_selected_rows_op.py
--- a/python/paddle/fluid/tests/unittests/test_gradient_clip.py
+++ b/python/paddle/fluid/tests/unittests/test_gradient_clip.py
--- a/python/paddle/fluid/tests/unittests/test_imperative.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative.py
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
--- a/python/paddle/fluid/tests/unittests/test_lookup_remote_table_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lookup_remote_table_op.py
--- a/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py
--- a/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py
--- a/python/paddle/fluid/tests/unittests/test_merge_selectedrows_op.py
+++ b/python/paddle/fluid/tests/unittests/test_merge_selectedrows_op.py
--- a/python/paddle/fluid/tests/unittests/test_pad2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pad2d_op.py
--- a/python/paddle/fluid/tests/unittests/test_psroi_pool_op.py
+++ b/python/paddle/fluid/tests/unittests/test_psroi_pool_op.py
--- a/python/paddle/fluid/tests/unittests/test_regularizer.py
+++ b/python/paddle/fluid/tests/unittests/test_regularizer.py
--- a/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py
--- a/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py
+++ b/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py
--- a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py
+++ b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py
--- a/python/paddle/fluid/tests/unittests/testsuite.py
+++ b/python/paddle/fluid/tests/unittests/testsuite.py
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
--- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
+++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
--- a/python/paddle/reader/tests/decorator_test.py
+++ b/python/paddle/reader/tests/decorator_test.py
--- a/python/setup.py.in
+++ b/python/setup.py.in
--- a/tools/print_signatures.py
+++ b/tools/print_signatures.py