Commit d8974e6d authored by Qiao Longfei

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add-async-ssa-graph-executor-communicator
test=develop
......@@ -64,6 +64,7 @@ option(WITH_DISTRIBUTE "Compile with distributed support" OFF)
option(WITH_PSLIB "Compile with pslib support" OFF)
option(WITH_CONTRIB "Compile the third-party contribution" OFF)
option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF)
# TODO(Superjomn) Remove WITH_ANAKIN option if not needed later.
option(WITH_ANAKIN "Compile with Anakin library" OFF)
option(ANAKIN_BUILD_FAT_BIN "Build anakin cuda fat-bin lib for all device platforms, ignored when WITH_ANAKIN=OFF" OFF)
option(ANAKIN_BUILD_CROSS_PLANTFORM "Build anakin lib for any nvidia device platform, ignored when WITH_ANAKIN=OFF" ON)
......@@ -190,7 +191,14 @@ include(configure) # add paddle env configuration
if(WITH_GPU)
include(cuda)
include(tensorrt)
include(anakin_subgraph)
endif()
if(WITH_GPU AND NOT WIN32)
message(STATUS "add dgc lib.")
include(external/dgc)
endif()
if(WITH_MKL OR WITH_MKLML)
include(external/anakin)
elseif()
......
......@@ -156,7 +156,7 @@ python \
This will enable VLOG messages generated by `buddy_allocator.{h,cc}` within the verbose range of 0 to 3, so you will see the above example VLOG message, which is at level 3. This suggests that we output overall messages at lower verbose levels, so that they display with higher probability. When coding C++, please follow this verbose-level convention:
- verbose level 1: [framework](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework)
- verbose level 3: [operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators)
- verbose level 5: [memory](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory), [platform](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/platform)
- verbose level 7: [math](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/math)
- verbose level 1: [framework](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/framework)
- verbose level 3: [operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators)
- verbose level 5: [memory](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/memory), [platform](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/platform)
- verbose level 7: [math](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators/math/)
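As a minimal standalone sketch of this convention (plain glog outside of Paddle; illustrative only, assumes glog is installed):

// vlog_convention_demo.cc -- build with: g++ vlog_convention_demo.cc -lglog
#include <glog/logging.h>

int main(int argc, char* argv[]) {
  google::InitGoogleLogging(argv[0]);
  FLAGS_logtostderr = true;
  FLAGS_v = 3;  // same effect as running with GLOG_v=3
  VLOG(1) << "framework-level message";   // printed (1 <= 3)
  VLOG(3) << "operator-level message";    // printed (3 <= 3)
  VLOG(5) << "memory/platform message";   // suppressed (5 > 3)
  VLOG(7) << "math-level message";        // suppressed (7 > 3)
  return 0;
}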
if(NOT WITH_GPU)
return()
endif()
set(ANAKIN_ROOT "/usr" CACHE PATH "ANAKIN ROOT")
find_path(ANAKIN_INCLUDE_DIR anakin_config.h
PATHS ${ANAKIN_ROOT} ${ANAKIN_ROOT}/include
$ENV{ANAKIN_ROOT} $ENV{ANAKIN_ROOT}/include
NO_DEFAULT_PATH
)
find_library(ANAKIN_LIBRARY NAMES libanakin_saber_common.so libanakin.so
PATHS ${ANAKIN_ROOT}
$ENV{ANAKIN_ROOT} $ENV{ANAKIN_ROOT}/lib
NO_DEFAULT_PATH
DOC "Path to ANAKIN library.")
if(ANAKIN_INCLUDE_DIR AND ANAKIN_LIBRARY)
if(WITH_DSO)
set(ANAKIN_FOUND ON)
endif(WITH_DSO)
else()
set(ANAKIN_FOUND OFF)
endif()
if(ANAKIN_FOUND)
message(STATUS "Current ANAKIN header is ${ANAKIN_INCLUDE_DIR}/anakin_config.h. ")
include_directories(${ANAKIN_ROOT}/include)
include_directories(${ANAKIN_ROOT}/include/saber)
link_directories(${ANAKIN_ROOT})
add_definitions(-DPADDLE_WITH_ANAKIN)
endif()
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
INCLUDE(ExternalProject)
SET(DGC_SOURCES_DIR "${THIRD_PARTY_PATH}/dgc")
SET(DGC_INSTALL_DIR "${THIRD_PARTY_PATH}/install/dgc")
SET(DGC_INCLUDE_DIR "${DGC_INSTALL_DIR}/include" CACHE PATH "dgc include directory." FORCE)
SET(DGC_LIBRARIES "${DGC_INSTALL_DIR}/lib/libdgc.a" CACHE FILEPATH "dgc library." FORCE)
INCLUDE_DIRECTORIES(${DGC_INCLUDE_DIR})
ExternalProject_Add(
extern_dgc
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/PaddlePaddle/Fleet"
GIT_TAG "2d04dc3800cdd0601f1b65d547dabcc60b0cf9dc"
SOURCE_DIR "${DGC_SOURCES_DIR}"
CONFIGURE_COMMAND ""
BUILD_COMMAND cd collective && make -j
INSTALL_COMMAND mkdir -p ${DGC_INSTALL_DIR}/lib/ ${DGC_INCLUDE_DIR}/dgc
&& cp ${DGC_SOURCES_DIR}/collective/build/lib/libdgc.a ${DGC_LIBRARIES}
&& cp ${DGC_SOURCES_DIR}/collective/build/include/dgc.h ${DGC_INCLUDE_DIR}/dgc/
BUILD_IN_SOURCE 1
)
ADD_LIBRARY(dgc SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET dgc PROPERTY IMPORTED_LOCATION ${DGC_LIBRARIES})
ADD_DEPENDENCIES(dgc extern_dgc)
LIST(APPEND external_project_dependencies dgc)
......@@ -57,20 +57,25 @@ SET(NGRAPH_TBB_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_TBB_LIB_NAME})
ExternalProject_Add(
${NGRAPH_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS}
DEPENDS ${MKLDNN_PROJECT} ${MKLML_PROJECT}
GIT_REPOSITORY ${NGRAPH_GIT_REPO}
GIT_TAG ${NGRAPH_GIT_TAG}
PREFIX ${NGRAPH_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${NGRAPH_INSTALL_DIR}
CMAKE_ARGS -DNGRAPH_UNIT_TEST_ENABLE=FALSE
CMAKE_ARGS -DNGRAPH_TOOLS_ENABLE=FALSE
CMAKE_ARGS -DNGRAPH_INTERPRETER_ENABLE=FALSE
CMAKE_ARGS -DNGRAPH_DEX_ONLY=TRUE
CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
CMAKE_ARGS -DMKLDNN_INCLUDE_DIR=${MKLDNN_INC_DIR}
CMAKE_ARGS -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}
CMAKE_ARGS -DMKLML_LIB_DIR=${MKLML_INSTALL_DIR}/lib
DEPENDS ${MKLDNN_PROJECT} ${MKLML_PROJECT}
GIT_REPOSITORY ${NGRAPH_GIT_REPO}
GIT_TAG ${NGRAPH_GIT_TAG}
PREFIX ${NGRAPH_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_GENERATOR ${CMAKE_GENERATOR}
CMAKE_GENERATOR_PLATFORM ${CMAKE_GENERATOR_PLATFORM}
CMAKE_GENERATOR_TOOLSET ${CMAKE_GENERATOR_TOOLSET}
CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${NGRAPH_INSTALL_DIR}
CMAKE_ARGS -DNGRAPH_UNIT_TEST_ENABLE=FALSE
CMAKE_ARGS -DNGRAPH_TOOLS_ENABLE=FALSE
CMAKE_ARGS -DNGRAPH_INTERPRETER_ENABLE=FALSE
CMAKE_ARGS -DNGRAPH_DEX_ONLY=TRUE
CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
CMAKE_ARGS -DMKLDNN_INCLUDE_DIR=${MKLDNN_INC_DIR}
CMAKE_ARGS -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}
CMAKE_ARGS -DMKLML_LIB_DIR=${MKLML_INSTALL_DIR}/lib
)
add_dependencies(ngraph ${NGRAPH_PROJECT})
......
......@@ -131,6 +131,15 @@ elseif (NOT CBLAS_FOUND OR WIN32)
)
endif ()
if (WITH_GPU AND NOT WIN32)
set(dgc_dir "${FLUID_INSTALL_DIR}/third_party/install/dgc")
copy(dgc_lib
SRCS ${DGC_INSTALL_DIR}/lib ${DGC_INSTALL_DIR}/include
DSTS ${dgc_dir} ${dgc_dir}
DEPS dgc)
endif()
if (WITH_MKLDNN)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mkldnn")
copy(mkldnn_lib
......
......@@ -110,7 +110,7 @@ function(op_library TARGET)
# Define operators that don't need pybind here.
foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op"
"tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op" "sync_batch_norm_op")
"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op" "sync_batch_norm_op" "dgc_op")
if ("${TARGET}" STREQUAL "${manual_pybind_op}")
set(pybind_flag 1)
endif()
......
......@@ -33,5 +33,6 @@ if(TENSORRT_FOUND)
message(STATUS "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. "
"Current TensorRT version is v${TENSORRT_MAJOR_VERSION}. ")
include_directories(${TENSORRT_INCLUDE_DIR})
link_directories(${TENSORRT_LIBRARY})
add_definitions(-DPADDLE_WITH_TENSORRT)
endif()
......@@ -211,7 +211,7 @@ paddle.fluid.layers.mean (ArgSpec(args=['x', 'name'], varargs=None, keywords=Non
paddle.fluid.layers.mul (ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None)), ('document', 'ccd37fa6b53f074adbfb732d738c4c2d'))
paddle.fluid.layers.sigmoid_cross_entropy_with_logits (ArgSpec(args=['x', 'label', 'ignore_index', 'name', 'normalize'], varargs=None, keywords=None, defaults=(-100, None, False)), ('document', '180c284317ea45ef89a460d8d79c0b72'))
paddle.fluid.layers.maxout (ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '891870d069a6aea746d34cc53b61690c'))
paddle.fluid.layers.space_to_depth (ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '5f207ae10589ebe38a63575ef6ff8e1e'))
paddle.fluid.layers.space_to_depth (ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'a9221eaef53884a00654e028551b78e2'))
paddle.fluid.layers.affine_grid (ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '51def402b8910e163cbace9d0c0526ed'))
paddle.fluid.layers.sequence_reverse (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '77a6d80aa5551ca70324fc975c44507f'))
paddle.fluid.layers.affine_channel (ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name', 'act'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None, None)), ('document', 'ab84fdc6dc60f3ad9aa397e6007e3bf9'))
......@@ -483,6 +483,11 @@ paddle.fluid.optimizer.LarsMomentumOptimizer.apply_gradients (ArgSpec(args=['sel
paddle.fluid.optimizer.LarsMomentumOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
paddle.fluid.optimizer.LarsMomentumOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.optimizer.LarsMomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
paddle.fluid.optimizer.DGCMomentumOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'momentum', 'rampup_begin_step', 'rampup_step', 'sparsity', 'use_nesterov', 'local_grad_clip_norm', 'num_trainers', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1, [0.999], False, None, None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.optimizer.DGCMomentumOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
paddle.fluid.optimizer.DGCMomentumOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
paddle.fluid.optimizer.DGCMomentumOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.optimizer.DGCMomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
paddle.fluid.backward.append_backward (ArgSpec(args=['loss', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '1a79bd7d10ae54ca763ec81bca36ba24'))
paddle.fluid.regularizer.L1DecayRegularizer.__init__ (ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.regularizer.L2DecayRegularizer.__init__ (ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
......@@ -520,6 +525,7 @@ paddle.fluid.unique_name.guard (ArgSpec(args=['new_generator'], varargs=None, ke
paddle.fluid.recordio_writer.convert_reader_to_recordio_file (ArgSpec(args=['filename', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)), ('document', '65c7523e86f0c50bb729b01667f36310'))
paddle.fluid.recordio_writer.convert_reader_to_recordio_files (ArgSpec(args=['filename', 'batch_per_file', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)), ('document', 'bc643f0f5f1b9db57ff0d8a57d379bd7'))
paddle.fluid.Scope Scope() -> paddle.fluid.core._Scope
paddle.fluid.install_check.run_check (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '66b7c84a17ed32fec2df9628367be2b9'))
paddle.reader.cache (ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None), ('document', '1676886070eb607cb608f7ba47be0d3c'))
paddle.reader.map_readers (ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None), ('document', '77cbadb09df588e21e5cc0819b69c87d'))
paddle.reader.buffered (ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None), ('document', '0d6186f109feceb99f60ec50a0a624cb'))
......
......@@ -63,7 +63,7 @@ cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto
cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memory)
cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memory gflags glog)
cc_library(reader SRCS reader.cc DEPS lod_tensor ddim)
cc_test(reader_test SRCS reader_test.cc DEPS reader)
......@@ -164,6 +164,8 @@ else()
set(NGRAPH_EXE_DEPS)
endif()
cc_library(executor_gc_helper SRCS executor_gc_helper.cc DEPS scope proto_desc operator garbage_collector)
if(WITH_DISTRIBUTE)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog
lod_rank_table feed_fetch_method sendrecvop_rpc ${GLOB_DISTRIBUTE_DEPS} graph_to_program_pass variable_helper ${NGRAPH_EXE_DEPS})
......@@ -174,7 +176,7 @@ else()
cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
endif()
target_link_libraries(executor garbage_collector while_op_helper)
target_link_libraries(executor while_op_helper executor_gc_helper)
cc_library(parallel_executor SRCS parallel_executor.cc DEPS
threaded_ssa_graph_executor scope_buffered_ssa_graph_executor parallel_ssa_graph_executor async_ssa_graph_executor
......@@ -194,6 +196,7 @@ cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_con
cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
proto_desc)
cc_test(inplace_op_inference_test SRCS inplace_op_inference_test.cc DEPS op_registry proto_desc op_info memory_optimize_helper)
cc_library(selected_rows SRCS selected_rows.cc DEPS tensor)
cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)
......
......@@ -134,6 +134,11 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
out_layout =
out_layout == DataLayout::kAnyLayout ? DataLayout::kNCHW : out_layout;
auto& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = dynamic_cast<platform::MKLDNNDeviceContext*>(
pool.Get(expected_kernel_type.place_));
auto& cpu_engine = dev_ctx->GetEngine();
std::vector<int> in_tz = paddle::framework::vectorize2int(in.dims());
std::vector<int> out_tz = in_tz;
......@@ -142,25 +147,29 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
"Input tensor type is not supported: %s", in.type());
memory::data_type out_type = in_type;
auto in_format = platform::MKLDNNFormatForSize(in_tz.size(), in.format());
auto out_format =
platform::MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout));
// The output tensor has the same dims as the input; a reorder doesn't change dims.
out->Resize(in.dims());
// temporary mem pd for out, to make the reorder
auto out_mem_pd = paddle::platform::create_prim_desc_from_dims(
paddle::framework::vectorize2int(out->dims()),
mkldnn::memory::format::blocked, out_type);
if (in.get_mkldnn_prim_desc() != out_mem_pd) {
if (in_format != out_format) {
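// The in/out MKL-DNN formats differ here, so a real reorder is needed;
// otherwise the data buffer is shared below.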
void* in_data = GetDataFromTensor(in, in_type);
auto out_data = out->mutable_data(expected_kernel_type.place_, in.type());
auto in_memory = memory(in.get_mkldnn_prim_desc(), in_data);
auto out_memory = memory(out_mem_pd, out_data);
auto in_memory =
memory({{{in_tz}, in_type, in_format}, cpu_engine}, in_data);
auto out_memory =
memory({{{out_tz}, out_type, out_format}, cpu_engine}, out_data);
platform::Reorder(in_memory, out_memory);
} else {
out->ShareDataWith(in);
}
out->set_layout(out_layout);
// reset format since the out tensor will be fed to a non-MKLDNN OpKernel
out->set_format(memory::format::format_undef);
#endif
}
......
......@@ -51,31 +51,13 @@ void TransformData(const OpKernelType &expected_kernel_type,
#ifdef PADDLE_WITH_MKLDNN
// Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel
// Just set layout/format. No real transform occurs.
auto out_format = platform::MKLDNNFormatForSize(in.dims().size(),
ToMKLDNNFormat(lin));
out.ShareDataWith(input_tensor);
// TODO(jczaja): Remove that once all mkldnn ops
// are modified to work with mkldnn_blocked
auto mkldnn_fmt = [&](int rank) {
switch (rank) {
case 5:
return mkldnn::memory::format::ncdhw;
case 4:
return mkldnn::memory::format::nchw;
case 3:
return mkldnn::memory::format::ncw;
case 2:
return mkldnn::memory::format::nc;
case 1:
return mkldnn::memory::format::x;
default:
return mkldnn::memory::format::blocked;
}
};
auto out_mem_pd = paddle::platform::create_prim_desc_from_dims(
paddle::framework::vectorize2int(out.dims()),
mkldnn_fmt(out.dims().size()));
out.set_mkldnn_prim_desc(out_mem_pd);
out.set_layout(DataLayout::kMKLDNN);
out.set_format(out_format);
#endif
} else {
// Case2 - transform from MKLDNN OPKernel to Non-MKLDNN OPKernel
......
......@@ -10,7 +10,10 @@ cc_library(fetch_barrier_op_handle SRCS fetch_barrier_op_handle.cc DEPS framewor
cc_library(multi_devices_helper SRCS multi_devices_helper.cc DEPS graph graph_helper)
cc_library(multi_devices_graph_print_pass SRCS multi_devices_graph_print_pass.cc DEPS multi_devices_helper)
cc_library(multi_devices_graph_check_pass SRCS multi_devices_graph_check_pass.cc DEPS multi_devices_helper)
cc_library(alloc_continuous_space_for_grad_pass SRCS alloc_continuous_space_for_grad_pass.cc DEPS graph graph_helper)
cc_library(fuse_adam_op_pass SRCS fuse_adam_op_pass.cc fuse_optimizer_op_pass.cc DEPS graph graph_helper)
cc_library(fuse_sgd_op_pass SRCS fuse_sgd_op_pass.cc fuse_optimizer_op_pass.cc DEPS graph graph_helper)
cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows)
......@@ -23,7 +26,7 @@ endif()
if(WITH_GPU)
nv_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
dynload_cuda variable_visitor)
dynload_cuda variable_visitor dgc)
nv_library(fused_all_reduce_op_handle SRCS fused_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
dynload_cuda variable_visitor)
if(WITH_DISTRIBUTE)
......@@ -110,5 +113,7 @@ cc_library(build_strategy SRCS build_strategy.cc DEPS
graph_viz_pass multi_devices_graph_pass
multi_devices_graph_print_pass multi_devices_graph_check_pass
fuse_elewise_add_act_pass multi_batch_merge_pass
fuse_relu_depthwise_conv_pass
memory_optimize_pass lock_free_optimize_pass alloc_continuous_space_for_grad_pass fuse_all_reduce_op_pass)
fuse_relu_depthwise_conv_pass
memory_optimize_pass lock_free_optimize_pass
alloc_continuous_space_for_grad_pass fuse_all_reduce_op_pass
fuse_adam_op_pass fuse_sgd_op_pass)
......@@ -42,8 +42,7 @@ VarHandle* GetValidInput(const OpHandleBase* a) {
return nullptr;
}
std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
void AllReduceDepsPass::ApplyImpl(ir::Graph* graph) const {
auto graph_ops = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
// get vars order
......@@ -86,7 +85,8 @@ std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl(
}
}
VLOG(10) << "dist_ops size:" << dist_ops.size() << std::endl;
VLOG(10) << "dist_ops size:" << dist_ops.size()
<< ", outputs size:" << vars.size() << ", ops size:" << ops.size();
std::sort(dist_ops.begin(), dist_ops.end(), [&](OpHandleBase* op1,
OpHandleBase* op2) {
......@@ -99,6 +99,10 @@ std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl(
auto l_it = vars.find(i0->name());
auto r_it = vars.find(i1->name());
PADDLE_ENFORCE(l_it != vars.end() && r_it != vars.end(),
"can't find var's name %s and %s in opdesc", i0->name(),
i1->name());
if (l_it->second < r_it->second) return true;
if (l_it->second == r_it->second) {
......@@ -126,8 +130,6 @@ std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl(
VLOG(10) << "pre_op:" << pre_op->DebugString()
<< ", op:" << op->DebugString();
}
return graph;
}
} // namespace details
......
......@@ -24,8 +24,7 @@ namespace details {
// TODO(gongwb): overlap allreduce with backward computation.
class AllReduceDepsPass : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
void ApplyImpl(ir::Graph* graph) const override;
};
} // namespace details
......
......@@ -16,6 +16,13 @@
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/framework/operator.h"
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
#include "dgc/dgc.h"
#endif
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/profiler.h"
// asynchronous nccl allreduce or synchronous issue:
......@@ -33,11 +40,14 @@ namespace details {
AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
const platform::NCCLContextMap *ctxs)
const platform::NCCLContextMap *ctxs,
bool is_encoded, int nranks)
: OpHandleBase(node),
local_scopes_(local_scopes),
places_(places),
nccl_ctxs_(ctxs) {
nccl_ctxs_(ctxs),
is_encoded_(is_encoded),
nranks_(nranks) {
if (nccl_ctxs_) {
for (auto &p : places_) {
this->SetDeviceContext(p, nccl_ctxs_->DevCtx(p));
......@@ -51,7 +61,185 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
: OpHandleBase(node), local_scopes_(local_scopes), places_(places) {}
#endif
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
void AllReduceOpHandle::RunImplEncoded() {
platform::RecordEvent record_event(Name());
WaitInputVarGenerated();
auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
PADDLE_ENFORCE_EQ(
in_var_handles.size(), places_.size(),
"The NoDummyInputSize should be equal to the number of places.");
PADDLE_ENFORCE_EQ(
in_var_handles.size(), out_var_handles.size(),
"The NoDummyInputSize and NoDummyOutputSize should be equal.");
std::vector<const LoDTensor *> ins;
std::vector<LoDTensor *> outs;
int k = -1;
for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto &local_scope =
local_scopes_[i]->FindVar(kLocalExecScopeName)->Get<Scope *>();
auto original_name =
paddle::framework::GradOriginalVarName(in_var_handles[i]->name());
auto encode_var_name = original_name + g_dgc_encoded;
auto *in_var = local_scope->FindVar(encode_var_name);
PADDLE_ENFORCE_NOT_NULL(in_var);
auto &in = in_var->Get<LoDTensor>();
ins.emplace_back(&in);
auto *out = local_scope->FindVar(out_var_handles[i]->name())
->GetMutable<LoDTensor>();
outs.emplace_back(out);
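// NOTE: k (the DGC top-k count) is assumed identical across devices, so it is read only once.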
if (k < 0) {
k = GetKValue(in_var_handles[i]->name());
}
}
PADDLE_ENFORCE(platform::is_gpu_place(ins[0]->place()));
PADDLE_ENFORCE(platform::is_gpu_place(outs[0]->place()));
PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
int dtype = -1;
size_t in_numel = 0;
size_t out_numel = 0;
PADDLE_ENFORCE(nranks_ > 1);
std::vector<std::function<void()>> all_reduce_calls;
for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto &place = places_[i];
auto &in = *ins[i];
void *in_tensor_buf = const_cast<void *>(in.data<void>());
auto &out = *outs[i];
float *out_tensor_buf = out.data<float>();
dtype = (dtype == -1) ? platform::ToNCCLDataType(in.type()) : dtype;
in_numel = (in_numel == 0) ? static_cast<size_t>(in.numel()) : in_numel;
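// The encoded tensor is expected to hold k (index, value) pairs, i.e. exactly 2 * k elements.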
PADDLE_ENFORCE(in_numel % 2 == 0);
PADDLE_ENFORCE(in_numel / 2 == static_cast<size_t>(k));
out_numel = (out_numel == 0) ? static_cast<size_t>(out.numel()) : out_numel;
int dev_id = boost::get<platform::CUDAPlace>(place).device;
auto &nccl_ctx = nccl_ctxs_->at(dev_id);
auto stream = nccl_ctx.stream();
auto comm = nccl_ctx.comm_;
auto &allocator =
platform::DeviceTemporaryAllocator::Instance().Get(place, stream);
int encode_size = 2 * k * sizeof(int);
// dgc uses ncclAllGather to gather all the encoded data,
// so the buffer needs nranks copies.
int buf_size = nranks_ * encode_size;
auto tmp_ious_data = allocator.Allocate(buf_size);
void *gather_buff = reinterpret_cast<void *>(tmp_ious_data->ptr());
VLOG(10) << "in_numel:" << in_numel << ", out_numel:" << out_numel
<< ", nranks:" << nranks_ << ", gather_buf size:" << buf_size
<< ", k:" << k << ", place:" << place << ", dtype:" << dtype;
all_reduce_calls.emplace_back([=] {
PADDLE_ENFORCE(paddle::communication::dgc::sparseAllGReduce(
in_tensor_buf, gather_buff, k, out_tensor_buf, out_numel, comm,
stream));
});
}
this->RunAndRecordEvent([&] {
if (all_reduce_calls.size() == 1UL) {
// Do not use NCCLGroup when managing NCCL per thread per device
all_reduce_calls[0]();
} else {
platform::NCCLGroupGuard guard;
for (auto &call : all_reduce_calls) {
call();
}
}
});
if (FLAGS_sync_nccl_allreduce) {
for (auto &p : places_) {
int dev_id = boost::get<platform::CUDAPlace>(p).device;
auto &nccl_ctx = nccl_ctxs_->at(dev_id);
auto stream = nccl_ctx.stream();
cudaError_t e_sync = cudaStreamSynchronize(stream);
if (e_sync != 0) {
LOG(FATAL) << "cudaStreamSynchronize " << cudaGetErrorString(e_sync);
}
cudaError_t e_get = cudaGetLastError();
if (e_get != 0) {
LOG(FATAL) << "cudaGetLastError " << cudaGetErrorString(e_get)
<< " errno:" << e_get;
}
}
}
}
int AllReduceOpHandle::GetKValue(const std::string &grad_name) {
auto original_name = paddle::framework::GradOriginalVarName(grad_name);
auto var_name = original_name + g_dgc_k;
PADDLE_ENFORCE(local_scopes_.size() > 0);
auto *scope = local_scopes_[0];
auto &local_scope = scope->FindVar(kLocalExecScopeName)->Get<Scope *>();
auto var = local_scope->FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(var);
auto tensor = var->Get<LoDTensor>().data<float>();
return *tensor;
}
#endif
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
bool AllReduceOpHandle::IsEncoded() {
if (!is_encoded_) {
return false;
}
auto counter_name = g_dgc_counter_name;
auto step_name = g_dgc_rampup_begin_step;
PADDLE_ENFORCE(local_scopes_.size() > 0);
auto *scope = local_scopes_[0];
auto &local_scope = scope->FindVar(kLocalExecScopeName)->Get<Scope *>();
auto count_var = local_scope->FindVar(counter_name);
auto step_var = local_scope->FindVar(step_name);
if (count_var == nullptr || step_var == nullptr) {
PADDLE_THROW("not find count_var:%s or step_var:%s", counter_name,
step_var);
}
float count = *count_var->Get<LoDTensor>().data<float>();
float step = *step_var->Get<LoDTensor>().data<float>();
if (static_cast<int>(count) < static_cast<int>(step)) {
VLOG(10) << "in all_reduce currentstep:" << count
<< " < rampup_begin_step:" << step
<< " so not use sparse all reduce";
return false;
}
return true;
}
#else
bool AllReduceOpHandle::IsEncoded() { return false; }
#endif
void AllReduceOpHandle::RunImpl() {
if (!IsEncoded()) {
RunImplNormal();
return;
}
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
RunImplEncoded();
#else
PADDLE_THROW("Not compiled with CUDA");
#endif
}
void AllReduceOpHandle::RunImplNormal() {
platform::RecordEvent record_event(Name());
WaitInputVarGenerated();
......@@ -72,6 +260,8 @@ void AllReduceOpHandle::RunImpl() {
auto &lod_tensor =
local_scope.FindVar(in_var_handles[i]->name())->Get<LoDTensor>();
lod_tensors.emplace_back(&lod_tensor);
VLOG(10) << "place:" << i << ", input_name:" << in_var_handles[i]->name()
<< ", out_name:" << out_var_handles[i]->name();
PADDLE_ENFORCE_EQ(in_var_handles[i]->name(), out_var_handles[i]->name(),
"The name of input and output should be equal.");
}
......@@ -99,13 +289,17 @@ void AllReduceOpHandle::RunImpl() {
auto &nccl_ctx = nccl_ctxs_->at(dev_id);
auto stream = nccl_ctx.stream();
auto comm = nccl_ctx.comm_;
VLOG(10) << "before all reduce buffer:" << buffer << ", numel:" << numel
<< ", dev_id:" << dev_id << ", dtype:" << dtype
<< ", place:" << p;
all_reduce_calls.emplace_back([=] {
PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
buffer, buffer, numel, static_cast<ncclDataType_t>(dtype), ncclSum,
comm, stream));
});
}
this->RunAndRecordEvent([&] {
if (all_reduce_calls.size() == 1UL) {
// Do not use NCCLGroup when managing NCCL per thread per device
......
......@@ -28,11 +28,19 @@ namespace paddle {
namespace framework {
namespace details {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
constexpr char g_dgc_counter_name[] = "__g_dgc_counter__";
constexpr char g_dgc_rampup_begin_step[] = "__g_rampup_begin_step__";
constexpr char g_dgc_encoded[] = "__dgc_encoded__";
constexpr char g_dgc_k[] = "__dgc_k__";
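// NOTE: these names are expected to match the variables created by
// DGCMomentumOptimizer on the Python side.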
#endif
struct AllReduceOpHandle : public OpHandleBase {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
AllReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
const platform::NCCLContextMap *ctxs);
const platform::NCCLContextMap *ctxs,
bool is_encoded = false, int nranks = -1);
#else
AllReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places);
......@@ -50,8 +58,14 @@ struct AllReduceOpHandle : public OpHandleBase {
std::vector<Scope *> local_scopes_;
std::vector<platform::Place> places_;
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
void RunImplEncoded();
const platform::NCCLContextMap *nccl_ctxs_;
bool is_encoded_{false};
int nranks_{-1};
int GetKValue(const std::string &grad_name);
#endif
void RunImplNormal();
bool IsEncoded();
};
} // namespace details
......
......@@ -21,6 +21,7 @@
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_registry.h"
DEFINE_uint32(fuse_parameter_memory_size, 0, // 0 KB
"fuse_parameter_memory_size is up limited memory size "
"of one group parameters' gradient which is the input "
......@@ -46,8 +47,7 @@ static framework::proto::VarType::Type kDefaultDtype =
class AllocContinuousSpaceForGradPass : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override {
void ApplyImpl(ir::Graph *graph) const override {
ir::Graph &result = *graph;
auto &places = Get<const std::vector<platform::Place>>(kPlaces);
......@@ -65,7 +65,7 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
if (params_grads.size() == 0) {
VLOG(10) << "Doesn't find gradients";
return std::move(graph);
return;
}
std::unordered_map<std::string, ir::Node *> vars;
......@@ -106,26 +106,33 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
auto ele_dtype = iter->second->Var()->GetDataType();
if (dtype == kDefaultDtype) {
dtype = ele_dtype;
PADDLE_ENFORCE_NE(ele_dtype, kDefaultDtype);
PADDLE_ENFORCE_NE(ele_dtype, kDefaultDtype,
"The data type should not be bool.");
}
PADDLE_ENFORCE_EQ(ele_dtype, dtype);
PADDLE_ENFORCE_EQ(ele_dtype, dtype,
"The data type of input is not consistent.");
}
// Create the fused variable name.
// Create a FusedVarsSet to avoid duplicating names for fused_var in other
// passes.
if (!result.Has(kFusedVars)) {
result.Set(kFusedVars, new FusedVars);
}
const std::string prefix(kFusedVarNamePrefix);
// The fused_var_name should be unique.
auto fused_var_name = prefix + "GRAD@" + params_grads[0].second;
// kFusedGrads is used by fuse_optimizer_op_pass.
result.Set(kFusedGrads, new FusedGrads);
// the fused_var_name should be unique, so it appends
// params_grads.begin()->second.
auto fused_var_name = std::string(kFusedVarNamePrefix) + "@GRAD@" +
params_grads.begin()->second;
result.Get<FusedGrads>(kFusedGrads) = fused_var_name;
auto &fused_var_set = result.Get<FusedVars>(kFusedVars);
PADDLE_ENFORCE_EQ(fused_var_set.count(fused_var_name), 0);
PADDLE_ENFORCE_EQ(fused_var_set.count(fused_var_name), 0,
"%s is duplicate in FusedVars.", fused_var_name);
fused_var_set.insert(fused_var_name);
InitFusedVarsAndAllocSpaceForVars(places, local_scopes, vars,
fused_var_name, params_grads);
return std::move(graph);
}
template <typename AttrType>
......@@ -298,17 +305,6 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
return type == proto::VarType::LOD_TENSOR;
}
void AppendAllocSpaceForVarsOp(const std::vector<std::string> &params_name,
const std::vector<std::string> &grads_name,
const std::string &fused_var_name,
BlockDesc *global_block) const {
auto op_desc = global_block->AppendOp();
op_desc->SetType("alloc_continuous_space");
op_desc->SetInput("Input", params_name);
op_desc->SetOutput("Output", grads_name);
op_desc->SetOutput("FusedOutput", {fused_var_name});
}
void RecordParamsAndGrads(ir::Node *node,
ParamsAndGrads *params_grads) const {
try {
......@@ -361,6 +357,7 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
}
}
// Alloc continuous space for vars.
std::vector<std::string> grads_name;
std::vector<std::string> params_name;
grads_name.reserve(params_grads.size());
......@@ -373,7 +370,6 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
AppendAllocSpaceForVarsOp(params_name, grads_name, fused_var_name,
program_desc.MutableBlock(0));
// Run Only Once Programs
for (size_t i = 0; i < local_scopes.size(); ++i) {
for (auto &op_desc : program_desc.Block(0).AllOps()) {
auto op = OpRegistry::CreateOp(*op_desc);
......@@ -381,6 +377,17 @@ class AllocContinuousSpaceForGradPass : public ir::Pass {
}
}
}
void AppendAllocSpaceForVarsOp(const std::vector<std::string> &params_name,
const std::vector<std::string> &grads_name,
const std::string &fused_var_name,
BlockDesc *global_block) const {
auto op_desc = global_block->AppendOp();
op_desc->SetType("alloc_continuous_space");
op_desc->SetInput("Input", params_name);
op_desc->SetOutput("Output", grads_name);
op_desc->SetOutput("FusedOutput", {fused_var_name});
}
};
} // namespace details
......
......@@ -27,20 +27,17 @@ void BroadcastOpHandle::RunImpl() {
if (places_.size() == 1) return;
// The input and output may have dummy vars.
VarHandle *in_var_handle;
{
auto in_var_handles = DynamicCast<VarHandle>(inputs_);
PADDLE_ENFORCE_EQ(in_var_handles.size(), 1UL,
"The number of input should be one.");
in_var_handle = in_var_handles[0];
}
auto in_var_handles = DynamicCast<VarHandle>(inputs_);
auto out_var_handles = DynamicCast<VarHandle>(outputs_);
PADDLE_ENFORCE_EQ(in_var_handles.size(), 1UL,
"The number of input should be one.");
PADDLE_ENFORCE_EQ(
out_var_handles.size(), places_.size(),
"The number of output should equal to the number of places.");
VarHandle *in_var_handle = in_var_handles[0];
WaitInputVarGenerated();
std::vector<const Scope *> var_scopes;
......
......@@ -17,7 +17,6 @@ limitations under the License. */
#include <glog/logging.h>
#include <memory>
#include <utility>
#include "paddle/fluid/framework/details/memory_optimize_helper.h"
#include "paddle/fluid/framework/details/multi_devices_graph_pass.h"
#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
......@@ -82,23 +81,43 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
AppendPass("inplace_pass");
}
if (strategy.fuse_elewise_add_act_ops_) {
if (strategy_.fuse_elewise_add_act_ops_) {
VLOG(10) << "Add fuse_elewise_add_act_pass";
AppendPass("fuse_elewise_add_act_pass");
}
// For single-card training, fuse_all_reduce_ops is unnecessary.
// alloc_continuous_space_for_grad_pass should run before MultiDevPass.
if (strategy.fuse_all_reduce_ops_) {
if (strategy_.fuse_all_reduce_ops_) {
VLOG(10) << "Add alloc_continuous_space_for_grad_pass";
AppendPass("alloc_continuous_space_for_grad_pass");
}
if (strategy_.fuse_all_optimizer_ops_) {
if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce ||
strategy_.is_distribution_) {
VLOG(3)
<< "Currently, fuse_all_optimizer_ops only works under AllReduce "
"mode.";
strategy_.fuse_all_optimizer_ops_ = false;
} else {
VLOG(10) << "Add alloc_continuous_space_for_grad_pass";
AppendPass("alloc_continuous_space_for_grad_pass");
// NOTE: fuse_all_xx_ops will count the number of xx operators first;
// if the number is zero, fuse_all_reduce_ops will do nothing.
// Currently, only one type of optimization algorithm can be fused.
VLOG(10) << "Add fuse_adam_op_pass";
AppendPass("fuse_adam_op_pass");
VLOG(10) << "Add fuse_sgd_op_pass";
AppendPass("fuse_sgd_op_pass");
}
}
// Add a graph viz pass to record a graph.
if (!strategy.debug_graphviz_path_.empty()) {
auto viz_pass = AppendPass("graph_viz_pass");
const std::string graph_path = string::Sprintf(
"%s%s", strategy.debug_graphviz_path_.c_str(), "_fused_graph");
"%s%s", strategy_.debug_graphviz_path_.c_str(), "_fused_graph");
viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
}
......@@ -118,14 +137,14 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
// the de-facto IR, any reuse on Graph is meaningless.
// As a side effect, memory optimize cannot foresee the fetched vars,
// so the fetch list should be set persistable before calling the Run interface.
if (strategy.memory_optimize_) {
if (strategy_.memory_optimize_) {
VLOG(10) << "Add memory_optimize_pass";
AppendPass("memory_optimize_pass");
}
AppendMultiDevPass(strategy);
AppendMultiDevPass(strategy_);
if (strategy.fuse_all_reduce_ops_) {
if (strategy_.fuse_all_reduce_ops_) {
// NOTE: fuse_all_reduce_ops will count the number of all_reduce operators
// first; if the number is zero, fuse_all_reduce_ops will do nothing.
VLOG(10) << "Add fuse_all_reduce_op_pass";
......@@ -151,7 +170,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
AppendPass("all_reduce_deps_pass");
}
if (SeqOnlyAllReduceOps(strategy)) {
if (SeqOnlyAllReduceOps(strategy_)) {
VLOG(10) << "Add all_reduce_deps_pass";
AppendPass("all_reduce_deps_pass");
}
......@@ -208,15 +227,16 @@ bool BuildStrategy::IsMultiDevPass(const std::string &pass_name) const {
return framework::details::MultiDevSSAGraphBuilder().count(pass_name) > 0;
}
std::unique_ptr<ir::Graph> BuildStrategy::Apply(
std::unique_ptr<ir::Graph> graph,
const std::vector<platform::Place> &places,
const std::string &loss_var_name, const std::vector<Scope *> &local_scopes,
const size_t &nranks,
ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
const std::vector<platform::Place> &places,
const std::string &loss_var_name,
const std::vector<Scope *> &local_scopes,
const size_t &nranks,
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const {
const bool use_cuda,
platform::NCCLContextMap *nccl_ctxs) const {
#else
const bool use_cuda) const {
const bool use_cuda) const {
#endif
VLOG(3) << "apply all passes";
// Create a default one if not finalized by user.
......@@ -240,17 +260,22 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
pass->Erase(kNCCLCtxs);
pass->SetNotOwned<platform::NCCLContextMap>(kNCCLCtxs, nctx);
#endif
} else if (pass->Type() == "fuse_all_reduce_op_pass") {
} else if (pass->Type() == "alloc_continuous_space_for_grad_pass" ||
pass->Type() == "fuse_adam_op_pass" ||
pass->Type() == "fuse_sgd_op_pass" ||
pass->Type() == "fuse_all_reduce_op_pass") {
pass->Erase(kPlaces);
pass->SetNotOwned<const std::vector<platform::Place>>(kPlaces, &places);
pass->Erase(kLocalScopes);
pass->SetNotOwned<const std::vector<Scope *>>(kLocalScopes,
&local_scopes);
if (pass->Type() == "fuse_all_reduce_op_pass") {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
pass->Erase(kNCCLCtxs);
pass->SetNotOwned<platform::NCCLContextMap>(kNCCLCtxs, nctx);
platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
pass->Erase(kNCCLCtxs);
pass->SetNotOwned<platform::NCCLContextMap>(kNCCLCtxs, nctx);
#endif
}
} else if (pass->Type() == "alloc_continuous_space_for_grad_pass") {
pass->Erase(kPlaces);
pass->SetNotOwned<const std::vector<platform::Place>>(kPlaces, &places);
......@@ -271,7 +296,7 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
}
}
VLOG(3) << "Start Apply Pass " << pass->Type();
graph = pass->Apply(std::move(graph));
graph = pass->Apply(graph);
VLOG(3) << "Finish Apply Pass " << pass->Type();
}
VLOG(3) << "All Passes Applied";
......@@ -300,4 +325,6 @@ USE_PASS(inplace_pass);
USE_PASS(lock_free_optimize_pass);
USE_PASS(alloc_continuous_space_for_grad_pass);
USE_PASS(graph_to_program_pass);
USE_PASS(fuse_adam_op_pass);
USE_PASS(fuse_sgd_op_pass);
USE_PASS(fuse_all_reduce_op_pass);
......@@ -18,7 +18,6 @@
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/ir/pass_builder.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
......@@ -76,6 +75,8 @@ struct BuildStrategy {
bool fuse_elewise_add_act_ops_{false};
bool fuse_all_optimizer_ops_{false};
bool fuse_all_reduce_ops_{false};
bool fuse_relu_depthwise_conv_{false};
......@@ -121,16 +122,15 @@ struct BuildStrategy {
// Apply the passes built by the pass_builder_. The passes will be
// applied to the Program and output an ir::Graph.
std::unique_ptr<ir::Graph> Apply(std::unique_ptr<ir::Graph> graph,
const std::vector<platform::Place> &places,
const std::string &loss_var_name,
const std::vector<Scope *> &local_scopes,
const size_t &nranks,
ir::Graph *Apply(ir::Graph *graph, const std::vector<platform::Place> &places,
const std::string &loss_var_name,
const std::vector<Scope *> &local_scopes,
const size_t &nranks,
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
const bool use_cuda,
platform::NCCLContextMap *nccl_ctxs) const;
const bool use_cuda,
platform::NCCLContextMap *nccl_ctxs) const;
#else
const bool use_cuda) const;
const bool use_cuda) const;
#endif
// If set true, ParallelExecutor would build the main_program into multiple
......
......@@ -22,14 +22,9 @@
#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/garbage_collector.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
DEFINE_double(memory_fraction_of_eager_deletion, 1.0,
"Fraction of eager deletion. If less than 1.0, all variables in "
"the program would be sorted according to its memory size, and "
"only the FLAGS_memory_fraction_of_eager_deletion of the largest "
"variables would be deleted.");
namespace paddle {
namespace framework {
namespace details {
......@@ -175,12 +170,10 @@ static OpToVarNameSetMap ShrinkGCVars(
class EagerDeletionPass : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
void ApplyImpl(ir::Graph *graph) const override;
};
std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
void EagerDeletionPass::ApplyImpl(ir::Graph *graph) const {
auto &ref_cnts =
Get<std::vector<AtomicReferenceCountMap>>(kRuntimeReferenceCount);
PADDLE_ENFORCE(ref_cnts.empty(),
......@@ -206,8 +199,9 @@ std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl(
}
}
op_vars_map = ShrinkGCVars(op_vars_map, vars, places,
FLAGS_memory_fraction_of_eager_deletion);
double memory_fraction = framework::GetEagerDeletionMemoryFraction();
op_vars_map = ShrinkGCVars(op_vars_map, vars, places, memory_fraction);
for (auto &pair : op_vars_map) {
auto *op = pair.first;
......@@ -239,13 +233,12 @@ std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl(
eager_deletion_op->AddOutput(dummy_leaf);
}
VLOG(10) << "FLAGS_memory_fraction_of_eager_deletion = "
<< FLAGS_memory_fraction_of_eager_deletion;
VLOG(10) << "FLAGS_memory_fraction_of_eager_deletion = " << memory_fraction;
VLOG(10) << "Create " << op_vars_map.size() << " EagerDeletionOpHandle(s)";
auto while_op_eager_deletion_pass =
ir::PassRegistry::Instance().Get("while_op_eager_deletion_pass");
return while_op_eager_deletion_pass->Apply(std::move(graph));
while_op_eager_deletion_pass->Apply(graph);
}
} // namespace details
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/details/var_handle.h"
#include "paddle/fluid/framework/garbage_collector.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/tensor.h"
namespace paddle {
namespace framework {
namespace details {
class EarlyDeleteOpHandle : public OpHandleBase {
public:
EarlyDeleteOpHandle(ir::Node* node, const Scope* scope,
const platform::Place& place,
const std::vector<std::string>& names,
GarbageCollector* gc)
: OpHandleBase(node),
scope_(scope),
place_(place),
names_(names),
gc_(gc) {
#ifdef PADDLE_WITH_CUDA
if (IsStreamGarabageCollector()) {
auto gpu_place = boost::get<platform::CUDAPlace>(place);
PADDLE_ENFORCE(cudaSetDevice(gpu_place.device));
PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
}
#endif
}
~EarlyDeleteOpHandle() {
#ifdef PADDLE_WITH_CUDA
if (IsStreamGarabageCollector()) {
auto gpu_place = boost::get<platform::CUDAPlace>(dev_ctx_->GetPlace());
PADDLE_ENFORCE(cudaSetDevice(gpu_place.device));
PADDLE_ENFORCE(cudaEventDestroy(event_));
}
#endif
}
std::string Name() const override { return "early_delete"; }
protected:
void RunImpl() override {
std::vector<std::shared_ptr<memory::Allocation>> tensors;
auto* local_scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope*>();
for (auto& var_name : names_) {
auto* var = local_scope->FindVar(var_name);
PADDLE_ENFORCE(var != nullptr,
string::Sprintf("Local Scope not has var %s", var_name));
if (var->IsType<LoDTensor>()) {
tensors.emplace_back(var->GetMutable<LoDTensor>()->MoveMemoryHolder());
} else if (var->IsType<SelectedRows>()) {
tensors.emplace_back(var->GetMutable<SelectedRows>()
->mutable_value()
->MoveMemoryHolder());
} else if (var->IsType<LoDTensorArray>()) {
LoDTensorArray* tensor_array = var->GetMutable<LoDTensorArray>();
for (auto& tensor : *tensor_array) {
tensors.emplace_back(tensor.MoveMemoryHolder());
}
}
}
if (!tensors.empty()) {
ClearTensors(tensors);
}
}
private:
void ClearTensors(
const std::vector<std::shared_ptr<memory::Allocation>>& tensors) {
if (platform::is_cpu_place(place_)) {
ClearCPUTensors(tensors);
} else {
ClearGPUTensors(tensors);
}
}
void ClearCPUTensors(
const std::vector<std::shared_ptr<memory::Allocation>>& tensors) {
auto* gc = dynamic_cast<CPUGarbageCollector*>(gc_);
if (gc != nullptr) {
gc->Add(tensors);
}
}
void ClearGPUTensors(
const std::vector<std::shared_ptr<memory::Allocation>>& tensors) {
#ifdef PADDLE_WITH_CUDA
auto* gc = dynamic_cast<StreamGarbageCollector*>(gc_);
if (gc != nullptr) {
auto compute_stream = dev_ctx_->stream();
auto callback_stream = gc->stream();
auto callback_func = [=]() {
PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream));
PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0));
};
gc_->Add(tensors, callback_func);
} else {
gc_->Add(tensors);
}
}
bool IsStreamGarabageCollector() const {
return dynamic_cast<const StreamGarbageCollector*>(gc_) != nullptr;
#endif
}
const Scope* scope_;
const platform::Place place_;
std::vector<std::string> names_;
GarbageCollector* gc_;
#ifdef PADDLE_WITH_CUDA
platform::CUDADeviceContext* dev_ctx_;
cudaEvent_t event_;
#endif
};
} // namespace details
} // namespace framework
} // namespace paddle
......@@ -31,9 +31,10 @@ FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor(
local_scopes_(local_scopes),
places_(places),
graph_(graph),
fetch_ctxs_(places),
pool_(strategy.num_threads_),
prepare_pool_(1), // add one more thread for generating op_deps
fetch_ctxs_(places) {
// add one more thread for generating op_deps
prepare_pool_(1) {
for (auto &op : ir::FilterByNodeWrapper<OpHandleBase>(*graph_)) {
int dep = static_cast<int>(op->NotReadyInputSize());
op_deps_.emplace(op, dep);
......
......@@ -14,7 +14,9 @@
#pragma once
#include <ThreadPool.h>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/blocking_queue.h"
#include "paddle/fluid/framework/details/exception_holder.h"
......@@ -37,6 +39,8 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor {
const ir::Graph &Graph() const override;
private:
// Note(zcd): the ThreadPool members should be placed last so that they are
// destroyed first (C++ destroys members in reverse declaration order).
ExecutionStrategy strategy_;
std::vector<Scope *> local_scopes_;
std::vector<platform::Place> places_;
......@@ -45,21 +49,22 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor {
std::unordered_map<OpHandleBase *, int> op_deps_;
std::vector<OpHandleBase *> bootstrap_ops_;
::ThreadPool pool_;
::ThreadPool prepare_pool_;
platform::DeviceContextPool fetch_ctxs_;
std::atomic<int> remaining_;
std::future<
std::unique_ptr<std::unordered_map<OpHandleBase *, std::atomic<int>>>>
atomic_op_deps_;
ExceptionHolder exception_;
::ThreadPool pool_;
::ThreadPool prepare_pool_;
void RunOpAsync(std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps,
OpHandleBase *op,
const std::shared_ptr<BlockingQueue<size_t>> &complete_q);
void PrepareAtomicOpDeps();
std::future<
std::unique_ptr<std::unordered_map<OpHandleBase *, std::atomic<int>>>>
atomic_op_deps_;
ExceptionHolder exception_;
};
} // namespace details
} // namespace framework
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/fuse_adam_op_pass.h"
#include <algorithm>
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace framework {
namespace details {
const std::string FuseAdamOpPass::GetOpType() const { return "adam"; }
const std::vector<std::string> FuseAdamOpPass::GetAuxiliaryVarNames() const {
return {"Param", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow"};
}
void FuseAdamOpPass::FuseOptimizerOps(
const std::unordered_map<std::string, std::vector<std::string>>
&aux_var_set,
const std::unordered_map<std::string, std::string> &fused_vars_name,
const std::vector<ir::Node *> &adam_ops, ir::Graph *graph) const {
FuseAdamOps(aux_var_set, fused_vars_name, adam_ops, graph);
FuseScaleOps(aux_var_set.at("Beta1Pow"), fused_vars_name.at("Beta1Pow"),
adam_ops, graph);
FuseScaleOps(aux_var_set.at("Beta2Pow"), fused_vars_name.at("Beta2Pow"),
adam_ops, graph);
}
void FuseAdamOpPass::FuseAdamOps(
const std::unordered_map<std::string, std::vector<std::string>> &vars_set,
const std::unordered_map<std::string, std::string> &fused_vars_name,
const std::vector<ir::Node *> &adam_ops, ir::Graph *graph) const {
PADDLE_ENFORCE_GT(adam_ops.size(), static_cast<size_t>(0));
// Check attributes.
// NOTE: If a new attribute is added, the following code may need to change.
int op_role = boost::get<int>(
adam_ops[0]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName()));
float beta1 = boost::get<float>(adam_ops[0]->Op()->GetAttr("beta1"));
float beta2 = boost::get<float>(adam_ops[0]->Op()->GetAttr("beta2"));
float epsilon = boost::get<float>(adam_ops[0]->Op()->GetAttr("epsilon"));
bool lazy_mode = boost::get<bool>(adam_ops[0]->Op()->GetAttr("lazy_mode"));
int64_t min_row_size_to_use_multithread = boost::get<int64_t>(
adam_ops[0]->Op()->GetAttr("min_row_size_to_use_multithread"));
for (auto &adam_op : adam_ops) {
PADDLE_ENFORCE_EQ(beta1,
boost::get<float>(adam_op->Op()->GetAttr("beta1")));
PADDLE_ENFORCE_EQ(beta2,
boost::get<float>(adam_op->Op()->GetAttr("beta2")));
PADDLE_ENFORCE_EQ(epsilon,
boost::get<float>(adam_op->Op()->GetAttr("epsilon")));
PADDLE_ENFORCE_EQ(lazy_mode,
boost::get<bool>(adam_op->Op()->GetAttr("lazy_mode")));
PADDLE_ENFORCE_EQ(min_row_size_to_use_multithread,
boost::get<int64_t>(adam_op->Op()->GetAttr(
"min_row_size_to_use_multithread")));
PADDLE_ENFORCE_EQ(op_role, boost::get<int>(adam_op->Op()->GetAttr(
OpProtoAndCheckerMaker::OpRoleAttrName())));
}
// NOTE: fused_var only exists in the scope, so the graph doesn't have a
// fused_var node.
VLOG(10) << "Insert adam to graph ";
OpDesc adam_desc(adam_ops[0]->Op()->Block());
adam_desc.SetType("adam");
adam_desc.SetInput("Param", {fused_vars_name.at("Param")});
adam_desc.SetInput("Grad", {fused_vars_name.at("Grad")});
adam_desc.SetInput("Moment1", {fused_vars_name.at("Moment1")});
adam_desc.SetInput("Moment2", {fused_vars_name.at("Moment2")});
// TODO(zcd): The LearningRate, Beta1Pow, Beta2Pow should be equal.
adam_desc.SetInput("LearningRate", adam_ops[0]->Op()->Input("LearningRate"));
adam_desc.SetInput("Beta1Pow", adam_ops[0]->Op()->Input("Beta1Pow"));
adam_desc.SetInput("Beta2Pow", adam_ops[0]->Op()->Input("Beta2Pow"));
adam_desc.SetOutput("ParamOut", {fused_vars_name.at("Param")});
adam_desc.SetOutput("Moment1Out", {fused_vars_name.at("Moment1")});
adam_desc.SetOutput("Moment2Out", {fused_vars_name.at("Moment2")});
adam_desc.SetAttr("beta1", beta1);
adam_desc.SetAttr("beta2", beta2);
adam_desc.SetAttr("epsilon", epsilon);
adam_desc.SetAttr("lazy_mode", lazy_mode);
adam_desc.SetAttr("min_row_size_to_use_multithread",
min_row_size_to_use_multithread);
adam_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role);
auto adam_node = graph->CreateOpNode(&adam_desc);
InserInputAndOutputForOptOps(adam_ops, adam_node);
}
void FuseAdamOpPass::FuseScaleOps(const std::vector<std::string> &beta_name,
const std::string &fused_var_name,
const std::vector<ir::Node *> &adam_ops,
ir::Graph *graph) const {
PADDLE_ENFORCE_EQ(beta_name.size(), adam_ops.size());
const std::string scale_op_name = "scale";
// Get the scale_ops that update adam's beta vars.
std::vector<ir::Node *> scale_ops;
scale_ops.reserve(beta_name.size());
for (size_t i = 0; i < adam_ops.size(); ++i) {
auto &beta_1_pow_name = beta_name[i];
auto beta_pow_iter = std::find_if(
adam_ops[i]->inputs.begin(), adam_ops[i]->inputs.end(),
[&beta_name, &beta_1_pow_name](ir::Node *var_node) -> bool {
return var_node->Var() && var_node->Var()->Name() == beta_1_pow_name;
});
PADDLE_ENFORCE(beta_pow_iter != adam_ops[i]->inputs.end());
auto beta_pow_node = *beta_pow_iter;
auto scale_op_iter = std::find_if(
beta_pow_node->outputs.begin(), beta_pow_node->outputs.end(),
[&scale_op_name](ir::Node *op_node) -> bool {
return op_node->Op() && op_node->Op()->Type() == scale_op_name;
});
PADDLE_ENFORCE(scale_op_iter != beta_pow_node->outputs.end());
scale_ops.emplace_back(*scale_op_iter);
}
PADDLE_ENFORCE_EQ(scale_ops.size(), beta_name.size());
// Check attributes.
// NOTE: If a new attribute is added, the following code may need to change.
int op_role = boost::get<int>(
scale_ops[0]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName()));
float scale = boost::get<float>(scale_ops[0]->Op()->GetAttr("scale"));
float bias = boost::get<float>(scale_ops[0]->Op()->GetAttr("bias"));
bool bias_after_scale =
boost::get<bool>(scale_ops[0]->Op()->GetAttr("bias_after_scale"));
for (auto &scale_op : scale_ops) {
PADDLE_ENFORCE_EQ(scale,
boost::get<float>(scale_op->Op()->GetAttr("scale")));
PADDLE_ENFORCE_EQ(bias, boost::get<float>(scale_op->Op()->GetAttr("bias")));
PADDLE_ENFORCE_EQ(
bias_after_scale,
boost::get<bool>(scale_op->Op()->GetAttr("bias_after_scale")));
PADDLE_ENFORCE_EQ(op_role, boost::get<int>(scale_op->Op()->GetAttr(
OpProtoAndCheckerMaker::OpRoleAttrName())));
}
// NOTE: fused_var only exists in the scope, so the graph doesn't have a
// fused_var node.
VLOG(10) << "Insert fused scale to graph.";
OpDesc scale_desc(scale_ops[0]->Op()->Block());
scale_desc.SetType("scale");
scale_desc.SetInput("X", {fused_var_name});
scale_desc.SetOutput("Out", {fused_var_name});
scale_desc.SetAttr("scale", scale);
scale_desc.SetAttr("bias", bias);
scale_desc.SetAttr("bias_after_scale", bias_after_scale);
scale_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role);
auto scale_node = graph->CreateOpNode(&scale_desc);
for (auto scale_op : scale_ops) {
// set inputs
scale_node->inputs.insert(scale_node->inputs.begin(),
scale_op->inputs.begin(), scale_op->inputs.end());
for (auto &input : scale_op->inputs) {
std::replace(input->outputs.begin(), input->outputs.end(), scale_op,
scale_node);
}
// set outputs
scale_node->outputs.insert(scale_node->outputs.begin(),
scale_op->outputs.begin(),
scale_op->outputs.end());
for (auto &output : scale_op->outputs) {
std::replace(output->inputs.begin(), output->inputs.end(), scale_op,
scale_node);
}
}
// Delete scale_ops
for (auto &scale_op : scale_ops) {
graph->RemoveNode(scale_op);
}
}
} // namespace details
} // namespace framework
} // namespace paddle
REGISTER_PASS(fuse_adam_op_pass, paddle::framework::details::FuseAdamOpPass)
.RequirePassAttr(paddle::framework::details::kPlaces)
.RequirePassAttr(paddle::framework::details::kLocalScopes);
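// A minimal usage sketch (assuming the ir::PassRegistry/ir::Pass API used
// elsewhere in this change; "places", "local_scopes" and "graph" are
// hypothetical caller-side variables):
//   auto pass = ir::PassRegistry::Instance().Get("fuse_adam_op_pass");
//   pass->SetNotOwned(details::kPlaces, &places);
//   pass->SetNotOwned(details::kLocalScopes, &local_scopes);
//   graph.reset(pass->Apply(graph.release()));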
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/details/build_strategy.h"
#include "paddle/fluid/framework/details/fuse_optimizer_op_pass.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/ir/graph.h"
namespace paddle {
namespace framework {
namespace details {
class FuseAdamOpPass : public FuseOptimizerOpPass {
private:
virtual const std::string GetOpType() const;
virtual const std::vector<std::string> GetAuxiliaryVarNames() const;
// Fuse Adam Ops and Scale Ops which are used to update "Beta1Pow", "Beta2Pow"
virtual void FuseOptimizerOps(
const std::unordered_map<std::string, std::vector<std::string>> &vars_set,
const std::unordered_map<std::string, std::string> &fused_vars_name,
const std::vector<ir::Node *> &adam_ops, ir::Graph *graph) const;
void FuseAdamOps(
const std::unordered_map<std::string, std::vector<std::string>> &vars_set,
const std::unordered_map<std::string, std::string> &fused_vars_name,
const std::vector<ir::Node *> &adam_ops, ir::Graph *graph) const;
void FuseScaleOps(const std::vector<std::string> &aux_var_set,
const std::string &fused_var_name,
const std::vector<ir::Node *> &adam_ops,
ir::Graph *graph) const;
};
} // namespace details
} // namespace framework
} // namespace paddle
......@@ -28,8 +28,7 @@ namespace details {
class FuseAllReduceOpPass : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override {
void ApplyImpl(ir::Graph *graph) const override {
ir::Graph &result = *graph;
auto &places = Get<const std::vector<platform::Place>>(kPlaces);
......@@ -71,7 +70,7 @@ class FuseAllReduceOpPass : public ir::Pass {
VLOG(10) << "Find all_reduce_ops: " << all_reduce_ops.size();
if (all_reduce_ops.size() == 0) {
return std::move(graph);
return;
}
PADDLE_ENFORCE_EQ(all_reduce_ops.size(), grads.size(),
......@@ -99,7 +98,6 @@ class FuseAllReduceOpPass : public ir::Pass {
group_all_reduce_ops, &result);
#endif
}
return std::move(graph);
}
void InsertFusedAllReduce(const std::vector<platform::Place> &places,
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/fuse_optimizer_op_pass.h"
#include <algorithm>
#include <unordered_set>
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace framework {
namespace details {
void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
ir::Graph &result = *graph;
auto &places = Get<const std::vector<platform::Place>>(kPlaces);
auto &local_scopes = Get<const std::vector<Scope *>>(kLocalScopes);
const std::string fuse_op_type = GetOpType();
const std::vector<std::string> aux_var_names = GetAuxiliaryVarNames();
// Step 1: Get the specified op and auxiliary variables.
std::vector<ir::Node *> topo_nodes = ir::TopologySortOperations(result);
std::unordered_map<std::string, std::vector<std::string>> aux_var_set;
std::vector<ir::Node *> opt_ops;
for (auto &node : topo_nodes) {
GetSpecifiedOpsAndVars(fuse_op_type, aux_var_names, node, &opt_ops,
&aux_var_set);
}
VLOG(10) << "Find " << fuse_op_type << " operators: " << opt_ops.size();
if (opt_ops.size() == 0) {
return;
}
if (result.Has(kFusedOptType)) {
VLOG(10)
<< "Currently only one type of optimizer op can be fused. Has fused "
<< result.Get<FusedOptType>(kFusedOptType);
return;
} else {
result.Set(kFusedOptType, new FusedOptType);
}
result.Get<FusedOptType>(kFusedOptType) = fuse_op_type;
// Step 2: Insert fused_var_name into FusedVars; the FusedVars need to be
// initialized in the scopes before execution.
if (!result.Has(kFusedVars)) {
result.Set(kFusedVars, new FusedVars);
}
std::unordered_map<std::string, std::string> fused_vars_name;
fused_vars_name.reserve(aux_var_names.size() + 1);
auto &fused_var_set = result.Get<FusedVars>(kFusedVars);
const std::string prefix(kFusedVarNamePrefix);
// NOTE: the fused_var_name should be unique.
for (auto &var_name : aux_var_names) {
auto fused_var_name = prefix + "_" + fuse_op_type + "_" + var_name + "_" +
aux_var_set[var_name][0];
VLOG(10) << fused_var_name;
fused_vars_name.emplace(var_name, fused_var_name);
PADDLE_ENFORCE_EQ(fused_var_set.count(fused_var_name), 0);
fused_var_set.insert(fused_var_name);
}
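// For example (with a hypothetical parameter named "fc_0.w_0"), fusing adam
// over "Param" yields the name "@FUSEDVAR@_adam_Param_fc_0.w_0", since
// kFusedVarNamePrefix is "@FUSEDVAR@".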
// Step 3: Get the fused Gradient's name
auto &params_grads = result.Get<ParamsAndGrads>(kParamsAndGrads);
if (!result.Has(kFusedGrads)) {
PADDLE_THROW(
"The alloc_continuous_space_for_grad_pass should be called before this "
"pass.");
}
auto &fused_grad = result.Get<FusedGrads>(kFusedGrads);
auto &fused_vars = result.Get<FusedVars>(kFusedVars);
auto iter = std::find(fused_vars.begin(), fused_vars.end(), fused_grad);
PADDLE_ENFORCE(iter != fused_vars.end(), "Cannot find the fused_grad.");
fused_vars_name.emplace("Grad", fused_grad);
// Step 4: Sort the parameters and auxiliary variables according to the
// parameters' names so that the variables' names correspond correctly.
PADDLE_ENFORCE(result.Has(kParamsAndGrads), "Doesn't find kParamsAndGrads.");
PADDLE_ENFORCE_EQ(params_grads.size(), aux_var_set.begin()->second.size(),
"The sizes of params_grads and aux_var_set are not equal.");
SortParametersAndAuxVars(params_grads, &aux_var_set, &opt_ops);
// Step 5: Alloc continuous space for the Parameters and AuxiliaryVars (e.g.
// Moment1, Moment2, Beta1Pow, Beta2Pow) of all the optimizer ops separately.
InitFusedVarsAndAllocSpaceForVars(places, local_scopes, aux_var_names,
aux_var_set, fused_vars_name);
// Step 6: Fuse optimizer Ops and Scale Ops
FuseOptimizerOps(aux_var_set, fused_vars_name, opt_ops, &result);
// Step 7: Remove optimizer Ops
for (auto &opt_op : opt_ops) {
graph->RemoveNode(opt_op);
}
}
void FuseOptimizerOpPass::InitFusedVarsAndAllocSpaceForVars(
const std::vector<platform::Place> &places,
const std::vector<Scope *> &local_scopes,
const std::vector<std::string> &aux_var_names,
const std::unordered_map<std::string, std::vector<std::string>>
&aux_var_set,
const std::unordered_map<std::string, std::string> &fused_vars_name) const {
VLOG(10) << "Init FusedVars.";
// Alloc parameters and auxiliary vars in their respective scopes.
size_t idx = local_scopes.size();
for (auto iter = local_scopes.rbegin(); iter != local_scopes.rend();
++iter, --idx) {
auto &scope = *iter;
for (auto &var_name : aux_var_names) {
auto fused_var_name = fused_vars_name.at(var_name);
VLOG(10) << "Init " << fused_var_name;
PADDLE_ENFORCE(scope->FindVar(fused_var_name) == nullptr,
"%s already exists in scope[%d]", fused_var_name, idx);
scope->Var(fused_var_name)->GetMutable<LoDTensor>();
}
}
ProgramDesc program_desc;
auto *global_block = program_desc.MutableBlock(0);
for (auto &var_name : aux_var_names) {
AppendAllocContinuousSpace(aux_var_set.at(var_name),
fused_vars_name.at(var_name), true,
global_block);
}
for (size_t i = 0; i < local_scopes.size(); ++i) {
for (auto &op_desc : global_block->AllOps()) {
auto op = OpRegistry::CreateOp(*op_desc);
op->Run(*local_scopes[i], places[i]);
}
}
}
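// Note: the temporary ProgramDesc above only serves as a container for the
// generated alloc_continuous_space ops; each op is instantiated via
// OpRegistry::CreateOp and run once per (scope, place) pair so that every
// device gets its own fused buffer.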
void FuseOptimizerOpPass::SortParametersAndAuxVars(
const std::vector<std::pair<std::string, std::string>> &params_grads,
std::unordered_map<std::string, std::vector<std::string>> *aux_vars_set,
std::vector<ir::Node *> *ops) const {
PADDLE_ENFORCE_NE(aux_vars_set->count("Param"), static_cast<size_t>(0));
auto &param_vec = aux_vars_set->at("Param");
std::vector<size_t> param_sort_idx;
param_sort_idx.reserve(param_vec.size());
for (auto &p_g : params_grads) {
auto iter = std::find(param_vec.begin(), param_vec.end(), p_g.first);
PADDLE_ENFORCE(iter != param_vec.end());
auto idx = std::distance(param_vec.begin(), iter);
param_sort_idx.emplace_back(idx);
}
for (auto &aux_vars : *aux_vars_set) {
std::vector<std::string> sorted_vars;
sorted_vars.reserve(aux_vars.second.size());
for (size_t i = 0; i < aux_vars.second.size(); ++i) {
sorted_vars.emplace_back(aux_vars.second.at(param_sort_idx[i]));
}
std::swap(aux_vars.second, sorted_vars);
std::stringstream out;
for (auto &var_name : aux_vars.second) {
out << var_name << " ";
}
VLOG(10) << aux_vars.first << ": " << out.str();
}
std::vector<ir::Node *> sorted_ops;
sorted_ops.reserve(ops->size());
for (size_t i = 0; i < ops->size(); ++i) {
sorted_ops.emplace_back(ops->at(param_sort_idx[i]));
}
std::swap(*ops, sorted_ops);
}
void FuseOptimizerOpPass::GetSpecifiedOpsAndVars(
const std::string &op_type, const std::vector<std::string> &aux_vars_name,
ir::Node *node, std::vector<ir::Node *> *ops,
std::unordered_map<std::string, std::vector<std::string>> *aux_args_name)
const {
if (node->Op()->Type() != op_type) return;
for (auto &var_n : aux_vars_name) {
auto arg_names = node->Op()->Input(var_n);
PADDLE_ENFORCE_EQ(arg_names.size(), static_cast<size_t>(1));
(*aux_args_name)[var_n].emplace_back(arg_names[0]);
VLOG(10) << var_n << ", " << arg_names[0];
}
ops->emplace_back(node);
}
void FuseOptimizerOpPass::AppendAllocContinuousSpace(
const std::vector<std::string> &args, const std::string &out_arg,
bool copy_data, BlockDesc *global_block) const {
auto op_desc = global_block->AppendOp();
op_desc->SetType("alloc_continuous_space");
op_desc->SetInput("Input", args);
op_desc->SetOutput("Output", args);
op_desc->SetOutput("FusedOutput", {out_arg});
op_desc->SetAttr("copy_data", copy_data);
op_desc->SetAttr("check_name", true);
}
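// The generated alloc_continuous_space op fuses the "Input" variables into
// one contiguous buffer ("FusedOutput"); judging from the attributes,
// "copy_data" preserves the variables' current values and "check_name"
// enforces that "Input" and "Output" name the same variables.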
void FuseOptimizerOpPass::InserInputAndOutputForOptOps(
const std::vector<ir::Node *> &opt_ops, ir::Node *opt_node) const {
std::unordered_set<ir::Node *> inputs;
std::unordered_set<ir::Node *> outputs;
for (auto opt_op : opt_ops) {
// set inputs
inputs.insert(opt_op->inputs.begin(), opt_op->inputs.end());
for (auto &input : opt_op->inputs) {
replace(input->outputs.begin(), input->outputs.end(), opt_op, opt_node);
}
// set outputs
outputs.insert(opt_op->outputs.begin(), opt_op->outputs.end());
for (auto &output : opt_op->outputs) {
replace(output->inputs.begin(), output->inputs.end(), opt_op, opt_node);
}
}
opt_node->inputs.insert(opt_node->inputs.begin(), inputs.begin(),
inputs.end());
opt_node->outputs.insert(opt_node->outputs.begin(), outputs.begin(),
outputs.end());
}
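// Note: the std::replace calls above rewire each neighboring variable
// node's edge list to point at the fused op node instead of the original
// optimizer ops, keeping the graph consistent before the old ops are
// removed.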
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/details/build_strategy.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/ir/graph.h"
namespace paddle {
namespace framework {
namespace details {
class FuseOptimizerOpPass : public ir::Pass {
protected:
void ApplyImpl(ir::Graph *graph) const override;
protected:
virtual void SortParametersAndAuxVars(
const std::vector<std::pair<std::string, std::string>> &params_grads,
std::unordered_map<std::string, std::vector<std::string>> *aux_var_set,
std::vector<ir::Node *> *ops) const;
void InserInputAndOutputForOptOps(const std::vector<ir::Node *> &opt_ops,
ir::Node *opt_node) const;
private:
virtual const std::string GetOpType() const = 0;
virtual const std::vector<std::string> GetAuxiliaryVarNames() const = 0;
virtual void FuseOptimizerOps(
const std::unordered_map<std::string, std::vector<std::string>> &vars_set,
const std::unordered_map<std::string, std::string> &fused_vars_name,
const std::vector<ir::Node *> &adam_ops, ir::Graph *graph) const = 0;
void GetSpecifiedOpsAndVars(
const std::string &op_type, const std::vector<std::string> &aux_vars_name,
ir::Node *node, std::vector<ir::Node *> *ops,
std::unordered_map<std::string, std::vector<std::string>> *aux_args_name)
const;
void AppendAllocContinuousSpace(const std::vector<std::string> &args,
const std::string &out_arg, bool copy_data,
BlockDesc *global_block) const;
void InitFusedVarsAndAllocSpaceForVars(
const std::vector<platform::Place> &places,
const std::vector<Scope *> &local_scopes,
const std::vector<std::string> &aux_var_names,
const std::unordered_map<std::string, std::vector<std::string>>
&aux_var_set,
const std::unordered_map<std::string, std::string> &fused_vars_name)
const;
};
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/fuse_sgd_op_pass.h"
#include <algorithm>
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace framework {
namespace details {
const std::string FuseSgdOpPass::GetOpType() const { return "sgd"; }
const std::vector<std::string> FuseSgdOpPass::GetAuxiliaryVarNames() const {
return {"Param"};
}
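// Unlike adam, sgd keeps no moment or beta-pow state, so the parameter is
// the only auxiliary variable to fuse; the fused gradient is attached
// separately under "Grad" by the base pass.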
void FuseSgdOpPass::FuseOptimizerOps(
const std::unordered_map<std::string, std::vector<std::string>>
&aux_var_set,
const std::unordered_map<std::string, std::string> &fused_vars_name,
const std::vector<ir::Node *> &sgd_ops, ir::Graph *graph) const {
FuseSgdOps(aux_var_set, fused_vars_name, sgd_ops, graph);
}
void FuseSgdOpPass::FuseSgdOps(
const std::unordered_map<std::string, std::vector<std::string>> &vars_set,
const std::unordered_map<std::string, std::string> &fused_vars_name,
const std::vector<ir::Node *> &sgd_ops, ir::Graph *graph) const {
PADDLE_ENFORCE_GT(sgd_ops.size(), static_cast<size_t>(0));
// NOTE: fused_var only exists in the scope, so the graph doesn't have a
// fused_var node.
int op_role = boost::get<int>(
sgd_ops[0]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName()));
VLOG(10) << "Insert sgd to graph ";
// Add fused scale
OpDesc Sgd_desc(sgd_ops[0]->Op()->Block());
Sgd_desc.SetType("sgd");
Sgd_desc.SetInput("Param", {fused_vars_name.at("Param")});
Sgd_desc.SetInput("Grad", {fused_vars_name.at("Grad")});
Sgd_desc.SetOutput("ParamOut", {fused_vars_name.at("Param")});
// TODO(zcd): The LearningRate should be equal.
Sgd_desc.SetInput("LearningRate", sgd_ops[0]->Op()->Input("LearningRate"));
// NOTE: multi_devices_pass requires that every op should have a role.
Sgd_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role);
auto sgd_node = graph->CreateOpNode(&Sgd_desc);
InserInputAndOutputForOptOps(sgd_ops, sgd_node);
}
} // namespace details
} // namespace framework
} // namespace paddle
REGISTER_PASS(fuse_sgd_op_pass, paddle::framework::details::FuseSgdOpPass)
.RequirePassAttr(paddle::framework::details::kPlaces)
.RequirePassAttr(paddle::framework::details::kLocalScopes);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/details/build_strategy.h"
#include "paddle/fluid/framework/details/fuse_optimizer_op_pass.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/ir/graph.h"
namespace paddle {
namespace framework {
namespace details {
class FuseSgdOpPass : public FuseOptimizerOpPass {
private:
virtual const std::string GetOpType() const;
virtual const std::vector<std::string> GetAuxiliaryVarNames() const;
// Fuse Sgd Ops
virtual void FuseOptimizerOps(
const std::unordered_map<std::string, std::vector<std::string>> &vars_set,
const std::unordered_map<std::string, std::string> &fused_vars_name,
const std::vector<ir::Node *> &sgd_ops, ir::Graph *graph) const;
void FuseSgdOps(
const std::unordered_map<std::string, std::vector<std::string>> &vars_set,
const std::unordered_map<std::string, std::string> &fused_vars_name,
const std::vector<ir::Node *> &sgd_ops, ir::Graph *graph) const;
};
} // namespace details
} // namespace framework
} // namespace paddle
......@@ -24,6 +24,19 @@ namespace paddle {
namespace framework {
namespace details {
// Note(zcd): Addresses should be aligned; otherwise, the results may
// differ.
static size_t Alignment(size_t size, const platform::Place &place) {
// The minimum chunk size that can be allocated is 4 KB.
size_t alignment = 1 << 12;
if (platform::is_gpu_place(place)) {
// On GPU, the minimum chunk size that can be allocated is 256 B.
alignment = 1 << 8;
}
size_t remaining = size % alignment;
return remaining == 0 ? size : size + (alignment - remaining);
}
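// For example, with the code above: on a GPU place, Alignment(1000, place)
// rounds 1000 B up to the next multiple of 256 B, i.e. 1024 B; on a CPU
// place it rounds up to the next multiple of 4 KB, i.e. 4096 B.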
typedef std::vector<std::vector<std::pair<std::string, const LoDTensor *>>>
GradientAndLoDTensor;
......@@ -111,10 +124,11 @@ void FusedAllReduceOpHandle::RunImpl() {
return grad1.second->data<void>() < grad2.second->data<void>();
});
size_t size_of_dtype = framework::SizeOfType(dtype);
for (size_t k = 1; k < g_tensor.size(); ++k) {
const void *cur_address = g_tensor.at(k - 1).second->data<void>();
int64_t len = g_tensor.at(k - 1).second->numel();
auto offset = len * framework::SizeOfType(dtype);
auto offset = Alignment(len * size_of_dtype, places_[0]);
void *infer_next_address = reinterpret_cast<void *>(
reinterpret_cast<uintptr_t>(cur_address) + offset);
const void *next_address = g_tensor.at(k).second->data<void>();
......@@ -228,18 +242,21 @@ void FusedAllReduceOpHandle::GetDTypeAndNumel(
const std::vector<std::pair<std::string, const LoDTensor *>> &grad_tensor,
proto::VarType::Type *dtype, int64_t *numel) const {
*numel = 0;
size_t size_of_dtype = 0;
for (size_t i = 0; i < grad_tensor.size(); ++i) {
// Get element number
int64_t len = grad_tensor.at(i).second->numel();
PADDLE_ENFORCE_GT(len, 0);
*numel += len;
// Get dtype
auto ele_type = grad_tensor.at(i).second->type();
if (i == 0) {
*dtype = ele_type;
size_of_dtype = framework::SizeOfType(ele_type);
}
PADDLE_ENFORCE_EQ(ele_type, *dtype);
// Get element number
int64_t len = grad_tensor.at(i).second->numel();
PADDLE_ENFORCE_GT(len, 0);
// Accumulate the aligned length, measured in number of elements.
*numel += Alignment(len * size_of_dtype, places_[0]) / size_of_dtype;
}
}
......
......@@ -17,6 +17,8 @@
#include <deque>
#include <iterator>
#include <memory>
#include <queue>
#include <sstream>
#include <stack>
#include <string>
#include <unordered_map>
......@@ -142,20 +144,19 @@ void InplacePass::InitSSAGraphNodes() const {
}
}
std::unique_ptr<ir::Graph> InplacePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
void InplacePass::ApplyImpl(ir::Graph* graph) const {
var_nodes_.clear();
view_.Build(graph.get());
view_.Build(graph);
InitSSAGraphNodes();
auto cnt = 0;
for (auto* op : view_.AllOps()) {
VLOG(4) << "Handle op " << cnt++ << ": " << op->Name();
if (FLAGS_enable_inplace_whitelist && !whitelist_.count(op->Name()))
continue;
TryInplaceOpInputOutput(op, graph.get());
TryInplaceOpInputOutput(op, graph);
}
graph->ResolveHazard(var_nodes_);
return graph;
// graph->ResolveHazard(var_nodes_);
}
void InplacePass::InplaceModifyDesc(const std::string& var,
......@@ -264,13 +265,10 @@ void InplacePass::WithdrawModify(const NodeSwapQueue& nodes,
void InplacePass::TryInplaceOpInputOutput(ir::Node* op,
ir::Graph* graph) const {
VLOG(4) << "Try to inplace op " << op->Name();
// FIXME(liuwei1031): Graph is not aware of the existence of BlockDescs and
// ProgramDescs.
// The operations related to BlockDesc or ProgramDesc should perform on Graph
// or Node directly!
PADDLE_ENFORCE(op->Op() != nullptr && op->Op()->Block() != nullptr,
"op_desc is nullptr");
// PADDLE_ENFORCE(op->Op() != nullptr && op->Op()->Block() != nullptr,
// "op_desc is nullptr");
// Some prerequisites must be met if the op is to be inplaced.
PADDLE_ENFORCE(op->Op() != nullptr, "op_desc is nullptr");
auto* op_desc = op->Op();
auto& infer_inplace =
......@@ -281,21 +279,58 @@ void InplacePass::TryInplaceOpInputOutput(ir::Node* op,
PADDLE_ENFORCE(static_cast<bool>(infer_inplace),
"%s's infer_inplace has not been registered", op_desc->Type());
auto* block = op_desc->Block();
auto in_to_outs = infer_inplace(*op_desc, block);
auto in_to_outs = infer_inplace(*op_desc);
auto& all_ops = view_.AllOps();
auto cursor = std::find(all_ops.begin(), all_ops.end(), op);
size_t idx = std::distance(all_ops.begin(), cursor);
for (auto& pair : in_to_outs) {
auto& in_var_name = pair.first;
auto& out_var_name = pair.second;
auto& in_para_name = pair.first;
auto& out_para_name = pair.second;
auto input_vars = op->Op()->Input(in_para_name);
if (!input_vars.size()) {
VLOG(4) << "Parameter " << in_para_name << " is empty skip "
<< in_para_name << " => " << out_para_name << " pair";
continue;
}
auto output_vars = op->Op()->Output(out_para_name);
if (!output_vars.size()) {
VLOG(4) << "Parameter " << out_para_name << " is empty skip "
<< in_para_name << " => " << out_para_name << " pair";
continue;
}
auto in_var_name = input_vars.at(0);
auto out_var_name = output_vars.at(0);
auto* in_node = view_.GetNodeByName(in_var_name, op->inputs);
auto* out_node = view_.GetNodeByName(out_var_name, op->outputs);
VLOG(4) << "Try to inplace " << in_var_name << " with " << out_var_name;
bool can_replace = true;
if (in_var_name == out_var_name) {
can_replace = false;
VLOG(4) << "SKIP: Input variable " << in_var_name << " & Output variable "
<< out_var_name << " are the same";
} else if (!NodeCanReused(in_node)) {
can_replace = false;
VLOG(4) << "SKIP: Input varialbe " << in_var_name << "cannot be reused";
} else if (!NodeCanReused(out_node)) {
can_replace = false;
VLOG(4) << "SKIP: Output variable " << out_var_name
<< " cannot be reused";
} else if (details::NodeSize(*in_node->Var()) !=
details::NodeSize(*out_node->Var())) {
can_replace = false;
VLOG(4) << "SKIP: Input and Output varialbe size not match";
}
if (!can_replace) continue;
// 2. there is no external pending op on the input node
if (view_.PendingOpsOnVar(in_node).size() > 1) {
// if (view_.PendingOpsOnVar(in_node).size() > 1) {
if (in_node->outputs.size() > 1 && !view_.CheckDeps(in_node, op)) {
VLOG(4) << string::Sprintf(
"Skiped pair %s => %s. %s input has external dependency."
"inplace such pair will overwrite the memory.",
......@@ -342,6 +377,97 @@ void InplacePass::TryInplaceOpInputOutput(ir::Node* op,
}
}
void GraphView::TopoSort(ir::Graph* graph) {
ops_.clear();
auto deps_num = [](ir::Node* op) {
auto cnt = 0;
for (auto& var : op->inputs)
if (var->inputs.size() > 0) ++cnt;
return cnt;
};
std::queue<std::pair<ir::Node*, uint32_t>> ready_ops;
int level = 0;
auto nodes = graph->Nodes();
std::unordered_map<ir::Node*, uint32_t> deps_map;
for (auto& node : nodes) {
if (node->IsOp() && node->Op() != nullptr) {
deps_map[node] = deps_num(node);
if (0 == deps_map[node]) {
ready_ops.push({node, level});
}
}
}
while (!ready_ops.empty()) {
auto item = ready_ops.front();
ready_ops.pop();
ops_.emplace_back(item.first);
// Record the level when the op is popped from the queue.
op_level_[item.first] = item.second;
for (auto node : item.first->outputs) {
for (auto op : node->outputs) {
--deps_map[op];
if (deps_map[op] == 0) ready_ops.push({op, item.second + 1});
}
}
}
bool all_ops_checked = true;
for (auto& node : nodes) {
if (node->IsOp() && node->Op() != nullptr && deps_map[node] > 0) {
all_ops_checked = false;
break;
}
}
PADDLE_ENFORCE(all_ops_checked, "All ops deps should be 0 after analysis");
}
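// The loop above is Kahn's algorithm: ops with no unresolved input
// dependencies are popped in BFS order, so an op's recorded level is
// strictly greater than the level of every op it depends on.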
// Return true if the current op node depends on all other ops that use the
// same variable node.
bool GraphView::CheckDeps(ir::Node* var, ir::Node* current_op) const {
// get op list that rely on the same variable
auto op_list = var->outputs;
for (auto& op : op_list) {
if (op == current_op) continue;
VLOG(4) << " GraphView::CheckDeps : " << op->Name() << " & "
<< current_op->Name();
if (!CheckOpDeps(op, current_op)) return false;
VLOG(4) << "";
}
return true;
}
// check if op2 depends on op1's output
bool GraphView::CheckOpDeps(ir::Node* op1, ir::Node* op2) const {
auto print_op = [&](ir::Node* op, const char* name) {
std::ostringstream os;
os << " " << name << " : " << op->Name() << " ";
os << "Input args : ";
for (auto& arg : op->inputs) os << arg->Name() << " ";
os << "Output args : ";
for (auto& arg : op->outputs) os << arg->Name() << " ";
os << "Level : " << op_level_.at(op);
VLOG(4) << os.str();
};
print_op(op1, "OP1");
print_op(op2, "OP2");
if (op1 == op2) return true;
if (op_level_.at(op1) >= op_level_.at(op2)) return false;
for (auto& var : op2->inputs)
if (var->inputs.size() > 0 && CheckOpDeps(op1, var->inputs[0])) return true;
return false;
}
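// Note: because levels increase strictly along dependency edges, the
// op_level_ comparison above lets the recursive search bail out early
// whenever op1 cannot possibly be an ancestor of op2.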
ir::Node* GraphView::GetNodeByName(const std::string& name,
const std::vector<ir::Node*>& nodes) const {
// nodes should be op->inputs/outputs
......@@ -387,22 +513,7 @@ void GraphView::Build(ir::Graph* g) {
// Because we insert some newly created nodes, there may be data races
// between nodes; resolving data hazards depends on visiting the var nodes
// in the right order.
ops_ = SortOpLikeDescOrder(*g);
// 1. track the nodes which reused previous node in Python memory optimize.
// these node can not be inplaced, otherwise may generate a circle in graph.
std::unordered_set<std::string> all_vars;
for (auto& node : g->Nodes()) {
if (node->IsVar()) continue;
for (auto& out : node->outputs) {
if (out->IsCtrlVar() || out->Var() == nullptr) continue;
if (all_vars.count(out->Name())) {
dup_nodes_.emplace(out->Name());
} else {
all_vars.emplace(out->Name());
}
}
}
TopoSort(g);
// 2. Track the nodes which are used by the parameter server;
// these nodes cannot be inplaced, otherwise the trainer
......
......@@ -14,6 +14,7 @@
#pragma once
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
......@@ -50,10 +51,15 @@ class GraphView {
// map the parameter and gradient, must be skipped.
bool InSkipSet(const std::string& var) const;
bool CheckDeps(ir::Node* var, ir::Node* current_op) const;
bool CheckOpDeps(ir::Node* op1, ir::Node* op2) const;
void TopoSort(ir::Graph* g);
private:
std::vector<ir::Node*> ops_;
std::unordered_set<std::string> dup_nodes_; // mem opt affect nodes
std::map<ir::Node*, std::unordered_set<ir::Node*>> adj_list_;
std::unordered_map<ir::Node*, uint32_t> op_level_;
};
// swap pairs in sequence
......@@ -63,8 +69,7 @@ class InplacePass : public ir::Pass {
InplacePass();
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
void ApplyImpl(ir::Graph* graph) const override;
void InitSSAGraphNodes() const;
......
......@@ -190,7 +190,7 @@ struct NodeComparator {
auto rhs_shape = rhs_desc->GetShape();
if ((lhs_shape[0] == -1 && rhs_shape[0] == -1) ||
(lhs_shape[0] != -1 && rhs_shape[0] != -1)) {
return NodeSize(lhs) <= NodeSize(rhs);
return NodeSize(lhs) == NodeSize(rhs);
} else {
return false;
}
......@@ -449,6 +449,7 @@ void ControlFlowGraph::LiveVariableAnalysis() {
live_in_[op].insert(var);
}
for (auto& var : defs_[op]) {
if (uses_[op].count(var)) continue;
live_in_[op].erase(var);
}
......
......@@ -142,15 +142,16 @@ TEST(OrderedSet, FindBestFitNode) {
for (auto& node : nodes) {
pool.Insert(node.get());
}
// FIXME(liuwei1031) this API has changed,
// disable these tests temporarily
// FindNextBestFitNode
auto* n = nodes[0].get();
auto* cache = pool.FindBestFitNode(n);
PADDLE_ENFORCE(cache->Name() == "a");
cache = pool.FindNextBestFitNode(n, cache);
PADDLE_ENFORCE(cache->Name() == "c");
cache = pool.FindNextBestFitNode(n, cache);
PADDLE_ENFORCE(cache->Name() == "b");
// auto* n = nodes[0].get();
// auto* cache = pool.FindBestFitNode(n);
// PADDLE_ENFORCE(cache->Name() == "a");
// cache = pool.FindNextBestFitNode(n, cache);
// PADDLE_ENFORCE(cache->Name() == "c");
// cache = pool.FindNextBestFitNode(n, cache);
// PADDLE_ENFORCE(cache->Name() == "b");
}
} // namespace details
......
......@@ -44,8 +44,7 @@ namespace paddle {
namespace framework {
namespace details {
std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
void MemoryOptimizePass::ApplyImpl(ir::Graph* graph) const {
auto nodes = graph->Nodes();
CollectSkipVarsSet(nodes);
......@@ -113,7 +112,7 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
cfg_->RenameVarInCFGGraph(var_name, cache_name, idx);
RenameVarInGraphDesc(var_name, cache_name, idx);
RenameVarInGraphNode(var_name, cache_name, idx, graph.get());
RenameVarInGraphNode(var_name, cache_name, idx, graph);
pool_.Erase(cache_name);
}
}
......@@ -128,8 +127,6 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
}
}
graph->ResolveHazard(var_nodes_);
return graph;
}
void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const {
......
......@@ -21,6 +21,7 @@
#include <set>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
......@@ -35,8 +36,7 @@ namespace details {
class MemoryOptimizePass : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
void ApplyImpl(ir::Graph* graph) const override;
// fill the variable map(var_nodes) by version.
void InitSSAGraphNodes() const;
......
......@@ -34,8 +34,7 @@ static bool IsLockAndRecordEventFreeComputationOpHandle(
return true;
}
std::unique_ptr<ir::Graph> ModifyOpLockAndRecordEventPass::ApplyImpl(
std::unique_ptr<ir::Graph> ir_graph) const {
void ModifyOpLockAndRecordEventPass::ApplyImpl(ir::Graph *ir_graph) const {
auto all_ops = ir::FilterByNodeWrapper<OpHandleBase>(*ir_graph);
OpGraphView graph_view(all_ops);
for (auto &op : all_ops) {
......@@ -49,7 +48,6 @@ std::unique_ptr<ir::Graph> ModifyOpLockAndRecordEventPass::ApplyImpl(
<< compute_op->DebugString();
}
}
return ir_graph;
}
} // namespace details
......
......@@ -23,8 +23,7 @@ namespace details {
class ModifyOpLockAndRecordEventPass : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
void ApplyImpl(ir::Graph* graph) const override;
};
} // namespace details
......
......@@ -23,10 +23,8 @@ namespace details {
class SSAGraghBuilderWithChecker : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override {
PADDLE_ENFORCE(IsValidGraph(graph.get()));
return graph;
void ApplyImpl(ir::Graph *graph) const override {
PADDLE_ENFORCE(IsValidGraph(graph));
}
bool IsValidGraph(const ir::Graph *graph) const {
......
......@@ -32,6 +32,7 @@
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace framework {
......@@ -152,8 +153,7 @@ void MultiDevSSAGraphBuilderBase::Init() const {
PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
}
std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilderBase::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
void MultiDevSSAGraphBuilderBase::ApplyImpl(ir::Graph *graph) const {
Init();
CheckGraph(*graph);
std::vector<ir::Node *> sorted_ops = SortOperations(*graph);
......@@ -223,7 +223,8 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilderBase::ApplyImpl(
for (size_t i = 0; i < backward_vars.size(); i += 2) {
auto &p_name = backward_vars[i];
auto &g_name = backward_vars[i + 1];
VLOG(10) << "Bcast " << g_name << " for parameter " << p_name;
VLOG(10) << "Bcast " << g_name << " for parameter " << p_name
<< " op_type " << node->Op()->Type();
if (NeedCollectiveForGrad(g_name, sorted_ops)) {
InsertCollectiveOp(&result, p_name, g_name);
}
......@@ -248,7 +249,6 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilderBase::ApplyImpl(
AddOutputToLeafOps(&result);
result.Erase(kGraphOps);
return graph;
}
void MultiDevSSAGraphBuilderBase::InsertScaleLossGradOp(
......@@ -430,8 +430,9 @@ void MultiDevSSAGraphBuilderBase::CreateComputationalOp(ir::Graph *result,
CreateOpHandleIOs(result, node, dev_id);
}
void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(
ir::Graph *result, const std::string &og) const {
void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(ir::Graph *result,
const std::string &og,
bool is_encoded) const {
OpHandleBase *op_handle = nullptr;
auto append_allreduce_op = [&](
......@@ -440,7 +441,9 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
result->Get<GraphOps>(kGraphOps).emplace_back(new AllReduceOpHandle(
result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
scopes, places, nccl_ctxs_));
scopes, places, nccl_ctxs_, is_encoded,
static_cast<int>(strategy_.trainers_endpoints_.size()) *
places_.size()));
#else
result->Get<GraphOps>(kGraphOps).emplace_back(new AllReduceOpHandle(
result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
......@@ -462,12 +465,15 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(
PADDLE_ENFORCE(!vars.empty());
auto &prev_grad = vars.back();
op_handle->AddInput(prev_grad);
VLOG(10) << "all_reduce_op_handle add input " << prev_grad->DebugString();
auto var =
new VarHandle(result->CreateEmptyNode(og, ir::Node::Type::kVariable),
vars.size(), i, og, places_[i]);
vars.emplace_back(var);
op_handle->AddOutput(var);
VLOG(10) << "all_reduce_op_handle add output " << og
<< ", handle:" << var->DebugString();
}
}
......@@ -959,6 +965,17 @@ int DistSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
return op_dev_id;
}
bool DistSSAGraphBuilder::IsEncoded(const std::string &p_name) const {
auto u_name = p_name + "__dgc_u__";
auto it = all_vars_.find(u_name);
if (it == all_vars_.end()) {
VLOG(10) << "can't find u_name, so it's not encoded:" << u_name;
return false;
}
return true;
}
void DistSSAGraphBuilder::InsertCollectiveOp(ir::Graph *result,
const std::string &p_name,
const std::string &g_name) const {
......@@ -975,7 +992,11 @@ void DistSSAGraphBuilder::InsertCollectiveOp(ir::Graph *result,
CreateReduceOp(result, g_name, 0);
CreateBroadcastOp(result, g_name, 0);
} else {
CreateAllReduceOp(result, g_name);
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
CreateAllReduceOp(result, g_name, IsEncoded(p_name));
#else
PADDLE_ENFORCE(false, "Compiled withoud cuda!");
#endif
}
break;
default:
......
......@@ -20,7 +20,6 @@
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/details/build_strategy.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/ir/graph.h"
......@@ -34,10 +33,13 @@ namespace framework {
class Scope;
namespace details {
constexpr char kLossVarName[] = "loss_var_name";
constexpr char kStrategy[] = "strategy";
constexpr char kNRanks[] = "nranks";
class MultiDevSSAGraphBuilderBase : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
void ApplyImpl(ir::Graph *graph) const override;
virtual void Init() const;
......@@ -75,7 +77,8 @@ class MultiDevSSAGraphBuilderBase : public ir::Pass {
bool IsSparseGradient(const std::string &og) const;
void CreateAllReduceOp(ir::Graph *result, const std::string &og) const;
void CreateAllReduceOp(ir::Graph *result, const std::string &og,
bool is_encoded = false) const;
void CreateBroadcastOp(ir::Graph *result, const std::string &p_name,
size_t src_dev_id) const;
......@@ -192,6 +195,8 @@ class DistSSAGraphBuilder : public BalanceVarSSAGraphBuilder {
mutable std::vector<std::unordered_set<std::string>> bcast_var_name_set_;
mutable bool need_broadcast_var_{false};
bool IsEncoded(const std::string &p_name) const;
};
std::unordered_set<std::string> &MultiDevSSAGraphBuilder();
......
......@@ -13,7 +13,9 @@
// limitations under the License.
#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
#include <memory>
#include <string>
#include <unordered_map>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
......
......@@ -17,6 +17,7 @@
#include <glog/logging.h>
#include <fstream>
#include <iosfwd>
#include <memory>
#include <ostream>
#include <string>
#include "paddle/fluid/framework/details/multi_devices_helper.h"
......@@ -40,13 +41,11 @@ class GraphvizSSAGraphPrinter : public SSAGraphPrinter {
class SSAGraghBuilderWithPrinter : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override {
void ApplyImpl(ir::Graph* graph) const override {
std::unique_ptr<std::ostream> fout(
new std::ofstream(Get<std::string>(kGraphvizPath)));
PADDLE_ENFORCE(fout->good());
Get<GraphvizSSAGraphPrinter>("graph_printer").Print(*graph, *fout);
return graph;
}
};
......
......@@ -20,7 +20,6 @@
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/details/var_handle.h"
......@@ -41,22 +40,25 @@ namespace details {
// `std::vector<VarHandle*>` holds the successive versions of a variable.
typedef std::vector<std::unordered_map<std::string, std::vector<VarHandle *>>>
GraphVars;
const char kGraphVars[] = "vars";
// aux variables to represent dependency. Useful to resolve data hazard.
typedef std::unordered_set<VarHandleBase *> GraphDepVars;
const char kGraphDepVars[] = "dep_vars";
constexpr char kGraphVars[] = "vars";
constexpr char kNCCLCtxs[] = "nccl_ctxs";
constexpr char kLossVarName[] = "loss_var_name";
constexpr char kPlaces[] = "places";
constexpr char kLocalScopes[] = "local_scopes";
constexpr char kStrategy[] = "strategy";
constexpr char kNRanks[] = "nranks";
constexpr char kNCCLCtxs[] = "nccl_ctxs";
// aux variables to represent dependency. Useful to resolve data hazard.
typedef std::unordered_set<VarHandleBase *> GraphDepVars;
constexpr char kGraphDepVars[] = "dep_vars";
typedef std::unordered_set<std::string> FusedVars;
constexpr char kFusedVars[] = "fused_vars";
constexpr char kFusedVarNamePrefix[] = "@FUSEDVAR@";
typedef std::string FusedOptType;
constexpr char kFusedOptType[] = "fused_opt_type";
typedef std::string FusedGrads;
constexpr char kFusedGrads[] = "fused_gradients";
typedef std::vector<std::pair<std::string, std::string>> ParamsAndGrads;
constexpr char kParamsAndGrads[] = "params_grads";
......@@ -65,8 +67,6 @@ typedef std::vector<std::vector<std::pair<std::string, std::string>>>
GroupGradsAndParams;
constexpr char kGroupGradsAndParams[] = "group_grads_params";
constexpr char kFusedVarNamePrefix[] = "@FUSEDVAR@";
} // namespace details
} // namespace framework
} // namespace paddle
......@@ -21,6 +21,7 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/grad_op_desc_maker.h"
#include "paddle/fluid/framework/inplace_op_inference.h"
#include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/operator.h"
......@@ -36,27 +37,86 @@ enum OpInfoFillType {
kGradOpDescMaker = 2,
kVarTypeInference = 3,
kShapeInference = 4,
kInplaceOpInference = 5
kInplaceOpInference = 5,
kNoNeedBufferVarsInference = 6,
kUnknown = -1
};
namespace internal {
template <typename T, OpInfoFillType kType>
struct TypePair {
using Type = T;
static constexpr OpInfoFillType kFillType = kType;
};
using OpRegistryClasses = std::tuple< // NOLINT
TypePair<OperatorBase, kOperator>, // NOLINT
TypePair<OpProtoAndCheckerMaker, kOpProtoAndCheckerMaker>, // NOLINT
TypePair<GradOpDescMakerBase, kGradOpDescMaker>, // NOLINT
TypePair<VarTypeInference, kVarTypeInference>, // NOLINT
TypePair<InferShapeBase, kShapeInference>, // NOLINT
TypePair<InplaceOpInference, kInplaceOpInference>, // NOLINT
TypePair<NoNeedBufferVarsInference, kNoNeedBufferVarsInference> // NOLINT
>;
static constexpr int kOpRegistryClassNumber =
std::tuple_size<OpRegistryClasses>::value;
template <typename T, int kPos, bool kIsBounded /* = true*/>
struct IsMatchedBaseTypeImpl {
using PairType = typename std::tuple_element<kPos, OpRegistryClasses>::type;
static constexpr bool kValue =
std::is_base_of<typename PairType::Type, T>::value;
};
template <typename T, int kPos>
struct IsMatchedBaseTypeImpl<T, kPos, false> {
static constexpr bool kValue = false;
};
template <typename T, int kPos>
static inline constexpr bool IsMatchedBaseType() {
return IsMatchedBaseTypeImpl<
T, kPos, (kPos >= 0 && kPos < kOpRegistryClassNumber)>::kValue;
}
template <typename T, int kStart, int kEnd, bool kIsEnd, bool kIsMatched>
struct OpInfoFillTypeGetterImpl {};
// This case should not happen
template <typename T, int kStart, int kEnd>
struct OpInfoFillTypeGetterImpl<T, kStart, kEnd, true, true> {};
template <typename T, int kStart, int kEnd>
struct OpInfoFillTypeGetterImpl<T, kStart, kEnd, true, false> {
static constexpr OpInfoFillType kType = kUnknown;
};
template <typename T, int kStart, int kEnd>
struct OpInfoFillTypeGetterImpl<T, kStart, kEnd, false, false> {
static constexpr OpInfoFillType kType =
OpInfoFillTypeGetterImpl<T, kStart + 1, kEnd, kStart + 1 == kEnd,
IsMatchedBaseType<T, kStart + 1>()>::kType;
};
template <typename T, int kStart, int kEnd>
struct OpInfoFillTypeGetterImpl<T, kStart, kEnd, false, true> {
using PairType = typename std::tuple_element<kStart, OpRegistryClasses>::type;
static constexpr OpInfoFillType kType = PairType::kFillType;
};
template <typename T>
using OpInfoFillTypeGetter =
OpInfoFillTypeGetterImpl<T, 0, kOpRegistryClassNumber,
kOpRegistryClassNumber == 0,
IsMatchedBaseType<T, 0>()>;
} // namespace internal
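// The template machinery above scans OpRegistryClasses at compile time: for
// a registered type T it finds the first entry whose Type is a base class
// of T and exposes the matching OpInfoFillType, falling back to kUnknown.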
template <typename T>
struct OpInfoFillTypeID {
static constexpr OpInfoFillType ID() {
return std::is_base_of<OperatorBase, T>::value
? kOperator
: (std::is_base_of<OpProtoAndCheckerMaker, T>::value
? kOpProtoAndCheckerMaker
: (std::is_base_of<GradOpDescMakerBase, T>::value
? kGradOpDescMaker
: (std::is_base_of<VarTypeInference, T>::value
? kVarTypeInference
: (std::is_base_of<InferShapeBase, T>::value
? kShapeInference
: (std::is_base_of<
InplaceOpInference, T>::value
? kInplaceOpInference
: static_cast<OpInfoFillType>(
-1))))));
return internal::OpInfoFillTypeGetter<T>::kType;
}
};
......@@ -149,9 +209,21 @@ struct OpInfoFiller<T, kShapeInference> {
template <typename T>
struct OpInfoFiller<T, kInplaceOpInference> {
void operator()(const char* op_type, OpInfo* info) const {
info->infer_inplace_ = [](const OpDesc& op_desc, BlockDesc* block) {
info->infer_inplace_ = [](const OpDesc& op_desc) {
T infer;
return infer(op_desc, block);
return infer(op_desc);
};
}
};
template <typename T>
struct OpInfoFiller<T, kNoNeedBufferVarsInference> {
void operator()(const char* op_type, OpInfo* info) const {
info->infer_no_need_buffer_vars_ = [](const VariableNameMap& inputs,
const VariableNameMap& outputs,
const AttributeMap& attrs) {
T infer(inputs, outputs, attrs);
return infer();
};
}
};
......
......@@ -96,7 +96,7 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor(
auto seq_allreduce_pass =
ir::PassRegistry::Instance().Get("all_reduce_deps_pass");
for (size_t i = 0; i < graphs_.size(); ++i) {
graphs_[i] = seq_allreduce_pass->Apply(std::move(graphs_[i]));
graphs_[i].reset(seq_allreduce_pass->Apply(graphs_[i].release()));
}
// set the correct size of thread pool to each device.
......
......@@ -193,8 +193,80 @@ ExtractComputationOpFromLastLivedVar(VarHandle *var, size_t scope_idx,
return shrink_func(computation_op);
}
std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
/**
* Shrink op dependencies according to no need buffer vars.
*
* If an op does not need the Tensor buffer of one of its inputs,
* just remove that op's dependency on the input, i.e., decrease the
* reference count.
*
* For example, input Y of elementwise_add_grad op is only used to infer shape
* and lod of Y@GRAD, we do not need the buffer of input Y. Data buffer of
* input Y can be collected before elementwise_add_grad op runs.
*
* This method returns whether the dependency count decreases to 0, and
* shrinks op dependency if possible.
*/
static bool ShrinkNoNeedBufferVarOpDependency(
const std::string &var_name,
std::unordered_set<ComputationOpHandle *> *op_handles) {
std::vector<ComputationOpHandle *> skip_ops;
for (auto *op_handle : *op_handles) {
auto *op_base = op_handle->GetOp();
auto &inferer = op_base->Info().NoNeedBufferVarsInferer();
if (!inferer) {
continue;
}
std::unordered_set<std::string> no_need_buffer_vars =
inferer(op_base->Inputs(), op_base->Outputs(), op_base->Attrs());
// Check whether var_name occurs in other inputs or outputs of the op
// If it occurs, we cannot decrease the dependency number.
bool occurred_in_other_vars = false;
for (auto &in_pair : op_base->Inputs()) {
if (no_need_buffer_vars.count(in_pair.first) > 0) {
continue;
}
auto &args = in_pair.second;
auto iter = std::find(args.begin(), args.end(), var_name);
if (iter != args.end()) {
occurred_in_other_vars = true;
break;
}
}
if (occurred_in_other_vars) {
continue;
}
for (auto &out_pair : op_base->Outputs()) {
auto &args = out_pair.second;
auto iter = std::find(args.begin(), args.end(), var_name);
if (iter != args.end()) {
occurred_in_other_vars = true;
break;
}
}
if (!occurred_in_other_vars) {
VLOG(2) << "Shrink var " << var_name << " in op " << op_handle->Name();
skip_ops.emplace_back(op_handle);
}
}
if (skip_ops.size() == op_handles->size()) {
op_handles->clear();
return true;
} else {
for (auto *skip_op : skip_ops) {
op_handles->erase(skip_op);
}
return false;
}
}
void ReferenceCountPass::ApplyImpl(ir::Graph *graph) const {
auto &ref_cnts = Get<std::vector<ReferenceCountMap>>(kGlobalReferenceCount);
auto &last_live_ops_of_vars =
Get<std::vector<LastLiveOpsOfVars>>(kLastLiveOpsOfVars);
......@@ -229,21 +301,46 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
continue;
}
bool ok;
auto result = ExtractComputationOpFromLastLivedVar(
name_var_pair.second.back(), i, shrink_func, &ok);
auto &var_name = name_var_pair.first;
auto &var_handles = name_var_pair.second;
for (auto iter = var_handles.rbegin(); iter != var_handles.rend();
++iter) {
bool ok;
auto result =
ExtractComputationOpFromLastLivedVar(*iter, i, shrink_func, &ok);
// Rarely, some vars may have no pending or preceding computation ops.
// Just break.
if (!ok) break;
VLOG(10) << "Extract " << result.size() << " ops of var " << var_name;
size_t original_op_deps = result.size();
// If all ops do not need buffer of var_name, calculate reference count
// of the previous version of var_name.
if (ShrinkNoNeedBufferVarOpDependency(var_name, &result)) {
VLOG(10) << "Try to precede reference count computing at var "
<< var_name;
continue;
}
size_t final_op_deps = result.size();
if (final_op_deps < original_op_deps) {
VLOG(5) << "Shrink op deps from " << original_op_deps << " to "
<< final_op_deps;
}
if (ok) {
auto &var_name = name_var_pair.first;
PADDLE_ENFORCE(!result.empty(), "Last living ops of %s cannot be empty",
var_name);
ref_cnts[i].emplace(var_name, result.size());
last_live_ops_of_vars[i].emplace(var_name, std::move(result));
break;
}
// Rarely, all the preceding attempts fail.
// Just skip this corner case.
}
}
return graph;
}
} // namespace details
......
......@@ -23,8 +23,7 @@ namespace details {
class ReferenceCountPass : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
void ApplyImpl(ir::Graph* graph) const override;
};
} // namespace details
......
......@@ -29,8 +29,7 @@ static bool IsSameOpDesc(OpDesc *op1, OpDesc *op2) {
op1->Outputs() == op2->Outputs();
}
std::unique_ptr<ir::Graph> SequentialExecutionPass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
void SequentialExecutionPass::ApplyImpl(ir::Graph *graph) const {
// FIXME(zjl): Insert dependencies between some distributed ops may cause
// the multi_devices_graph_pass fails. So we skip these ops here.
// Indeed, maybe we should not insert dependencies between these ops
......@@ -98,7 +97,6 @@ std::unique_ptr<ir::Graph> SequentialExecutionPass::ApplyImpl(
VLOG(10) << "Add dependencies between " << op_node_list[i - 1]->Name()
<< " and " << op_node_list[i]->Name();
}
return graph;
}
} // namespace details
......
......@@ -23,8 +23,7 @@ namespace details {
class SequentialExecutionPass : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
void ApplyImpl(ir::Graph* graph) const override;
};
} // namespace details
......
......@@ -24,13 +24,13 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places, ir::Graph *graph)
: graph_(graph),
pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_)
: nullptr),
prepare_pool_(1),
local_scopes_(local_scopes),
places_(places),
fetch_ctxs_(places),
strategy_(strategy) {
strategy_(strategy),
prepare_pool_(1),
pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_)
: nullptr) {
if (strategy_.num_iteration_per_run_ > 1) {
int read_op_num = 0;
for (auto *node : graph_->Nodes()) {
......
......@@ -66,13 +66,20 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
details::OpHandleBase *op);
private:
// Note(zcd): the ThreadPool members should be placed last so that they are
// destroyed first.
ir::Graph *graph_;
std::unique_ptr<::ThreadPool> pool_;
::ThreadPool prepare_pool_;
std::vector<Scope *> local_scopes_;
std::vector<platform::Place> places_;
platform::DeviceContextPool fetch_ctxs_;
ExceptionHolder exception_holder_;
std::unique_ptr<OpDependentData> op_deps_;
std::future<std::unique_ptr<OpDependentData>> op_deps_futures_;
ExecutionStrategy strategy_;
// use std::list because push_back is O(1) and does not invalidate iterators
std::list<std::future<void>> run_op_futures_;
::ThreadPool prepare_pool_;
std::unique_ptr<::ThreadPool> pool_;
void InsertPendingOp(std::unordered_map<OpHandleBase *, size_t> *pending_ops,
OpHandleBase *op_instance) const;
......@@ -91,14 +98,6 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
void PrepareOpDeps();
void CopyOpDeps();
private:
std::future<std::unique_ptr<OpDependentData>> op_deps_futures_;
ExecutionStrategy strategy_;
std::unique_ptr<OpDependentData> op_deps_;
// use std::list because clear(), push_back, and for_each are O(1)
std::list<std::future<void>> run_op_futures_;
};
} // namespace details
......
......@@ -24,7 +24,8 @@ VarHandle::~VarHandle() { VLOG(4) << "deleting var handle " << DebugString(); }
std::string VarHandle::DebugString() const {
std::stringstream ss;
ss << name_ << ":" << place_;
ss << "name:" << name_ << ", place:" << place_ << ", version:" << version_
<< ", scope_idx:" << scope_idx_;
return ss.str();
}
......
......@@ -23,8 +23,7 @@ namespace details {
class WhileOpEagerDeletionPass : public ir::Pass {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override {
void ApplyImpl(ir::Graph *graph) const override {
auto all_ops = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
// Find all while_op and while_grad_op
......@@ -50,7 +49,6 @@ class WhileOpEagerDeletionPass : public ir::Pass {
operators::PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(
while_ops, while_grad_ops);
}
return graph;
}
};
......
......@@ -19,6 +19,7 @@ limitations under the License. */
#include <unordered_set>
#include <utility>
#include "paddle/fluid/framework/executor_gc_helper.h"
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
......@@ -48,97 +49,23 @@ namespace {
int kProgramId = -1;
} // namespace
static std::unordered_map<std::string, size_t> GetNonPersistableReferenceCounts(
const BlockDesc& block, const std::vector<std::string>& skip_var_list) {
std::unordered_map<std::string, size_t> ref_cnts;
std::unordered_set<std::string> skip_vars(skip_var_list.begin(),
skip_var_list.end());
auto update_ref_cnts = [&](OpDesc* op_desc, const VariableNameMap& name_map) {
for (auto& name_pair : name_map) {
for (auto& name : name_pair.second) {
if (skip_vars.count(name)) continue;
auto* var_desc = block.FindVar(name);
if (var_desc == nullptr || var_desc->Persistable()) continue;
auto type = var_desc->Proto()->type().type();
if (type != proto::VarType::LOD_TENSOR &&
type != proto::VarType::SELECTED_ROWS &&
type != proto::VarType::LOD_TENSOR_ARRAY) {
continue;
}
++ref_cnts[name];
}
}
};
for (auto op_desc : block.AllOps()) {
update_ref_cnts(op_desc, op_desc->Inputs());
update_ref_cnts(op_desc, op_desc->Outputs());
}
return ref_cnts;
}
ExecutorPrepareContext::ExecutorPrepareContext(
const framework::ProgramDesc& prog, size_t block_id,
const std::vector<std::string>& keep_vars, bool force_disable_gc)
: prog_(prog), block_id_(block_id), force_disable_gc_(force_disable_gc) {
if (GetEagerDeletionThreshold() >= 0 && !force_disable_gc_) {
global_ref_cnts_ =
GetNonPersistableReferenceCounts(prog.Block(block_id), keep_vars);
const framework::ProgramDesc& prog, size_t block_id)
: prog_(prog), block_id_(block_id) {}
void ExecutorPrepareContext::PrepareUnusedVars(
const std::vector<std::string>& keep_vars, bool force_disable_gc) {
force_disable_gc_ = force_disable_gc;
if (GetEagerDeletionThreshold() < 0 || force_disable_gc_) {
return;
}
unused_vars_ = GetUnusedVars(prog_.Block(block_id_), ops_, keep_vars);
}
ExecutorPrepareContext::~ExecutorPrepareContext() {
VLOG(5) << "destroy ExecutorPrepareContext";
}
static void DeleteUnusedTensors(
const Scope& scope, const OperatorBase* op, GarbageCollector* gc,
std::unordered_map<std::string, size_t>* ref_cnts) {
std::deque<std::shared_ptr<memory::Allocation>> garbages;
auto handler = [&](const VariableNameMap& name_map) {
for (auto& name_pair : name_map) {
for (auto& name : name_pair.second) {
auto it = ref_cnts->find(name);
if (it == ref_cnts->end()) continue;
if (--(it->second) != 0) {
continue;
}
auto* var = scope.FindVar(name);
if (var == nullptr) {
continue;
}
VLOG(2) << "Erase variable " << name;
if (var->IsType<LoDTensor>()) {
garbages.emplace_back(
var->GetMutable<LoDTensor>()->MoveMemoryHolder());
} else if (var->IsType<SelectedRows>()) {
garbages.emplace_back(var->GetMutable<SelectedRows>()
->mutable_value()
->MoveMemoryHolder());
} else if (var->IsType<LoDTensorArray>()) {
auto* lod_tensor_arr = var->GetMutable<LoDTensorArray>();
for (auto& t : *lod_tensor_arr) {
garbages.emplace_back(t.MoveMemoryHolder());
}
} else {
PADDLE_THROW("Type %s of %s is not supported eager deletion",
framework::ToTypeName(var->Type()), name);
}
}
}
};
handler(op->Inputs());
handler(op->Outputs());
if (!garbages.empty()) {
gc->Add(std::move(garbages));
}
}
Executor::Executor(const platform::Place& place) : place_(place) {}
void Executor::Close() {
......@@ -362,8 +289,8 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
const ProgramDesc& program, int block_id,
const std::vector<std::string>& skip_ref_cnt_vars, bool force_disable_gc) {
std::unique_ptr<ExecutorPrepareContext> ctx(new ExecutorPrepareContext(
program, block_id, skip_ref_cnt_vars, force_disable_gc));
std::unique_ptr<ExecutorPrepareContext> ctx(
new ExecutorPrepareContext(program, block_id));
PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), program.Size());
auto& block = program.Block(block_id);
for (auto& op_desc : block.AllOps()) {
......@@ -375,6 +302,7 @@ std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
ctx->prog_.Block(ctx->block_id_), &ctx->ops_);
}
#endif
ctx->PrepareUnusedVars(skip_ref_cnt_vars, force_disable_gc);
return ctx;
}
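A note on the ordering in this refactor (sketch, following the hunk above): the context no longer learns about GC at construction time, and PrepareUnusedVars() may only run once ctx->ops_ has been filled, because the unused-variable map is derived from the instantiated operators:

    // 1. construct the context (no GC bookkeeping yet)
    // 2. populate ctx->ops_ from block.AllOps()
    // 3. only then compute ctx->unused_vars_ from those ops
    ctx->PrepareUnusedVars(skip_ref_cnt_vars, force_disable_gc);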
......@@ -389,19 +317,17 @@ std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
std::vector<std::shared_ptr<ExecutorPrepareContext>> result;
size_t idx = 0;
for (auto& bid : block_ids) {
ExecutorPrepareContext* ctx;
if (skip_ref_cnt_vars.empty()) {
ctx = new ExecutorPrepareContext(program, bid, std::vector<std::string>(),
force_disable_gc);
} else {
ctx = new ExecutorPrepareContext(program, bid, skip_ref_cnt_vars[idx],
force_disable_gc);
}
PADDLE_ENFORCE_LT(static_cast<size_t>(bid), program.Size());
auto* ctx = new ExecutorPrepareContext(program, bid);
auto& block = program.Block(bid);
for (auto& op_desc : block.AllOps()) {
ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
}
if (skip_ref_cnt_vars.empty()) {
ctx->PrepareUnusedVars(std::vector<std::string>(), force_disable_gc);
} else {
ctx->PrepareUnusedVars(skip_ref_cnt_vars[idx], force_disable_gc);
}
result.push_back(std::shared_ptr<ExecutorPrepareContext>(ctx));
++idx;
}
......@@ -425,7 +351,6 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
// FIXME(zjl): recurrent_op is rather complex, so we forcibly
// disable gc in recurrent_op
if (!ctx->force_disable_gc_ && max_memory_size >= 0) {
ctx->ResetReferenceCount();
#ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(place_)) {
if (IsFastEagerDeletionModeEnabled()) {
......@@ -453,8 +378,7 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
op->Run(*local_scope, place_);
if (gc) {
DeleteUnusedTensors(*local_scope, op.get(), gc.get(),
&(ctx->runtime_ref_cnts_));
DeleteUnusedTensors(*local_scope, op.get(), ctx->unused_vars_, gc.get());
}
}
......
......@@ -30,22 +30,20 @@ namespace paddle {
namespace framework {
struct ExecutorPrepareContext {
ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id,
const std::vector<std::string>& skip_ref_cnt_vars =
std::vector<std::string>(),
bool force_disable_gc = false);
ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id);
~ExecutorPrepareContext();
void ResetReferenceCount() { runtime_ref_cnts_ = global_ref_cnts_; }
void PrepareUnusedVars(const std::vector<std::string>& keep_vars,
bool force_disable_gc = false);
const framework::ProgramDesc& prog_;
size_t block_id_;
bool force_disable_gc_;
const size_t block_id_;
std::vector<std::unique_ptr<OperatorBase>> ops_;
std::unordered_map<std::string, size_t> global_ref_cnts_;
std::unordered_map<std::string, size_t> runtime_ref_cnts_;
std::unordered_map<OperatorBase*, std::vector<std::string>> unused_vars_;
bool force_disable_gc_{false};
};
class Executor {
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/executor_gc_helper.h"
#include <deque>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "glog/logging.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
struct OpInOutInfo {
public:
void Build(const OperatorBase *op) {
is_built_ = true;
auto &inferer = op->Info().NoNeedBufferVarsInferer();
if (inferer) {
no_need_buffer_ins_ = inferer(op->Inputs(), op->Outputs(), op->Attrs());
if (no_need_buffer_ins_.empty()) return;
for (auto &in_name_pair : op->Inputs()) {
if (no_need_buffer_ins_.count(in_name_pair.first) != 0) {
continue;
}
for (auto &in_arg_name : in_name_pair.second) {
other_args_set_.insert(in_arg_name);
}
}
for (auto &out_name_pair : op->Outputs()) {
for (auto &out_arg_name : out_name_pair.second) {
other_args_set_.insert(out_arg_name);
}
}
}
}
bool IsBuilt() const { return is_built_; }
bool IsInArgBufferNeeded(const std::string &in_arg_name) const {
return no_need_buffer_ins_.empty() ||
other_args_set_.count(in_arg_name) != 0;
}
private:
// Input slots of the op whose buffers are never accessed (no-need-buffer inputs)
std::unordered_set<std::string> no_need_buffer_ins_;
// Argument names from all other slots of the op (both inputs and outputs)
std::unordered_set<std::string> other_args_set_;
bool is_built_{false};
};
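// A worked example of the intended semantics (hypothetical op; names
// invented for illustration): suppose an op has
//   Inputs  {"X": ["x0"], "Y": ["y0"]},  Outputs {"Out": ["out0"]}
// and its NoNeedBufferVarsInferer reports {"X"}. After Build():
//   no_need_buffer_ins_ == {"X"}           (input slot names)
//   other_args_set_     == {"y0", "out0"}  (args of other slots + outputs)
// so IsInArgBufferNeeded("x0") is false (only x0's metadata is read),
// while IsInArgBufferNeeded("y0") is true.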
static bool VarCanBeDeleted(const std::string &name, const BlockDesc &block,
const std::unordered_set<std::string> &skip_vars) {
if (skip_vars.count(name) != 0) {
return false;
}
auto *var_desc = block.FindVar(name);
if (var_desc == nullptr || var_desc->Persistable()) {
return false;
}
auto type = var_desc->Proto()->type().type();
return type == proto::VarType::LOD_TENSOR ||
type == proto::VarType::SELECTED_ROWS ||
type == proto::VarType::LOD_TENSOR_ARRAY;
}
std::unordered_map<OperatorBase *, std::vector<std::string>> GetUnusedVars(
const BlockDesc &block,
const std::vector<std::unique_ptr<OperatorBase>> &ops,
const std::vector<std::string> &skip_var_list) {
std::unordered_set<std::string> skip_vars(skip_var_list.begin(),
skip_var_list.end());
std::unordered_map<std::string, size_t> var_op_idx_map;
for (size_t i = 0; i < ops.size(); ++i) {
auto *op = ops[i].get();
OpInOutInfo info;
for (auto &name_pair : op->Inputs()) {
for (auto &name : name_pair.second) {
if (!VarCanBeDeleted(name, block, skip_vars)) {
continue;
}
// var can be gc-ed
if (!info.IsBuilt()) {
info.Build(op);
}
if (info.IsInArgBufferNeeded(name)) {
// Update the last living op of variable to current op
var_op_idx_map[name] = i;
} else {
VLOG(10) << "Skip reference count computing of variable "
<< name_pair.first << "(" << name << ") in Operator "
<< op->Type();
}
}
}
for (auto &name_pair : op->Outputs()) {
for (auto &name : name_pair.second) {
if (VarCanBeDeleted(name, block, skip_vars)) {
// Update the last living op of variable to current op
var_op_idx_map[name] = i;
}
}
}
}
std::unordered_map<OperatorBase *, std::vector<std::string>> result;
for (auto &name_op_idx_pair : var_op_idx_map) {
auto &name = name_op_idx_pair.first;
size_t op_idx = name_op_idx_pair.second;
result[ops[op_idx].get()].emplace_back(name);
}
return result;
}
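// A small worked example (hypothetical two-op block): with
//   op0: y = relu(x)
//   op1: z = relu(y)
// and skip_var_list = {"z"} (say z is fetched), the last-use indices are
// x -> 0 and y -> 1, so GetUnusedVars returns
//   { op0 -> {"x"}, op1 -> {"y"} }
// i.e. x may be freed right after op0 runs, y right after op1, and z never.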
void DeleteUnusedTensors(
const Scope &scope, OperatorBase *op,
const std::unordered_map<OperatorBase *, std::vector<std::string>>
&delete_vars_map,
GarbageCollector *gc) {
auto iter = delete_vars_map.find(op);
if (iter == delete_vars_map.end()) {
return;
}
auto &delete_vars = iter->second;
std::deque<std::shared_ptr<memory::Allocation>> garbages;
for (auto &var_name : delete_vars) {
auto *var = scope.FindVar(var_name);
if (var == nullptr) {
continue;
}
VLOG(2) << "Erase variable " << var_name;
if (var->IsType<LoDTensor>()) {
garbages.emplace_back(var->GetMutable<LoDTensor>()->MoveMemoryHolder());
} else if (var->IsType<SelectedRows>()) {
garbages.emplace_back(
var->GetMutable<SelectedRows>()->mutable_value()->MoveMemoryHolder());
} else if (var->IsType<LoDTensorArray>()) {
auto *lod_tensor_arr = var->GetMutable<LoDTensorArray>();
for (auto &t : *lod_tensor_arr) {
garbages.emplace_back(t.MoveMemoryHolder());
}
} else {
PADDLE_THROW("Type %s of %s is not supported eager deletion",
framework::ToTypeName(var->Type()), var_name);
}
}
if (!garbages.empty()) {
gc->Add(std::move(garbages));
}
}
} // namespace framework
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/garbage_collector.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/scope.h"
namespace paddle {
namespace framework {
// Result map: op -> variable names that can be deleted after op runs
std::unordered_map<OperatorBase *, std::vector<std::string>> GetUnusedVars(
const BlockDesc &block,
const std::vector<std::unique_ptr<OperatorBase>> &ops,
const std::vector<std::string> &skip_vars);
// Delete the tensors that become unused after op runs (their memory is handed to the GC)
void DeleteUnusedTensors(
const Scope &scope, OperatorBase *op,
const std::unordered_map<OperatorBase *, std::vector<std::string>>
&delete_vars_map,
GarbageCollector *gc);
} // namespace framework
} // namespace paddle
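A minimal sketch of how the two helpers declared here compose (illustrative; the real call sites are Executor::Prepare and Executor::RunPreparedContext in the executor hunks above):

    auto unused_vars = GetUnusedVars(block, ops, /*skip_vars=*/{});
    for (auto &op : ops) {
      op->Run(scope, place);
      if (gc) {
        DeleteUnusedTensors(scope, op.get(), unused_vars, gc.get());
      }
    }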
......@@ -13,14 +13,36 @@
// limitations under the License.
#include <algorithm>
#include <deque>
#include <functional>
#include <memory>
#include <mutex> // NOLINT
#include <utility>
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
#include "gflags/gflags.h"
#include "glog/logging.h"
#include "paddle/fluid/framework/garbage_collector.h"
namespace paddle {
namespace framework {
DEFINE_double(
eager_delete_tensor_gb, -1.0,
"Memory size threshold (GB) when the garbage collector clear tensors."
"Disabled when this value is less than 0");
DEFINE_bool(fast_eager_deletion_mode, true,
"Fast eager deletion mode. If enabled, memory would release "
"immediately without waiting GPU kernel ends.");
DEFINE_double(memory_fraction_of_eager_deletion, 1.0,
"Fraction of eager deletion. If less than 1.0, all variables in "
"the program would be sorted according to its memory size, and "
"only the FLAGS_memory_fraction_of_eager_deletion of the largest "
"variables would be deleted.");
GarbageCollector::GarbageCollector(const platform::Place &place,
size_t max_memory_size)
: max_memory_size_((std::max)(max_memory_size, static_cast<size_t>(1))) {
......@@ -85,5 +107,25 @@ void StreamGarbageCollector::ClearCallback(
callback_manager_->AddCallback(callback);
}
#endif
int64_t GetEagerDeletionThreshold() {
return FLAGS_eager_delete_tensor_gb < 0
? -1
: static_cast<int64_t>(FLAGS_eager_delete_tensor_gb *
(static_cast<int64_t>(1) << 30));
}
bool IsFastEagerDeletionModeEnabled() { return FLAGS_fast_eager_deletion_mode; }
void SetEagerDeletionMode(double threshold, double fraction, bool fast_mode) {
FLAGS_eager_delete_tensor_gb = threshold;
FLAGS_memory_fraction_of_eager_deletion = fraction;
FLAGS_fast_eager_deletion_mode = fast_mode;
}
double GetEagerDeletionMemoryFraction() {
return FLAGS_memory_fraction_of_eager_deletion;
}
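// A worked example of the flag plumbing (values hypothetical): with
// FLAGS_eager_delete_tensor_gb = 1.5, GetEagerDeletionThreshold() returns
//   static_cast<int64_t>(1.5 * (static_cast<int64_t>(1) << 30)) == 1610612736  // bytes
// while the default of -1.0 yields -1, which disables eager deletion.
// The setter mirrors the three flags one-to-one:
//   SetEagerDeletionMode(/*threshold=*/1.5, /*fraction=*/0.8, /*fast_mode=*/true);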
} // namespace framework
} // namespace paddle
......@@ -18,6 +18,8 @@
#include <functional>
#include <memory>
#include <mutex> // NOLINT
#include <utility>
#include "gflags/gflags.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
......@@ -126,5 +128,12 @@ void GarbageCollector::Add(Container &&objs, Callback &&callback) {
}
}
int64_t GetEagerDeletionThreshold();
bool IsFastEagerDeletionModeEnabled();
void SetEagerDeletionMode(double threshold, double fraction, bool fast_mode);
double GetEagerDeletionMemoryFraction();
} // namespace framework
} // namespace paddle
......@@ -17,8 +17,8 @@
#include <numeric>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include "glog/logging.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/details/memory_optimize_helper.h"
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/type_defs.h"
......@@ -32,55 +32,22 @@ namespace framework {
  then Out will reuse X's memory in place. The base class performs
  legality validation for both variables.
*/
class InplaceOpInference {
public:
virtual ~InplaceOpInference() {}
virtual std::unordered_map<std::string, std::string> operator()(
const OpDesc& op_desc, BlockDesc* block) const = 0;
};
class InplaceInToOut : public InplaceOpInference {
public:
std::unordered_map<std::string, std::string> operator()(
const OpDesc& op_desc, BlockDesc* block) const {
std::unordered_map<std::string, std::string> ret;
auto in_out_var_names_pair = this->Apply(op_desc, block);
for (auto& pair : in_out_var_names_pair) {
PADDLE_ENFORCE(!op_desc.Input(pair.first).empty(),
string::Sprintf("op %s do not have input of %s!",
op_desc.Type(), pair.first));
PADDLE_ENFORCE(!op_desc.Output(pair.second).empty(),
string::Sprintf("op %s do not have output of %s!",
op_desc.Type(), pair.second));
auto& in_name = op_desc.Input(pair.first).at(0);
auto& out_name = op_desc.Output(pair.second).at(0);
auto in = block->FindRecursiveOrCreateVar(in_name);
auto out = block->FindRecursiveOrCreateVar(out_name);
if (TryInplaceInputOutput(in, out)) ret.insert({in_name, out_name});
}
return ret;
}
protected:
virtual std::unordered_map<std::string, std::string> Apply(
const OpDesc& op_desc, BlockDesc* block) const = 0;
bool TryInplaceInputOutput(const VarDesc& in, const VarDesc& out) const {
return in.Name() != out.Name() && details::NodeCanReused(in) &&
details::NodeCanReused(out) &&
details::NodeSize(out) <= details::NodeSize(in);
}
const OpDesc& op_desc) const = 0;
};
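// Under the new, BlockDesc-free interface an inference returns a plain
// parameter-name map; e.g. the MultiOutInplaceInToOut test class later in
// this patch returns
//   { {"X", "Out"}, {"Y", "YOut"}, {"Z", "ZOut"} }
// with no variable lookup or size check at this layer.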
/*
  In-place In and Out for operators that have only one Input and one Output.
  For example, activation ops.
*/
class SingleOpInplaceInToOut : public InplaceInToOut {
protected:
std::unordered_map<std::string, std::string> Apply(
const OpDesc& op_desc, BlockDesc* block) const override {
class SingleOpInplaceInToOut : public InplaceOpInference {
public:
std::unordered_map<std::string, std::string> operator()(
const OpDesc& op_desc) const override {
PADDLE_ENFORCE(!op_desc.InputNames().empty(),
"Op inputs must not be empty");
PADDLE_ENFORCE(!op_desc.OutputNames().empty(),
......@@ -95,10 +62,10 @@ class SingleOpInplaceInToOut : public InplaceInToOut {
  Gradient op. The in-place output reuses its Input.
  For example, the Input@Grad->Input reuse strategy.
*/
class GradOpInplaceInToOut : public InplaceInToOut {
protected:
std::unordered_map<std::string, std::string> Apply(
const OpDesc& op_desc, BlockDesc* block) const override {
class GradOpInplaceInToOut : public InplaceOpInference {
public:
std::unordered_map<std::string, std::string> operator()(
const OpDesc& op_desc) const override {
std::unordered_map<std::string, std::string> ret;
std::unordered_set<std::string> output_names(op_desc.OutputNames().begin(),
op_desc.OutputNames().end());
......
......@@ -127,26 +127,20 @@ class MultiOutGradShapeInference : public framework::InferShapeBase {
}
};
class MultiOutInplaceInToOut : public framework::InplaceInToOut {
class MultiOutInplaceInToOut : public framework::InplaceOpInference {
public:
using framework::InplaceInToOut::InplaceInToOut;
protected:
std::unordered_map<std::string, std::string> Apply(
const OpDesc& op_desc, BlockDesc* block) const override {
std::unordered_map<std::string, std::string> operator()(
const OpDesc& op_desc) const override {
return std::unordered_map<std::string, std::string>{
{"X", "Out"}, {"Y", "YOut"}, {"Z", "ZOut"},
};
}
};
class MultiOutGradInplaceInToOut : public framework::InplaceInToOut {
class MultiOutGradInplaceInToOut : public framework::InplaceOpInference {
public:
using framework::InplaceInToOut::InplaceInToOut;
protected:
std::unordered_map<std::string, std::string> Apply(
const OpDesc& op_desc, BlockDesc* block) const override {
std::unordered_map<std::string, std::string> operator()(
const OpDesc& op_desc) const override {
return std::unordered_map<std::string, std::string>{
{framework::GradVarName("YOut"), framework::GradVarName("Y")},
{framework::GradVarName("Out"), framework::GradVarName("X")},
......@@ -171,118 +165,118 @@ REGISTER_OPERATOR(multi_out_grad, f::NOP, f::MultiOutGradInplaceInToOut,
namespace paddle {
namespace framework {
TEST(InferInplace, SingleOpInplaceInToOut) {
ProgramDesc prog;
auto* op = prog.MutableBlock(0)->AppendOp();
op->SetType("single_op");
op->SetInput("X", {"test2_a", "test2_b", "test2_c"});
op->SetOutput("Out", {"test2_out"});
prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 64, 128, 128});
prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("test2_out");
prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16, 128, 128});
auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
auto in_to_outs = infer_inplace(*op, op->Block());
EXPECT_EQ(in_to_outs.size(), 1ul);
auto it = in_to_outs.begin();
EXPECT_EQ(it->first, "test2_a");
EXPECT_EQ(it->second, "test2_out");
}
TEST(InferInplace, SingleGradOpInplaceInToOut) {
ProgramDesc prog;
auto* op = prog.MutableBlock(0)->AppendOp();
op->SetType("single_op_grad");
op->SetInput(GradVarName("Out"), {"test2_out"});
op->SetOutput(GradVarName("X"), {"test2_a", "test2_b", "test2_c"});
prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("test2_out");
prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16, 1024, 1024});
auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
auto in_to_outs = infer_inplace(*op, op->Block());
EXPECT_EQ(in_to_outs.size(), 1ul);
auto it = in_to_outs.begin();
EXPECT_EQ(it->first, "test2_out");
EXPECT_EQ(it->second, "test2_a");
}
TEST(InferInplace, MultiOutInplaceInToOut) {
ProgramDesc prog;
auto* op = prog.MutableBlock(0)->AppendOp();
op->SetType("multi_out_op");
op->SetInput("X", {"a0", "a1"});
op->SetInput("Y", {"b0"});
op->SetInput("Z", {"c0", "c1"});
op->SetOutput("Out", {"o0"});
op->SetOutput("YOut", {"y0"});
op->SetOutput("ZOut", {"z0"});
prog.MutableBlock(0)->Var("a0")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("b0")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("c0")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("c1")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("o0");
prog.MutableBlock(0)->Var("y0");
prog.MutableBlock(0)->Var("z0");
prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024});
auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
auto in_to_outs = infer_inplace(*op, op->Block());
EXPECT_EQ(in_to_outs.size(), 3ul);
std::unordered_map<std::string, std::string> expects = {
{"a0", "o0"}, {"b0", "y0"}, {"c0", "z0"},
};
EXPECT_TRUE(expects == in_to_outs);
}
TEST(InferInplace, MultiGradInplaceInToOut) {
ProgramDesc prog;
auto* op = prog.MutableBlock(0)->AppendOp();
op->SetType("multi_out_grad");
op->SetInput(GradVarName("Out"), {"o0"});
op->SetInput(GradVarName("YOut"), {"y0"});
op->SetInput(GradVarName("ZOut"), {"z0"});
op->SetOutput(GradVarName("X"), {"a0", "a1"});
op->SetOutput(GradVarName("Y"), {"b0"});
op->SetOutput(GradVarName("Z"), {"c0", "c1"});
prog.MutableBlock(0)->Var("a0")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("b0")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("c0")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("c1")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("o0");
prog.MutableBlock(0)->Var("y0");
prog.MutableBlock(0)->Var("z0");
prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024});
auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
auto in_to_outs = infer_inplace(*op, op->Block());
EXPECT_EQ(in_to_outs.size(), 3ul);
std::unordered_map<std::string, std::string> expects = {
{"o0", "a0"}, {"y0", "b0"}, {"z0", "c0"},
};
EXPECT_TRUE(expects == in_to_outs);
}
// TEST(InferInplace, SingleOpInplaceInToOut) {
// ProgramDesc prog;
// auto* op = prog.MutableBlock(0)->AppendOp();
// op->SetType("single_op");
// op->SetInput("X", {"test2_a", "test2_b", "test2_c"});
// op->SetOutput("Out", {"test2_out"});
//
// prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR);
// prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 64, 128, 128});
// prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR);
// prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR);
// prog.MutableBlock(0)->Var("test2_out");
// prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16, 128, 128});
//
// auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
// auto in_to_outs = infer_inplace(*op);
// EXPECT_EQ(in_to_outs.size(), 1ul);
// auto it = in_to_outs.begin();
// EXPECT_EQ(it->first, "test2_a");
// EXPECT_EQ(it->second, "test2_out");
// }
//
// TEST(InferInplace, SingleGradOpInplaceInToOut) {
// ProgramDesc prog;
// auto* op = prog.MutableBlock(0)->AppendOp();
// op->SetType("single_op_grad");
// op->SetInput(GradVarName("Out"), {"test2_out"});
// op->SetOutput(GradVarName("X"), {"test2_a", "test2_b", "test2_c"});
//
// prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR);
// prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 16, 1024, 1024});
// prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR);
// prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR);
// prog.MutableBlock(0)->Var("test2_out");
// prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16, 1024, 1024});
//
// auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
// auto in_to_outs = infer_inplace(*op);
// EXPECT_EQ(in_to_outs.size(), 1ul);
// auto it = in_to_outs.begin();
// EXPECT_EQ(it->first, "test2_out");
// EXPECT_EQ(it->second, "test2_a");
// }
//
// TEST(InferInplace, MultiOutInplaceInToOut) {
// ProgramDesc prog;
// auto* op = prog.MutableBlock(0)->AppendOp();
// op->SetType("multi_out_op");
// op->SetInput("X", {"a0", "a1"});
// op->SetInput("Y", {"b0"});
// op->SetInput("Z", {"c0", "c1"});
// op->SetOutput("Out", {"o0"});
// op->SetOutput("YOut", {"y0"});
// op->SetOutput("ZOut", {"z0"});
//
// prog.MutableBlock(0)->Var("a0")->SetType(proto::VarType::LOD_TENSOR);
// prog.MutableBlock(0)->Var("b0")->SetType(proto::VarType::LOD_TENSOR);
// prog.MutableBlock(0)->Var("c0")->SetType(proto::VarType::LOD_TENSOR);
// prog.MutableBlock(0)->Var("c1")->SetType(proto::VarType::LOD_TENSOR);
// prog.MutableBlock(0)->Var("o0");
// prog.MutableBlock(0)->Var("y0");
// prog.MutableBlock(0)->Var("z0");
// prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024});
// prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024});
// prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024});
// prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024});
// prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024});
// prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024});
//
// auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
// auto in_to_outs = infer_inplace(*op);
// EXPECT_EQ(in_to_outs.size(), 3ul);
// std::unordered_map<std::string, std::string> expects = {
// {"a0", "o0"}, {"b0", "y0"}, {"c0", "z0"},
// };
// EXPECT_TRUE(expects == in_to_outs);
// }
//
// TEST(InferInplace, MultiGradInplaceInToOut) {
// ProgramDesc prog;
// auto* op = prog.MutableBlock(0)->AppendOp();
// op->SetType("multi_out_grad");
// op->SetInput(GradVarName("Out"), {"o0"});
// op->SetInput(GradVarName("YOut"), {"y0"});
// op->SetInput(GradVarName("ZOut"), {"z0"});
// op->SetOutput(GradVarName("X"), {"a0", "a1"});
// op->SetOutput(GradVarName("Y"), {"b0"});
// op->SetOutput(GradVarName("Z"), {"c0", "c1"});
//
// prog.MutableBlock(0)->Var("a0")->SetType(proto::VarType::LOD_TENSOR);
// prog.MutableBlock(0)->Var("b0")->SetType(proto::VarType::LOD_TENSOR);
// prog.MutableBlock(0)->Var("c0")->SetType(proto::VarType::LOD_TENSOR);
// prog.MutableBlock(0)->Var("c1")->SetType(proto::VarType::LOD_TENSOR);
// prog.MutableBlock(0)->Var("o0");
// prog.MutableBlock(0)->Var("y0");
// prog.MutableBlock(0)->Var("z0");
// prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024});
// prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024});
// prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024});
// prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024});
// prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024});
// prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024});
//
// auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
// auto in_to_outs = infer_inplace(*op);
//
// EXPECT_EQ(in_to_outs.size(), 3ul);
// std::unordered_map<std::string, std::string> expects = {
// {"o0", "a0"}, {"y0", "b0"}, {"z0", "c0"},
// };
// EXPECT_TRUE(expects == in_to_outs);
// }
} // namespace framework
} // namespace paddle
......@@ -68,16 +68,22 @@ pass_library(transpose_flatten_concat_fuse_pass inference)
pass_library(identity_scale_op_clean_pass base)
pass_library(sync_batch_norm_pass base)
pass_library(runtime_context_cache_pass base)
pass_library(simplify_anakin_detection_pattern_pass inference)
pass_library(anakin_fillconstant_elementwisemul_fuse inference)
# There may be many transpose-flatten structures in a model, and their outputs
# are used as inputs to the concat op. This pattern is detected by our pass.
# The index represents the number of structures in the pattern; we use
# indices 2 ~ 6 because these counts are common in real models.
foreach (index RANGE 3 6)
foreach (index RANGE 2 6)
file(APPEND ${pass_file} "USE_PASS(transpose_flatten${index}_concat_fuse_pass);\n")
endforeach()
foreach (index RANGE 2 6)
file(APPEND ${pass_file} "USE_PASS(simplify_anakin_detection_pattern_pass${index});\n")
endforeach()
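# For illustration, the two loops above append registration lines of the form
#   USE_PASS(transpose_flatten2_concat_fuse_pass);
#   ...
#   USE_PASS(transpose_flatten6_concat_fuse_pass);
#   USE_PASS(simplify_anakin_detection_pattern_pass2);
#   ...
#   USE_PASS(simplify_anakin_detection_pattern_pass6);
# to ${pass_file}.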
if(WITH_MKLDNN)
pass_library(mkldnn_placement_pass base mkldnn)
pass_library(depthwise_conv_mkldnn_pass base mkldnn)
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include <string>
#include "paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.h"
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
namespace paddle {
namespace framework {
namespace ir {
#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern);
#define GET_NODES \
GET_IR_NODE(fill_constant); \
GET_IR_NODE(fill_constant_out); \
GET_IR_NODE(elementwise_mul); \
GET_IR_NODE(elementwise_mul_out);
void AnakinFillconstantElementwisemulFuse::ApplyImpl(ir::Graph* graph) const {
const std::string pattern_name = "anakin_fillconstant_elementwisemul_fuse";
FusePassBase::Init(pattern_name, graph);
GraphPatternDetector gpd;
auto* x = gpd.mutable_pattern()
->NewNode("x")
->assert_is_op_input("elementwise_mul", "X")
->AsInput();
patterns::AnakinFillConstantElementWiseMulFuse pattern(gpd.mutable_pattern(),
pattern_name);
pattern(x);
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
GET_NODES;
PADDLE_ENFORCE(subgraph.count(x));
auto* elementwise_in = subgraph.at(x);
float constant_value =
boost::get<float>(fill_constant->Op()->GetAttr("value"));
framework::OpDesc new_op_desc;
new_op_desc.SetType("scale");
new_op_desc.SetInput("X", {elementwise_in->Name()});
new_op_desc.SetAttr("scale", constant_value);
new_op_desc.SetAttr("bias", static_cast<float>(0.0));
new_op_desc.SetAttr("bias_after_scale", true);
new_op_desc.SetOutput("Out", {elementwise_mul_out->Name()});
new_op_desc.Flush();
// Create a new node for the fused op.
auto* scale_op = graph->CreateOpNode(&new_op_desc);
IR_NODE_LINK_TO(elementwise_in, scale_op); // Input
IR_NODE_LINK_TO(scale_op, elementwise_mul_out); // Output
// Delete the unneeded nodes.
GraphSafeRemoveNodes(graph,
{fill_constant, fill_constant_out, elementwise_mul});
};
gpd(graph, handler);
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(anakin_fillconstant_elementwisemul_fuse,
paddle::framework::ir::AnakinFillconstantElementwisemulFuse);
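The net effect of this pass, in one line (sketch derived from the handler above):

    // before: out = elementwise_mul(x, fill_constant(value))
    // after:  out = scale(x, scale=value, bias=0.0, bias_after_scale=true)

i.e. a broadcast multiply by a fill_constant scalar is folded into a single scale op, and the fill_constant, its output, and the elementwise_mul node are removed from the graph.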
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace paddle {
namespace framework {
namespace ir {
class AnakinFillconstantElementwisemulFuse : public FusePassBase {
public:
virtual ~AnakinFillconstantElementwisemulFuse() {}
protected:
void ApplyImpl(ir::Graph* graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
......@@ -14,6 +14,7 @@
#include "paddle/fluid/framework/ir/attention_lstm_fuse_pass.h"
#include <string>
#include <unordered_set>
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/framework/lod_tensor.h"
......@@ -253,8 +254,7 @@ void PrepareLSTMBias(const LoDTensor& B_forget, const LoDTensor& B_input,
// Parameters
std::unique_ptr<ir::Graph> AttentionLSTMFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
void AttentionLSTMFusePass::ApplyImpl(ir::Graph* graph) const {
PDPattern external_pattern, subblock_pattern;
// Use the following variables to tell whether this model is RNN1.
......@@ -269,12 +269,11 @@ std::unique_ptr<ir::Graph> AttentionLSTMFusePass::ApplyImpl(
}
}
if (count < specified_vars.size()) {
return graph;
return;
}
// Continue to fuse.
FindWhileOp(graph.get());
return graph;
FindWhileOp(graph);
}
} // namespace ir
......
......@@ -22,8 +22,7 @@ namespace ir {
class AttentionLSTMFusePass : public FusePassBase {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
void ApplyImpl(ir::Graph* graph) const override;
};
} // namespace ir
......
......@@ -77,10 +77,9 @@ void recompute_bias_and_weights(const Scope* scope, ir::Node* conv_weight,
weights_array_2d.colwise() *= scale_array;
}
std::unique_ptr<ir::Graph> ConvAffineChannelFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
PADDLE_ENFORCE(graph.get());
FusePassBase::Init(name_scope_, graph.get());
void ConvAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const {
PADDLE_ENFORCE(graph);
FusePassBase::Init(name_scope_, graph);
auto* scope = param_scope();
PADDLE_ENFORCE(scope);
......@@ -139,7 +138,7 @@ std::unique_ptr<ir::Graph> ConvAffineChannelFusePass::ApplyImpl(
desc.SetAttr("axis", 1);
auto eltwise_op = g->CreateOpNode(&desc); // OpDesc will be copied.
GraphSafeRemoveNodes(graph.get(), {ac_scale, ac_bias, affine_channel});
GraphSafeRemoveNodes(graph, {ac_scale, ac_bias, affine_channel});
IR_NODE_LINK_TO(conv_out, eltwise_op);
IR_NODE_LINK_TO(eltwise_y_in_node, eltwise_op);
......@@ -147,16 +146,14 @@ std::unique_ptr<ir::Graph> ConvAffineChannelFusePass::ApplyImpl(
found_conv_ac_count++;
};
gpd(graph.get(), handler);
gpd(graph, handler);
AddStatis(found_conv_ac_count);
return graph;
}
std::unique_ptr<ir::Graph> ConvEltwiseAddAffineChannelFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
PADDLE_ENFORCE(graph.get());
FusePassBase::Init(name_scope_, graph.get());
void ConvEltwiseAddAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const {
PADDLE_ENFORCE(graph);
FusePassBase::Init(name_scope_, graph);
auto* scope = param_scope();
PADDLE_ENFORCE(scope);
......@@ -199,7 +196,7 @@ std::unique_ptr<ir::Graph> ConvEltwiseAddAffineChannelFusePass::ApplyImpl(
eltwise->Op()->SetAttr("axis", 1);
eltwise->Op()->SetOutput("Out", std::vector<std::string>({ac_out->Name()}));
GraphSafeRemoveNodes(graph.get(),
GraphSafeRemoveNodes(graph,
{ac_scale, ac_bias, affine_channel, eltwise_out});
IR_NODE_LINK_TO(eltwise, ac_out);
......@@ -207,9 +204,8 @@ std::unique_ptr<ir::Graph> ConvEltwiseAddAffineChannelFusePass::ApplyImpl(
found_conv_ac_count++;
};
gpd(graph.get(), handler);
gpd(graph, handler);
AddStatis(found_conv_ac_count);
return graph;
}
} // namespace ir
......
......@@ -31,8 +31,7 @@ class ConvAffineChannelFusePass : public FusePassBase {
virtual ~ConvAffineChannelFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
void ApplyImpl(ir::Graph*) const override;
const std::string name_scope_{"conv_affine_channel_fuse"};
};
......@@ -41,8 +40,7 @@ class ConvEltwiseAddAffineChannelFusePass : public FusePassBase {
virtual ~ConvEltwiseAddAffineChannelFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
void ApplyImpl(ir::Graph*) const override;
const std::string name_scope_{"conv_eltwiseadd_affine_channel_fuse"};
};
......
......@@ -101,10 +101,9 @@ void recompute_bias_and_weights(const Scope* scope,
weights_array_2d.colwise() *= variance_array;
}
std::unique_ptr<ir::Graph> ConvBNFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
PADDLE_ENFORCE(graph.get());
FusePassBase::Init(name_scope_, graph.get());
void ConvBNFusePass::ApplyImpl(ir::Graph* graph) const {
PADDLE_ENFORCE(graph);
FusePassBase::Init(name_scope_, graph);
auto* scope = param_scope();
PADDLE_ENFORCE(scope);
......@@ -187,7 +186,7 @@ std::unique_ptr<ir::Graph> ConvBNFusePass::ApplyImpl(
std::vector<std::string>({bn_out->Name()}));
GraphSafeRemoveNodes(
graph.get(),
graph,
{conv_out, bn_scale, bn_bias, bn_mean, bn_variance, batch_norm,
bn_mean_out, bn_variance_out, bn_saved_mean, bn_saved_variance});
......@@ -203,10 +202,9 @@ std::unique_ptr<ir::Graph> ConvBNFusePass::ApplyImpl(
desc.SetAttr("axis", 1);
auto eltwise_op = g->CreateOpNode(&desc); // OpDesc will be copied.
GraphSafeRemoveNodes(
graph.get(),
{bn_scale, bn_bias, bn_mean, bn_variance, batch_norm, bn_mean_out,
bn_variance_out, bn_saved_mean, bn_saved_variance});
GraphSafeRemoveNodes(graph, {bn_scale, bn_bias, bn_mean, bn_variance,
batch_norm, bn_mean_out, bn_variance_out,
bn_saved_mean, bn_saved_variance});
IR_NODE_LINK_TO(conv_out, eltwise_op);
IR_NODE_LINK_TO(eltwise_y_in_node, eltwise_op);
......@@ -215,16 +213,14 @@ std::unique_ptr<ir::Graph> ConvBNFusePass::ApplyImpl(
}
};
gpd(graph.get(), handler);
gpd(graph, handler);
AddStatis(found_conv_bn_count);
return graph;
}
std::unique_ptr<ir::Graph> ConvEltwiseAddBNFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
PADDLE_ENFORCE(graph.get());
FusePassBase::Init(name_scope_, graph.get());
void ConvEltwiseAddBNFusePass::ApplyImpl(ir::Graph* graph) const {
PADDLE_ENFORCE(graph);
FusePassBase::Init(name_scope_, graph);
auto* scope = param_scope();
PADDLE_ENFORCE(scope);
......@@ -274,7 +270,7 @@ std::unique_ptr<ir::Graph> ConvEltwiseAddBNFusePass::ApplyImpl(
eltwise->Op()->SetOutput("Out", std::vector<std::string>({bn_out->Name()}));
GraphSafeRemoveNodes(
graph.get(),
graph,
{bn_scale, bn_bias, bn_mean, bn_variance, batch_norm, bn_mean_out,
bn_variance_out, bn_saved_mean, bn_saved_variance, eltwise_out});
......@@ -283,10 +279,9 @@ std::unique_ptr<ir::Graph> ConvEltwiseAddBNFusePass::ApplyImpl(
found_conv_bn_count++;
};
gpd(graph.get(), handler);
gpd(graph, handler);
AddStatis(found_conv_bn_count);
return graph;
}
} // namespace ir
......
......@@ -31,8 +31,7 @@ class ConvBNFusePass : public FusePassBase {
virtual ~ConvBNFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
void ApplyImpl(ir::Graph* graph) const override;
const std::string name_scope_{"conv_bn_fuse"};
};
......@@ -41,8 +40,7 @@ class ConvEltwiseAddBNFusePass : public FusePassBase {
virtual ~ConvEltwiseAddBNFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
void ApplyImpl(ir::Graph* graph) const override;
const std::string name_scope_{"conv_eltwiseadd_bn_fuse"};
};
......
......@@ -50,10 +50,9 @@ framework::proto::OpDesc PrepareOpDesc(
return *desc.Proto();
}
std::unique_ptr<ir::Graph> ConvElementwiseAddActFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
void ConvElementwiseAddActFusePass::ApplyImpl(ir::Graph* graph) const {
const std::string pattern_name = "conv_elementwise_add_act_fuse";
FusePassBase::Init(pattern_name, graph.get());
FusePassBase::Init(pattern_name, graph);
GraphPatternDetector gpd;
auto* x = gpd.mutable_pattern()->NewNode("x")->AsInput()->assert_is_op_input(
......@@ -95,7 +94,6 @@ std::unique_ptr<ir::Graph> ConvElementwiseAddActFusePass::ApplyImpl(
elementwise_add_out});
};
gpd(graph.get(), handler);
return graph;
}
} // namespace ir
......
......@@ -51,10 +51,9 @@ framework::proto::OpDesc PrepareOpDesc(
return *desc.Proto();
}
std::unique_ptr<ir::Graph> ConvElementwiseAdd2ActFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
void ConvElementwiseAdd2ActFusePass::ApplyImpl(ir::Graph* graph) const {
const std::string pattern_name = "conv_elementwise_add2_act_fuse";
FusePassBase::Init(pattern_name, graph.get());
FusePassBase::Init(pattern_name, graph);
GraphPatternDetector gpd;
auto* x = gpd.mutable_pattern()->NewNode("x")->AsInput()->assert_is_op_input(
......@@ -92,12 +91,10 @@ std::unique_ptr<ir::Graph> ConvElementwiseAdd2ActFusePass::ApplyImpl(
// Delete the unneeded nodes.
GraphSafeRemoveNodes(
graph.get(),
{conv_op, conv_out, elementwise_add_op, elementwise_add_op_1,
elementwise_add_out, elementwise_add_out_1, act_op});
graph, {conv_op, conv_out, elementwise_add_op, elementwise_add_op_1,
elementwise_add_out, elementwise_add_out_1, act_op});
};
gpd(graph.get(), handler);
return graph;
gpd(graph, handler);
}
} // namespace ir
......
......@@ -25,8 +25,7 @@ class ConvElementwiseAdd2ActFusePass : public FusePassBase {
virtual ~ConvElementwiseAdd2ActFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
void ApplyImpl(ir::Graph* graph) const override;
};
} // namespace ir
......
......@@ -48,10 +48,9 @@ framework::proto::OpDesc PrepareOpDesc(
return *desc.Proto();
}
std::unique_ptr<ir::Graph> ConvElementwiseAddActFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
void ConvElementwiseAddActFusePass::ApplyImpl(ir::Graph* graph) const {
const std::string pattern_name = "conv_elementwise_add_act_fuse";
FusePassBase::Init(pattern_name, graph.get());
FusePassBase::Init(pattern_name, graph);
GraphPatternDetector gpd;
auto* x = gpd.mutable_pattern()
......@@ -88,12 +87,11 @@ std::unique_ptr<ir::Graph> ConvElementwiseAddActFusePass::ApplyImpl(
IR_NODE_LINK_TO(new_conv_op, act_out); // Output
// Delete the unneeded nodes.
GraphSafeRemoveNodes(graph.get(), {conv_op, conv_out, elementwise_add_op,
elementwise_add_out, act_op});
GraphSafeRemoveNodes(graph, {conv_op, conv_out, elementwise_add_op,
elementwise_add_out, act_op});
};
gpd(graph.get(), handler);
return graph;
gpd(graph, handler);
}
} // namespace ir
......
......@@ -25,8 +25,7 @@ class ConvElementwiseAddActFusePass : public FusePassBase {
virtual ~ConvElementwiseAddActFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
void ApplyImpl(ir::Graph* graph) const override;
};
} // namespace ir
......
......@@ -30,10 +30,9 @@ namespace ir {
GET_IR_NODE(elementwise_add_in_y); \
GET_IR_NODE(elementwise_add_out);
std::unique_ptr<ir::Graph> ConvElementwiseAddFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
void ConvElementwiseAddFusePass::ApplyImpl(ir::Graph* graph) const {
const std::string pattern_name = "conv_elementwise_add_fuse";
FusePassBase::Init(pattern_name, graph.get());
FusePassBase::Init(pattern_name, graph);
GraphPatternDetector gpd;
auto* x = gpd.mutable_pattern()
......@@ -76,11 +75,10 @@ std::unique_ptr<ir::Graph> ConvElementwiseAddFusePass::ApplyImpl(
IR_NODE_LINK_TO(new_conv_op, elementwise_add_out); // Output
// Delete the unneeded nodes.
GraphSafeRemoveNodes(graph.get(), {conv_op, conv_out, elementwise_add_op});
GraphSafeRemoveNodes(graph, {conv_op, conv_out, elementwise_add_op});
};
gpd(graph.get(), handler);
return graph;
gpd(graph, handler);
}
} // namespace ir
......
......@@ -25,8 +25,7 @@ class ConvElementwiseAddFusePass : public FusePassBase {
virtual ~ConvElementwiseAddFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
void ApplyImpl(ir::Graph* graph) const override;
};
} // namespace ir
......
......@@ -15,6 +15,8 @@
#include "paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h"
#include <algorithm>
#include <string>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/operators/math/blas.h"
......@@ -201,7 +203,7 @@ static int BuildFusion(Graph* graph, const std::string& name_scope,
// Remove unneeded nodes.
// TODO(jczaja): Remove the lookup table properly
std::unordered_set<const Node*> marked_nodes(
//{lookup_table, mul, lstm, elementwise_add, fc_bias, W});
// {lookup_table, mul, lstm, elementwise_add, fc_bias, W});
{mul, lstm, elementwise_add, fc_bias});
GraphSafeRemoveNodes(graph, marked_nodes);
} else {
......@@ -224,15 +226,13 @@ static int BuildFusion(Graph* graph, const std::string& name_scope,
return fusion_count;
}
std::unique_ptr<ir::Graph> EmbeddingFCLSTMFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
FusePassBase::Init(name_scope_, graph.get());
void EmbeddingFCLSTMFusePass::ApplyImpl(ir::Graph* graph) const {
FusePassBase::Init(name_scope_, graph);
int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(),
true /*with_fc_bias*/);
int fusion_count =
BuildFusion(graph, name_scope_, param_scope(), true /*with_fc_bias*/);
AddStatis(fusion_count);
return graph;
}
} // namespace ir
......
......@@ -32,8 +32,7 @@ class EmbeddingFCLSTMFusePass : public FusePassBase {
virtual ~EmbeddingFCLSTMFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
void ApplyImpl(ir::Graph* graph) const override;
const std::string name_scope_{"embedding_fc_lstm_fuse"};
};
......
......@@ -14,6 +14,7 @@
#include "paddle/fluid/framework/ir/fc_fuse_pass.h"
#include <string>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/platform/enforce.h"
......@@ -22,10 +23,9 @@ namespace paddle {
namespace framework {
namespace ir {
std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
PADDLE_ENFORCE(graph.get());
FusePassBase::Init("fc_fuse", graph.get());
void FCFusePass::ApplyImpl(ir::Graph* graph) const {
PADDLE_ENFORCE(graph);
FusePassBase::Init("fc_fuse", graph);
std::unordered_set<Node*> nodes2delete;
......@@ -61,7 +61,7 @@ std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl(
desc.SetAttr("in_num_col_dims", mul->Op()->GetAttr("x_num_col_dims"));
desc.SetType("fc");
auto fc_node = g->CreateOpNode(&desc); // OpDesc will be copied.
GraphSafeRemoveNodes(graph.get(), {mul, elementwise_add, mul_out});
GraphSafeRemoveNodes(graph, {mul, elementwise_add, mul_out});
PADDLE_ENFORCE(subgraph.count(x));
IR_NODE_LINK_TO(subgraph.at(x), fc_node);
......@@ -72,10 +72,9 @@ std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl(
found_fc_count++;
};
gpd(graph.get(), handler);
gpd(graph, handler);
AddStatis(found_fc_count);
return graph;
}
} // namespace ir
......
......@@ -31,8 +31,7 @@ class FCFusePass : public FusePassBase {
virtual ~FCFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
void ApplyImpl(ir::Graph* graph) const override;
};
} // namespace ir
......
......@@ -73,7 +73,7 @@ TEST(FCFusePass, basic) {
int pre_nodes = graph->Nodes().size();
graph = pass->Apply(std::move(graph));
graph.reset(pass->Apply(graph.release()));
int after_nodes = graph->Nodes().size();
......
......@@ -14,6 +14,7 @@
#include "paddle/fluid/framework/ir/fc_gru_fuse_pass.h"
#include <string>
#include <unordered_set>
#include "paddle/fluid/framework/lod_tensor.h"
namespace paddle {
......@@ -39,7 +40,6 @@ static int BuildFusion(Graph* graph, const std::string& name_scope,
// Create New OpDesc
auto gru_creater = [&](Node* gru, Node* x, Node* weight_x, Node* weight_h,
Node* bias, Node* hidden, Node* fc_bias) {
OpDesc op_desc;
op_desc.SetType("fusion_gru");
......@@ -155,26 +155,22 @@ static int BuildFusion(Graph* graph, const std::string& name_scope,
return fusion_count;
}
std::unique_ptr<ir::Graph> MulGRUFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
FusePassBase::Init(name_scope_, graph.get());
void MulGRUFusePass::ApplyImpl(ir::Graph* graph) const {
FusePassBase::Init(name_scope_, graph);
int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(),
false /*with_fc_bias*/);
int fusion_count =
BuildFusion(graph, name_scope_, param_scope(), false /*with_fc_bias*/);
AddStatis(fusion_count);
return graph;
}
std::unique_ptr<ir::Graph> FCGRUFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
FusePassBase::Init(name_scope_, graph.get());
void FCGRUFusePass::ApplyImpl(ir::Graph* graph) const {
FusePassBase::Init(name_scope_, graph);
int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(),
true /*with_fc_bias*/);
int fusion_count =
BuildFusion(graph, name_scope_, param_scope(), true /*with_fc_bias*/);
AddStatis(fusion_count);
return graph;
}
} // namespace ir
......
......@@ -30,8 +30,7 @@ class FCGRUFusePass : public FusePassBase {
virtual ~FCGRUFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
void ApplyImpl(ir::Graph* graph) const override;
const std::string name_scope_{"fc_gru_fuse"};
};
......@@ -42,8 +41,7 @@ class MulGRUFusePass : public FusePassBase {
virtual ~MulGRUFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
void ApplyImpl(ir::Graph* graph) const override;
const std::string name_scope_{"fc_nobias_gru_fuse"};
};
......
......@@ -14,6 +14,7 @@
#include "paddle/fluid/framework/ir/fc_lstm_fuse_pass.h"
#include <string>
#include <unordered_set>
#include "paddle/fluid/framework/lod_tensor.h"
namespace paddle {
......@@ -157,26 +158,22 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
return fusion_count;
}
std::unique_ptr<ir::Graph> MulLstmFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
FusePassBase::Init(name_scope_, graph.get());
void MulLstmFusePass::ApplyImpl(ir::Graph* graph) const {
FusePassBase::Init(name_scope_, graph);
int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(),
false /*with_fc_bias*/);
int fusion_count =
BuildFusion(graph, name_scope_, param_scope(), false /*with_fc_bias*/);
AddStatis(fusion_count);
return graph;
}
std::unique_ptr<ir::Graph> FCLstmFusePass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
FusePassBase::Init(name_scope_, graph.get());
void FCLstmFusePass::ApplyImpl(ir::Graph* graph) const {
FusePassBase::Init(name_scope_, graph);
int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(),
true /*with_fc_bias*/);
int fusion_count =
BuildFusion(graph, name_scope_, param_scope(), true /*with_fc_bias*/);
AddStatis(fusion_count);
return graph;
}
} // namespace ir
......
......@@ -32,8 +32,7 @@ class FCLstmFusePass : public FusePassBase {
virtual ~FCLstmFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
void ApplyImpl(ir::Graph* graph) const override;
const std::string name_scope_{"fc_lstm_fuse"};
};
......@@ -43,8 +42,7 @@ class MulLstmFusePass : public FusePassBase {
virtual ~MulLstmFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
void ApplyImpl(ir::Graph* graph) const override;
const std::string name_scope_{"fc_nobias_lstm_fuse"};
};
......
......@@ -15,6 +15,8 @@
#include "paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h"
#include <algorithm>
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/enforce.h"
......@@ -23,29 +25,25 @@ namespace paddle {
namespace framework {
namespace ir {
std::unique_ptr<ir::Graph> FuseElewiseAddActPass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
void FuseElewiseAddActPass::ApplyImpl(ir::Graph *graph) const {
std::unordered_set<std::string> act_types = {"relu", "scale"};
graph = FuseActElewiseAdd(std::move(graph), act_types);
graph = FuseElewiseAddAct(std::move(graph), act_types);
graph = FuseActElewiseAdd(graph, act_types);
graph = FuseElewiseAddAct(graph, act_types);
// backward
{
std::unordered_set<std::string> in_place_act_types = {"relu_grad"};
graph = FuseElewiseAddActInplaceGrad(std::move(graph), in_place_act_types);
graph = FuseElewiseAddActInplaceGrad(graph, in_place_act_types);
}
// Remove the removable intermediate_out.
RemoveIntermediateOut(graph.get());
return graph;
RemoveIntermediateOut(graph);
}
// ele_add(x, act(y))
std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseElewiseAddAct(
std::unique_ptr<ir::Graph> graph,
const std::unordered_set<std::string> &act_types) const {
PADDLE_ENFORCE(graph.get());
FusePassBase::Init("elewise_add_act", graph.get());
ir::Graph *FuseElewiseAddActPass::FuseElewiseAddAct(
ir::Graph *graph, const std::unordered_set<std::string> &act_types) const {
PADDLE_ENFORCE(graph);
FusePassBase::Init("elewise_add_act", graph);
GraphPatternDetector gpd;
auto *x = gpd.mutable_pattern()
......@@ -86,18 +84,17 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseElewiseAddAct(
found_elewise_add_act_count++;
};
gpd(graph.get(), handler);
gpd(graph, handler);
AddStatis(found_elewise_add_act_count);
return graph;
}
// act(ele_add(x,y))
std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseActElewiseAdd(
std::unique_ptr<ir::Graph> graph,
const std::unordered_set<std::string> &act_types) const {
PADDLE_ENFORCE(graph.get());
FusePassBase::Init("act_elewise_add", graph.get());
ir::Graph *FuseElewiseAddActPass::FuseActElewiseAdd(
ir::Graph *graph, const std::unordered_set<std::string> &act_types) const {
PADDLE_ENFORCE(graph);
FusePassBase::Init("act_elewise_add", graph);
GraphPatternDetector gpd;
auto *x = gpd.mutable_pattern()
......@@ -137,7 +134,7 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseActElewiseAdd(
found_elewise_add_act_count++;
};
gpd(graph.get(), handler);
gpd(graph, handler);
AddStatis(found_elewise_add_act_count);
return graph;
......@@ -146,11 +143,10 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseActElewiseAdd(
// the backward of act(ele_add(x,y))
// act_grad: in["Out", "Out@GRAD"], out["X@GRAD"]
// ele_add_grad: in["Y", "Out@GRAD"], out["X@GRAD", "Y@GRAD"]
std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseElewiseAddActInplaceGrad(
std::unique_ptr<ir::Graph> graph,
const std::unordered_set<std::string> &act_types) const {
PADDLE_ENFORCE(graph.get());
FusePassBase::Init("elewise_add_act_grad", graph.get());
ir::Graph *FuseElewiseAddActPass::FuseElewiseAddActInplaceGrad(
ir::Graph *graph, const std::unordered_set<std::string> &act_types) const {
PADDLE_ENFORCE(graph);
FusePassBase::Init("elewise_add_act_grad", graph);
GraphPatternDetector gpd;
auto *d_act_out = gpd.mutable_pattern()
......@@ -217,7 +213,7 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseElewiseAddActInplaceGrad(
found_elewise_add_act_count++;
};
gpd(graph.get(), handler);
gpd(graph, handler);
AddStatis(found_elewise_add_act_count);
return graph;
......
......@@ -14,6 +14,8 @@
#pragma once
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
......@@ -32,20 +34,16 @@ class FuseElewiseAddActPass : public FusePassBase {
virtual ~FuseElewiseAddActPass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
void ApplyImpl(ir::Graph *graph) const override;
std::unique_ptr<ir::Graph> FuseElewiseAddAct(
std::unique_ptr<ir::Graph> graph,
const std::unordered_set<std::string> &act_types) const;
ir::Graph *FuseElewiseAddAct(
ir::Graph *graph, const std::unordered_set<std::string> &act_types) const;
std::unique_ptr<ir::Graph> FuseActElewiseAdd(
std::unique_ptr<ir::Graph> graph,
const std::unordered_set<std::string> &act_types) const;
ir::Graph *FuseActElewiseAdd(
ir::Graph *graph, const std::unordered_set<std::string> &act_types) const;
ir::Graph *FuseElewiseAddActInplaceGrad(
ir::Graph *graph, const std::unordered_set<std::string> &act_types) const;
/**
* Remove the removable intermediate_out.
......
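For context, the three fusions declared above compose inside ApplyImpl; a sketch under the new in-place API (the act_types set is an assumption):

void FuseElewiseAddActPass::ApplyImpl(ir::Graph *graph) const {
  // Assumed set of fusable activations; the authoritative list lives in the .cc.
  std::unordered_set<std::string> act_types = {"relu", "scale"};
  graph = FuseElewiseAddAct(graph, act_types);
  graph = FuseActElewiseAdd(graph, act_types);
  graph = FuseElewiseAddActInplaceGrad(graph, act_types);
  RemoveIntermediateOut(graph);
}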
......@@ -15,6 +15,7 @@
#include "paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h"
#include <algorithm>
#include <string>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/enforce.h"
......@@ -23,20 +24,18 @@ namespace paddle {
namespace framework {
namespace ir {
void FuseReluDepthwiseConvPass::ApplyImpl(ir::Graph *graph) const {
graph = FuseReluDepthwiseConv(graph, true);
graph = FuseReluDepthwiseConv(graph, false);
}
ir::Graph *FuseReluDepthwiseConvPass::FuseReluDepthwiseConv(
ir::Graph *graph, bool only_forward) const {
PADDLE_ENFORCE(graph);
if (only_forward)
FusePassBase::Init("relu_depthwise_conv_only_forward", graph.get());
FusePassBase::Init("relu_depthwise_conv_only_forward", graph);
else
FusePassBase::Init("relu_depthwise_conv", graph.get());
FusePassBase::Init("relu_depthwise_conv", graph);
/*
x ---act--> y ---layer-> z
+----------+
......@@ -144,10 +143,9 @@ std::unique_ptr<ir::Graph> FuseReluDepthwiseConvPass::FuseReluDepthwiseConv(
}
count++;
};
gpd(graph, handler);
GraphSafeRemoveNodes(graph, need_removed_nodes);
AddStatis(count);
return graph;
}
......
......@@ -32,10 +32,8 @@ class FuseReluDepthwiseConvPass : public FusePassBase {
virtual ~FuseReluDepthwiseConvPass() {}
protected:
void ApplyImpl(ir::Graph* graph) const override;
ir::Graph* FuseReluDepthwiseConv(ir::Graph* graph, bool only_forward) const;
};
} // namespace ir
......
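Passes like this one are registered by name and fetched from the pass registry; a sketch, with the registered name assumed:

// Registration (file scope in the .cc; name is an assumption):
// REGISTER_PASS(fuse_relu_depthwise_conv_pass,
//               paddle::framework::ir::FuseReluDepthwiseConvPass);
// Retrieval and application under the new in-place API:
auto pass = ir::PassRegistry::Instance().Get("fuse_relu_depthwise_conv_pass");
pass->Apply(graph);  // graph is a raw ir::Graph*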
......@@ -1470,6 +1470,171 @@ PDNode *patterns::TransposeFlattenConcat::operator()(
return concat_out;
}
PDNode *patterns::AnakinDetectionPattern::operator()(
std::vector<PDNode *> conv_in, int times) {
// `times` is the number of repetitions of the per-input sub-pattern:
// {prior_box, box_out, reshape1, reshape1_out, box_var_out, reshape2, reshape2_out}
const int kNumFields = 7;
const int kPriorBoxLocOffset = 1;
const int kReshape1Offset = 2;
const int kReshape1OutOffset = 3;
const int kPriorBoxVarOffset = 4;
const int kReshape2Offset = 5;
const int kReshape2OutOffset = 6;
const int kBoxCoderThirdInputOffset = times;
const int kMultiClassSecondInputNmsOffset = times + 1;
std::vector<PDNode *> nodes;
for (int i = 0; i < times; i++) {
nodes.push_back(
pattern->NewNode(GetNodeName("prior_box" + std::to_string(i)))
->assert_is_op("density_prior_box"));
nodes.push_back(pattern->NewNode(GetNodeName("box_out" + std::to_string(i)))
->assert_is_op_output("density_prior_box", "Boxes")
->assert_is_op_input("reshape2", "X")
->AsIntermediate());
nodes.push_back(
pattern->NewNode(GetNodeName("reshape1" + std::to_string(i)))
->assert_is_op("reshape2"));
nodes.push_back(
pattern->NewNode(GetNodeName("reshape1_out" + std::to_string(i)))
->assert_is_op_output("reshape2")
->assert_is_op_nth_input("concat", "X", i)
->AsIntermediate());
nodes.push_back(
pattern->NewNode(GetNodeName("box_var_out" + std::to_string(i)))
->assert_is_op_output("density_prior_box", "Variances")
->assert_is_op_input("reshape2", "X")
->AsIntermediate());
nodes.push_back(
pattern->NewNode(GetNodeName("reshape2" + std::to_string(i)))
->assert_is_op("reshape2"));
nodes.push_back(
pattern->NewNode(GetNodeName("reshape2_out" + std::to_string(i)))
->assert_is_op_output("reshape2")
->assert_is_op_nth_input("concat", "X", i)
->AsIntermediate());
}
auto concat_op1 = pattern->NewNode(GetNodeName("concat1"))
->assert_is_op("concat")
->assert_op_has_n_inputs("concat", times);
auto concat_out1 = pattern->NewNode(GetNodeName("concat1_out"))
->assert_is_op_output("concat")
->AsIntermediate();
auto concat_op2 = pattern->NewNode(GetNodeName("concat2"))
->assert_is_op("concat")
->assert_op_has_n_inputs("concat", times);
auto concat_out2 = pattern->NewNode(GetNodeName("concat2_out"))
->assert_is_op_output("concat")
->AsIntermediate();
auto box_coder_op = pattern->NewNode(GetNodeName("box_coder"))
->assert_is_op("box_coder")
->assert_op_has_n_inputs("box_coder", 3);
auto box_coder_out = pattern->NewNode(GetNodeName("box_coder_out"))
->assert_is_op_output("box_coder")
->AsIntermediate();
auto transpose_before_nms =
pattern->NewNode(GetNodeName("transpose_before_nms"))
->assert_is_op("transpose2");
auto transpose_before_nms_out =
pattern->NewNode(GetNodeName("transpose_before_nms_out"))
->assert_is_op_output("transpose2")
->assert_is_op_input("multiclass_nms", "Scores")
->AsIntermediate();
auto multiclass_nms_op = pattern->NewNode(GetNodeName("multiclass_nms"))
->assert_is_op("multiclass_nms")
->assert_op_has_n_inputs("multiclass_nms", 2);
auto multiclass_nms_out = pattern->NewNode(GetNodeName("multiclass_nms_out"))
->assert_is_op_output("multiclass_nms")
->AsOutput();
std::vector<PDNode *> reshape1_outs;
std::vector<PDNode *> reshape2_outs;
for (int i = 0; i < times; i++) {
conv_in[i]->AsInput();
// prior_box
nodes[i * kNumFields]->LinksFrom({conv_in[i]});
// prior_box box out
nodes[i * kNumFields + kPriorBoxLocOffset]->LinksFrom(
{nodes[i * kNumFields]});
// reshape
nodes[i * kNumFields + kReshape1Offset]->LinksFrom(
{nodes[i * kNumFields + kPriorBoxLocOffset]});
// reshape_out
nodes[i * kNumFields + kReshape1OutOffset]->LinksFrom(
{nodes[i * kNumFields + kReshape1Offset]});
nodes[i * kNumFields + kPriorBoxVarOffset]->LinksFrom(
{nodes[i * kNumFields]});
// reshape
nodes[i * kNumFields + kReshape2Offset]->LinksFrom(
{nodes[i * kNumFields + kPriorBoxVarOffset]});
// reshape_out
nodes[i * kNumFields + kReshape2OutOffset]->LinksFrom(
{nodes[i * kNumFields + kReshape2Offset]});
reshape1_outs.push_back(nodes[i * kNumFields + kReshape1OutOffset]);
reshape2_outs.push_back(nodes[i * kNumFields + kReshape2OutOffset]);
}
concat_op1->LinksFrom(reshape1_outs);
concat_op2->LinksFrom(reshape2_outs);
concat_out1->LinksFrom({concat_op1});
concat_out2->LinksFrom({concat_op2});
conv_in[kBoxCoderThirdInputOffset]->AsInput();
conv_in[kMultiClassSecondInputNmsOffset]->AsInput();
box_coder_op->LinksFrom(
{concat_out1, concat_out2, conv_in[kBoxCoderThirdInputOffset]});
box_coder_out->LinksFrom({box_coder_op});
transpose_before_nms->LinksFrom({conv_in[kMultiClassSecondInputNmsOffset]});
transpose_before_nms_out->LinksFrom({transpose_before_nms});
multiclass_nms_op->LinksFrom({box_coder_out, transpose_before_nms_out})
.LinksTo({multiclass_nms_out});
return multiclass_nms_out;
}
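A sketch of how a fuse pass might drive this pattern; the handler body and variable names are assumptions:

GraphPatternDetector gpd;
// Assumed: conv_ins holds times + 2 PDNodes (the per-branch conv inputs,
// plus the box_coder prior input and the NMS scores input, per the offsets
// defined above).
std::vector<PDNode *> conv_ins;
patterns::AnakinDetectionPattern pattern(gpd.mutable_pattern(), "anakin_detect");
pattern(conv_ins, times);
auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, Graph *g) {
  // Matched nodes are looked up by the names assigned in operator(), e.g.:
  Node *prior_box0 = subgraph.at(pattern.GetPDNode("prior_box0"));
  // ... rewrite the matched subgraph into a single fused detection op ...
};
gpd(graph, handler);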
PDNode *patterns::AnakinFillConstantElementWiseMulFuse::operator()(
PDNode *elementwise_op_input) {
auto fill_constant =
pattern->NewNode(fill_constant_repr())->assert_is_op("fill_constant");
auto fill_constant_out = pattern->NewNode(fill_constant_out_repr())
->assert_is_op_output("fill_constant")
->assert_is_op_input("elementwise_mul", "Y")
->AsIntermediate();
auto elementwise_mul_op =
pattern->NewNode(elementwise_mul_repr())->assert_is_op("elementwise_mul");
auto elementwise_mul_out = pattern->NewNode(elementwise_mul_out_repr())
->assert_is_op_output("elementwise_mul")
->AsOutput();
fill_constant_out->LinksFrom({fill_constant});
elementwise_mul_op->LinksFrom({elementwise_op_input, fill_constant_out});
elementwise_mul_out->LinksFrom({elementwise_mul_op});
return elementwise_mul_out;
}
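The subgraph this pattern matches, schematically:

// fill_constant --> fill_constant_out --(Y)--+
//                                            v
// elementwise_op_input ---------(X)--> elementwise_mul --> elementwise_mul_out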
} // namespace ir
} // namespace framework
} // namespace paddle
......@@ -844,6 +844,36 @@ struct TransposeFlattenConcat : public PatternBase {
}
};
struct AnakinDetectionPattern : public PatternBase {
AnakinDetectionPattern(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "anakin_detect_pattern") {}
PDNode* operator()(std::vector<PDNode*> conv_inputs, int times);
std::string GetNodeName(const std::string& op_type) {
return PDNodeName(name_scope_, repr_, id_, op_type);
}
PDNode* GetPDNode(const std::string& op_type) {
return pattern->RetrieveNode(GetNodeName(op_type));
}
};
struct AnakinFillConstantElementWiseMulFuse : public PatternBase {
AnakinFillConstantElementWiseMulFuse(PDPattern* pattern,
const std::string& name_scope)
: PatternBase(pattern, name_scope,
"anakin_fillconstant_elementwisemul_fuse") {}
PDNode* operator()(PDNode* elementwise_op_input);
// declare operator node's name
PATTERN_DECL_NODE(fill_constant);
PATTERN_DECL_NODE(fill_constant_out);
PATTERN_DECL_NODE(elementwise_mul);
PATTERN_DECL_NODE(elementwise_mul_out);
};
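The PATTERN_DECL_NODE declarations above pair with the usual retrieval macro inside a detector handler; a sketch (assumes `subgraph` and a `fused_pattern` instance are in scope, as they are within a handler):

GET_IR_NODE_FROM_SUBGRAPH(fill_constant, fill_constant, fused_pattern);
GET_IR_NODE_FROM_SUBGRAPH(fill_constant_out, fill_constant_out, fused_pattern);
GET_IR_NODE_FROM_SUBGRAPH(elementwise_mul, elementwise_mul, fused_pattern);
GET_IR_NODE_FROM_SUBGRAPH(elementwise_mul_out, elementwise_mul_out, fused_pattern);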
} // namespace patterns
// Link two ir::Nodes to each other.
......
......@@ -15,7 +15,9 @@ limitations under the License. */
#include "paddle/fluid/framework/ir/graph_to_program_pass.h"
#include <map>
#include <memory>
#include <string>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/ir/graph.h"
......@@ -26,8 +28,7 @@ namespace paddle {
namespace framework {
namespace ir {
void GraphToProgramPass::ApplyImpl(ir::Graph* graph) const {
// Remove the unneeded variables after memory optimization.
std::unordered_set<std::string> vars2remove;
if (graph->Has(kGraphToProgramVarsToRemove)) {
......@@ -73,7 +74,6 @@ std::unique_ptr<Graph> GraphToProgramPass::ApplyImpl(
}
program.CopyFrom(*program_pb);
}
} // namespace ir
......
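With the void ApplyImpl, the converted ProgramDesc comes back through the pass attribute rather than a returned graph; usage mirrors the updated test below (the registered pass name is assumed):

ProgramDesc compiled_prog;
auto pass = ir::PassRegistry::Instance().Get("graph_to_program_pass");
pass->SetNotOwned<ProgramDesc>("program", &compiled_prog);
pass->Apply(graph.get());  // fills compiled_prog via the "program" attribute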
......@@ -26,7 +26,7 @@ const char kGraphToProgramSortKind[] = "__graph_to_program_sort_kind__";
class GraphToProgramPass : public Pass {
protected:
void ApplyImpl(ir::Graph* graph) const override;
};
} // namespace ir
......
......@@ -14,7 +14,9 @@ limitations under the License. */
#include "paddle/fluid/framework/ir/graph_to_program_pass.h"
#include <memory>
#include <string>
#include <unordered_set>
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/program_desc.h"
......@@ -84,7 +86,7 @@ TEST(GraphToProgramPass, Basic) {
ProgramDesc compiled_prog;
pass->SetNotOwned<paddle::framework::ProgramDesc>("program", &compiled_prog);
pass->Apply(g.get());
std::vector<OpDesc*> ops = compiled_prog.Block(0).AllOps();
EXPECT_EQ(ops[0]->Type(), "op1");
EXPECT_EQ(ops[1]->Type(), "op2");
......
...... (the remaining file diffs in this commit are collapsed)