merge upstream to my develop

test=develop Merge remote-tracking branch 'upstream/develop' into develop

merge upstream to my develop
test=develop Merge remote-tracking branch 'upstream/develop' into develop
5c7a8aee · heqiaozhi · b3cf476d · aa6e9c30 · 5c7a8aee · 5c7a8aee
90 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -208,10 +208,10 @@ include(external/xxhash)    # download xxhash
 include(external/dlpack)
 include(external/snappy)    # download snappy
 include(external/snappystream) # download snappystream
+include(external/warpctc)   # download, build, install warpctc
 if (NOT WIN32)
-# there is no official support of warpctc, nccl, cupti in windows
+# there is no official support of nccl, cupti in windows
-include(external/warpctc)   # download, build, install warpctc
 include(cupti)
 include(external/gzstream)
 endif (NOT WIN32)

--- a/cmake/external/python.cmake
+++ b/cmake/external/python.cmake
@@ -18,8 +18,8 @@ ENDIF()
 INCLUDE(python_module)
-FIND_PACKAGE(PythonInterp ${PY_VERSION})
+FIND_PACKAGE(PythonInterp ${PY_VERSION} REQUIRED)
-FIND_PACKAGE(PythonLibs ${PY_VERSION})
+FIND_PACKAGE(PythonLibs ${PY_VERSION} REQUIRED)
 if(WIN32)
    execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c"
@@ -79,6 +79,5 @@ IF(PYTHONINTERP_FOUND)
        "please use pip to upgrade protobuf. pip install -U protobuf")
    ENDIF()
 ENDIF(PYTHONINTERP_FOUND)
 INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR})
 INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR})
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -26,25 +26,33 @@ SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include"
 # Used in unit test test_WarpCTCLayer
 SET(WARPCTC_LIB_DIR "${WARPCTC_INSTALL_DIR}/lib"
    CACHE PATH "Warp-ctc Library Directory" FORCE)
-SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_SHARED_LIBRARY_SUFFIX}"
-    CACHE FILEPATH "Warp-ctc Library" FORCE)
-IF(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" )
+IF(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR WIN32)
    SET(USE_OMP OFF)
 ELSE()
    SET(USE_OMP ON)
 ENDIF()
+IF(WIN32)
+    SET(WARPCTC_REPOSITORY "https://github.com/wopeizl/warp-ctc.git")
+ELSE()
+    SET(WARPCTC_REPOSITORY "https://github.com/dzhwinter/warp-ctc.git")
+ENDIF()
 ExternalProject_Add(
    extern_warpctc
    ${EXTERNAL_PROJECT_LOG_ARGS}
-    GIT_REPOSITORY  "https://github.com/dzhwinter/warp-ctc.git"
+    GIT_REPOSITORY ${WARPCTC_REPOSITORY}
    PREFIX          ${WARPCTC_SOURCES_DIR}
    UPDATE_COMMAND  ""
    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
                    -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
                    -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+                    -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
+                    -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
+                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+                    -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
+                    -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
                    -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}
                    -DWITH_GPU=${WITH_GPU}
                    -DWITH_OMP=${USE_OMP}
@@ -59,6 +67,18 @@ ExternalProject_Add(
                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
                     -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR}
 )
+IF(WIN32)  # Windows: WARPCTC_LIBRARIES points at lib/warpctc<suffix>
+    IF(NOT EXISTS "${WARPCTC_INSTALL_DIR}/lib/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}")  # configure-time check
+        add_custom_command(TARGET extern_warpctc POST_BUILD  # copy bin/ -> lib/ after the external build
+                COMMAND ${CMAKE_COMMAND} -E copy ${WARPCTC_INSTALL_DIR}/bin/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX} ${WARPCTC_INSTALL_DIR}/lib/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}
+                VERBATIM)
+    ENDIF()
+    SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}"
+            CACHE FILEPATH "Warp-ctc Library" FORCE)
+ELSE()  # other platforms: library carries the "lib" prefix
+    SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_SHARED_LIBRARY_SUFFIX}"
+            CACHE FILEPATH "Warp-ctc Library" FORCE)
+ENDIF()
 MESSAGE(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}")
 INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR}) # For warpctc code to include its headers.

--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -200,6 +200,13 @@ if (WITH_ANAKIN AND WITH_MKL)
    list(APPEND inference_deps anakin_inference_lib)
 endif ()
+if (TENSORRT_FOUND)
+    copy(tensorrt_lib DEPS ${inference_deps} 
+        SRCS ${TENSORRT_ROOT}/include/Nv*.h ${TENSORRT_ROOT}/lib/libnvinfer*
+        DSTS ${FLUID_INSTALL_DIR}/third_party/install/tensorrt/include ${FLUID_INSTALL_DIR}/third_party/install/tensorrt/lib)
+endif ()
 set(module "inference")
 if(WIN32)
    set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/libpaddle_fluid.*)

--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -84,7 +84,7 @@ function(op_library TARGET)
    endif()
    if (WIN32)
-    # remove windows unsupported op, because windows has no nccl, no warpctc such ops.
+    # remove windows unsupported ops, because windows has no nccl.
-    foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op")
+    foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op")
        if ("${TARGET}" STREQUAL "${windows_unsupport_op}")
          return()
        endif()

--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -209,6 +209,7 @@ paddle.fluid.layers.merge_selected_rows ArgSpec(args=['x', 'name'], varargs=None
 paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1))
 paddle.fluid.layers.psroi_pool ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.huber_loss ArgSpec(args=['input', 'label', 'delta'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
 paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
@@ -375,7 +376,7 @@ paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learnin
 paddle.fluid.optimizer.MomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, None, None))
 paddle.fluid.optimizer.AdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None))
+paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name', 'lazy_mode'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None, False))
 paddle.fluid.optimizer.AdamOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.optimizer.AdamaxOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None))
 paddle.fluid.optimizer.AdamaxOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))

--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -50,8 +50,10 @@ cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_
 cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
 cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope)
+cc_library(memory_optimize_pass SRCS analysis_var_pass.cc memory_reuse_types.cc DEPS graph graph_helper pass)
 cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper)
+cc_library(memory_early_delete_pass SRCS memory_early_delete_pass.cc DEPS memory_optimize_pass computation_op_handle scale_loss_grad_op_handle rpc_op_handle
+        all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass)
 cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle)
 cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper)
 cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass)
@@ -63,7 +65,12 @@ cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_he
 cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
        scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle)
-set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass) 
+set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass memory_early_delete_pass)
+if (WITH_GPU)
+  list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass)
+endif()
+cc_test(memory_reuse_types_test SRCS memory_reuse_types_test.cc memory_reuse_types.cc DEPS framework_proto graph)
+cc_test(analysis_var_pass_test SRCS analysis_var_pass_test.cc analysis_var_pass.cc memory_reuse_types.cc DEPS framework_proto graph graph_helper op_registry pass)
 cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS})
@@ -84,4 +91,5 @@ cc_test(fused_broadcast_op_test SRCS fused_broadcast_op_handle_test.cc DEPS fuse
 cc_library(build_strategy SRCS build_strategy.cc DEPS
        graph_viz_pass multi_devices_graph_pass
        multi_devices_graph_print_pass multi_devices_graph_check_pass
-        fuse_elewise_add_act_pass multi_batch_merge_pass)
+        fuse_elewise_add_act_pass multi_batch_merge_pass
+        memory_optimize_pass)
--- a/paddle/fluid/framework/details/analysis_var_pass.cc
+++ b/paddle/fluid/framework/details/analysis_var_pass.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/details/analysis_var_pass.h"
+#include <algorithm>
+#include <atomic>
+#include <deque>
+#include <fstream>
+#include <iostream>
+#include <iterator>
+#include <memory>
+#include <queue>
+#include <sstream>
+#include <string>
+#include <type_traits>
+#include <vector>
+#include "gflags/gflags.h"
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
+// Opt-in switch: when true, ApplyImpl() calls SubGraphOptimize() on ops that
+// carry a sub-block instead of skipping them.
+DEFINE_bool(enable_subgraph_optimize, false,
+            "SubGraph also reuse global graph variables, it will reduce the "
+            "memory occupation "
+            "but a higher risk of memory reuse error. default disabled.");
+// Name of one variable; ApplyImpl() emits extra VLOG output while matching an
+// op output whose name equals this value.
+DEFINE_string(memory_optimize_debug, "",
+              "debug the operator output variable when do the variable reuse. "
+              "memory reuse pass. "
+              "only for debug, default disabled.");
+namespace paddle {
+namespace framework {
+namespace details {
+// Two op descs count as the same op when their type, their input map and
+// their output map all compare equal (checked in that order).
+static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) {
+  if (op1->Type() != op2->Type()) return false;
+  if (!(op1->Inputs() == op2->Inputs())) return false;
+  return op1->Outputs() == op2->Outputs();
+}
+// Generic visitor: hands every element of the container to the callback
+// without any filtering.
+template <typename Container, typename Callback>
+class FilterVariableImpl {
+ public:
+  void operator()(const Container& nodes, Callback callback) {
+    for (auto it = nodes.begin(); it != nodes.end(); ++it) {
+      callback(*it);
+    }
+  }
+};
+// Specialization for op->inputs / op->outputs vectors: only nodes that are
+// variables and are not control variables reach the callback.
+template <typename Callback>
+class FilterVariableImpl<std::vector<ir::Node*>, Callback> {
+ public:
+  void operator()(const std::vector<ir::Node*>& nodes, Callback callback) {
+    for (ir::Node* candidate : nodes) {
+      if (!candidate->IsVar() || candidate->IsCtrlVar()) continue;
+      callback(candidate);
+    }
+  }
+};
+// Entry point: picks the implementation matching the container type and
+// applies the callback through it.
+template <typename Container, typename Callback>
+void FilterVariables(const Container& nodes, Callback callback) {
+  FilterVariableImpl<Container, Callback> visitor;
+  visitor(nodes, callback);
+}
+// Runs the variable-reuse pass over `graph` and returns the same graph.
+//
+// Steps, in order:
+//   1. add the argument names of every op that has a sub-block to skip_set_;
+//   2. build the ControlFlowGraph and run the live-variable analysis;
+//   3. walk the ops in CFG order; for each reusable output that is not also
+//      an input of the op, look for a match in pool_ and, when found, rename
+//      the output to the pooled variable in the CFG, the op descs and the IR
+//      graph nodes;
+//   4. after each op, put the variables that are live-in but not live-out of
+//      that op into pool_;
+//   5. call graph->ResolveHazard and export the pool content into the
+//      kGraphNodePool graph attribute.
+std::unique_ptr<ir::Graph> AnalysisVarPass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  auto nodes = graph->Nodes();
+  // variables touched by ops with a sub-block are excluded from reuse
+  auto subblock_vars = GetSubBlockVars(nodes);
+  skip_set_.insert(subblock_vars.begin(), subblock_vars.end());
+  cfg_.reset(new details::ControlFlowGraph(*graph));
+  cfg_->LiveVariableAnalysis();
+  InitSSAGraphNodes();
+  // running counter, used only in the VLOG line below
+  int reuse_id = 0;
+  for (size_t idx = 0; idx < cfg_->Ops().size(); ++idx) {
+    auto& op = cfg_->Ops()[idx];
+    auto* op_desc = op->Op();
+    // some op in graph has no op desc
+    if (op_desc == nullptr) continue;
+    if (OpHasSubBlock(op_desc)) {
+      if (FLAGS_enable_subgraph_optimize) {
+        SubGraphOptimize(op_desc);
+      } else {
+        VLOG(3) << op->Name()
+                << " has subblock, but disable subgraph optimize. skipped.";
+        continue;
+      }
+    }
+    for (auto& var : op->outputs) {
+      // only consider outputs that are not also read by this op
+      if (NodeCanReused(var) && cfg_->Use(op).count(var->Name()) == 0) {
+        ir::Node* cache = pool_.NodeMatch(var);
+        if (var->Name() == FLAGS_memory_optimize_debug) {
+          VLOG(3) << "start match var " << DebugString(var) << " of op "
+                  << op->Name();
+          VLOG(3) << pool_.ToString();
+          VLOG(3) << "matched in pool : "
+                  << ((cache == nullptr) ? "False" : "True");
+        }
+        if (cache != nullptr) {
+          if (var->Name() == cache->Name()) {
+            VLOG(3) << "The same cache variable is cascade reused."
+                    << var->Name() << " is re-filled to the pool after"
+                    << "the reused op is finished. Current op can not "
+                    << "replace it again. Skip this candidate.";
+            continue;
+          }
+          int node_idx_in_pool = pool_.GetIndex(cache);
+          VLOG(3) << string::Sprintf(
+              "!!! %s,  %s => %s, cache idx %d, pool size %d",
+              std::to_string(reuse_id++), DebugString(var), DebugString(cache),
+              node_idx_in_pool, static_cast<int>(pool_.size()));
+          // update CFG Graph on the fly.
+          // reused var maybe re-fill into the pool
+          cfg_->RenameVarInCFGGraph(var->Name(), cache->Name(), idx);
+          // NOTE(dzhwinter): we need to both update the ProgramDesc
+          // and IR Graph. because op_desc/var_desc is used in CreateOp,
+          // CreateVar when running happens. But IR Graph
+          // define the dependence relationship between nodes.
+          RenameVarInGraphDesc(var->Name(), cache->Name(), idx);
+          RenameVarInGraphNode(var->Name(), cache->Name(), idx, graph.get());
+          // the matched entry is consumed; remove it from the pool
+          pool_.Erase(cache);
+        }
+      }
+    }
+    // fill the pool
+    // (variables that are live before this op but no longer live after it)
+    for (auto var : cfg_->LiveIn(op)) {
+      if (cfg_->LiveOut(op).count(var) == 0) {
+        ir::Node* var_node = cfg_->GetNodeFromVarName(var, op);
+        if (var_node == nullptr) continue;
+        if (NodeCanReused(var_node) && !pool_.Has(var_node)) {
+          pool_.Insert(var_node, op);
+        }
+      }
+    }
+  }
+  graph->ResolveHazard(var_nodes_);
+  // For early delete pass. use GraphNodePool load the unlived vars.
+  // 1. find all deps op for each unlived var in memory pool.
+  for (auto& op : graph->Nodes()) {
+    for (auto& var : op->inputs) {
+      if (pool_.Has(var)) {
+        pool_.Insert(var, op);
+      }
+    }
+  }
+  // 2. convert ir node based memory pool to graph node
+  // because Node* may be released between passes.
+  auto& graph_pool = graph->Get<GraphNodePool>(kGraphNodePool);
+  for (auto it = pool_.begin(); it != pool_.end(); ++it) {
+    // each pool entry is stored as (variable name, set of op descs)
+    std::unordered_set<OpDesc*> descs;
+    for (auto& op : it->second) {
+      PADDLE_ENFORCE(op->IsOp());
+      descs.insert(op->Op());
+    }
+    graph_pool.push_back(std::make_pair(it->first->Name(), descs));
+  }
+  return graph;
+}
+// Tries to apply variable reuse inside the block referenced by the
+// "sub_block" attribute of `op_desc`.
+//
+// The sub-block's ops and vars are copied into a temporary ProgramDesc, an
+// ir::Graph is built from that copy, and then, following the op order of the
+// original sub-block, each reusable output that matches a pooled variable of
+// the same data type is renamed on the original op desc.
+// Only reached from ApplyImpl() when FLAGS_enable_subgraph_optimize is set.
+void AnalysisVarPass::SubGraphOptimize(OpDesc* op_desc) const {
+  // conditional block, while op and their grad op
+  auto* sub_block_desc =
+      AttrReader(op_desc->GetAttrMap()).Get<BlockDesc*>("sub_block");
+  // create a mirror block to construct an IR Graph.
+  ProgramDesc prog;
+  auto* copy_block = prog.MutableBlock(0);
+  for (auto* op : sub_block_desc->AllOps()) {
+    auto* copy_op = copy_block->AppendOp();
+    copy_op->CopyFrom(*op);
+    copy_op->Flush();
+  }
+  for (auto* var : sub_block_desc->AllVars()) {
+    auto* copy_var = copy_block->Var(var->Name());
+    copy_var->SetDataType(var->GetDataType());
+    // only lod tensor can be reused. So ignore the multiple dims case.
+    copy_var->SetType(var->GetType());
+    copy_var->SetShape(var->GetShape());
+    copy_var->SetPersistable(var->Persistable());
+  }
+  ir::Graph sub_graph(prog);
+  // NOTE(review): despite its name, this set only receives nodes that pass
+  // IsVar() && !IsCtrlVar(), i.e. variable nodes.
+  std::unordered_set<ir::Node*> sub_graph_all_ops;
+  FilterVariables(sub_graph.Nodes(), [&](ir::Node* var) {
+    // sub_graph_all_ops.emplace(var);
+    if (var->IsVar() && !var->IsCtrlVar()) {
+      sub_graph_all_ops.emplace(var);
+    }
+  });
+  // running counter, used only in the VLOG line below
+  int sub_reuse_id = 0;
+  // subgraph nodes is unordered, reuse need to follow the desc order.
+  // find the right op node through the descs
+  for (auto* sub_op_desc : sub_block_desc->AllOps()) {
+    ir::Node* sub_op = nullptr;
+    // NOTE(review): the candidates are variable nodes and the graph was built
+    // from copies of the op descs, yet the comparison is against the original
+    // op desc pointer. It looks like this lookup cannot succeed and the
+    // enforce below would fire -- confirm before enabling the flag.
+    for (auto* node : sub_graph_all_ops) {
+      if (node->Op() == sub_op_desc) {
+        sub_op = node;
+        break;
+      }
+    }
+    PADDLE_ENFORCE(sub_op != nullptr);
+    for (auto* var : sub_op->outputs) {
+      if (NodeCanReused(var)) {
+        ir::Node* cache = pool_.NodeMatch(var);
+        if (cache != nullptr) {
+          // a pooled candidate with a different data type is not used
+          if (var->Var()->GetDataType() != cache->Var()->GetDataType()) {
+            continue;
+          }
+          int node_idx_in_pool = pool_.GetIndex(cache);
+          VLOG(3) << string::Sprintf(
+              "!!! %s,  %s => %s, cache idx %d, pool size %d",
+              std::to_string(sub_reuse_id++), DebugString(var),
+              DebugString(cache), node_idx_in_pool,
+              static_cast<int>(pool_.size()));
+          // NOTE(dzh): subblock is not in IR graph. Modify the block_desc
+          // immediately to make the subblock variable reuse strategy take
+          // effect. Because it is a single op in graph. No need to
+          // update the ir nodes.
+          // NOTE(review): only this op desc is renamed here; other ops of the
+          // sub-block that reference the old name are not touched in this
+          // function -- TODO confirm that is intended.
+          sub_op_desc->Rename(var->Name(), cache->Name());
+          if (sub_op_desc->Block()->HasVar(var->Name())) {
+            sub_op_desc->Block()->RemoveVar(var->Name());
+          }
+        }
+      }
+    }
+  }
+}
+// Gathers the names of all input and output arguments of every op in `nodes`
+// that owns a sub-block (as decided by OpHasSubBlock). Nodes that are not
+// ops, and ops without an op desc, contribute nothing.
+std::unordered_set<std::string> AnalysisVarPass::GetSubBlockVars(
+    const std::unordered_set<ir::Node*>& nodes) const {
+  std::unordered_set<std::string> var_names;
+  for (ir::Node* node : nodes) {
+    const bool has_desc = node->IsOp() && node->Op() != nullptr;
+    if (!has_desc) continue;
+    OpDesc* desc = node->Op();
+    if (!OpHasSubBlock(desc)) continue;
+    for (const auto& arg : desc->InputArgumentNames()) {
+      var_names.insert(arg);
+    }
+    for (const auto& arg : desc->OutputArgumentNames()) {
+      var_names.insert(arg);
+    }
+  }
+  return var_names;
+}
+// Renames `var` to `cache_var` in the op descs of every CFG op from position
+// `idx` to the end: inputs and outputs are renamed, the old variable is
+// removed from the op's block when the block still declares it, and the op
+// desc is flushed.
+void AnalysisVarPass::RenameVarInGraphDesc(const std::string& var,
+                                           const std::string& cache_var,
+                                           size_t idx) const {
+  for (size_t pos = idx; pos < cfg_->Ops().size(); ++pos) {
+    ir::Node* op_node = cfg_->Ops()[pos];
+    PADDLE_ENFORCE(op_node->IsOp() && op_node->Op());
+    OpDesc* desc = op_node->Op();
+    desc->RenameInput(var, cache_var);
+    desc->RenameOutput(var, cache_var);
+    auto* owner_block = desc->Block();
+    if (owner_block->HasVar(var)) {
+      owner_block->RemoveVar(var);
+    }
+    desc->Flush();
+  }
+}
+// Fills var_nodes_ (name -> list of nodes carrying that name) from the inputs
+// and outputs of all CFG ops. Each node is recorded once, in first-seen
+// order. Does nothing when var_nodes_ already has content.
+void AnalysisVarPass::InitSSAGraphNodes() const {
+  if (!var_nodes_.empty()) return;
+  std::unordered_map<std::string, std::unordered_set<ir::Node*>> seen;
+  auto record = [&](ir::Node* node) {
+    // emplace(...).second is true only the first time this node is inserted
+    if (seen[node->Name()].emplace(node).second) {
+      var_nodes_[node->Name()].emplace_back(node);
+    }
+  };
+  for (auto* op : cfg_->Ops()) {
+    for (auto* in : op->inputs) record(in);
+    for (auto* out : op->outputs) record(out);
+  }
+}
+// Replaces the IR graph nodes named `var` by freshly created nodes named
+// `cache_var`, for all CFG ops from position `idx` to the end.
+//
+// The new nodes are created from a copy of the first `var` node's VarDesc
+// with only the name changed. Each replacement node takes over the edges of
+// the node it replaces, is appended to var_nodes_[cache_var], and finally
+// all nodes recorded under `var` are removed from the graph.
+void AnalysisVarPass::RenameVarInGraphNode(const std::string& var,
+                                           const std::string& cache_var,
+                                           size_t idx, ir::Graph* graph) const {
+  // if replace happens, we need to create a newer version cache_var
+  // but use the same dims/data_type with var.
+  PADDLE_ENFORCE(var_nodes_[var].size() >= 1 &&
+                 var_nodes_[var].at(0)->Var() != nullptr);
+  std::unique_ptr<VarDesc> var_desc(new VarDesc(*var_nodes_[var].at(0)->Var()));
+  var_desc->SetName(cache_var);
+  for (size_t i = idx; i < cfg_->Ops().size(); ++i) {
+    auto* op = cfg_->Ops()[i];
+    // redirect the input to the latest version of cache_var
+    for (auto* node : op->inputs) {
+      if (node->Name() == var) {
+        ir::Node* cache_node = graph->CreateVarNode(var_desc.get());
+        var_nodes_[cache_var].emplace_back(cache_node);
+        // swap node to cache_node
+        cache_node->outputs.insert(cache_node->outputs.end(),
+                                   node->outputs.begin(), node->outputs.end());
+        // the replaced input must have exactly one producer, and it is an op
+        PADDLE_ENFORCE(node->inputs.size() == 1 && node->inputs[0]->IsOp());
+        auto* prev_op = node->inputs[0];
+        // the producer now outputs cache_node instead of node
+        std::replace(prev_op->outputs.begin(), prev_op->outputs.end(), node,
+                     cache_node);
+        cache_node->inputs.emplace_back(prev_op);
+        // every consumer of node now reads cache_node
+        for (auto* next_op : node->outputs) {
+          std::replace(next_op->inputs.begin(), next_op->inputs.end(), node,
+                       cache_node);
+        }
+      }
+    }
+    // if we need to rename the output,
+    // always create a newer version of cache_var
+    for (auto* node : op->outputs) {
+      if (node->Name() == var) {
+        ir::Node* cache_node = graph->CreateVarNode(var_desc.get());
+        var_nodes_[cache_var].emplace_back(cache_node);
+        // swap node to cache node
+        cache_node->outputs.insert(cache_node->outputs.end(),
+                                   node->outputs.begin(), node->outputs.end());
+        // the current op is the producer of the new node
+        cache_node->inputs.emplace_back(op);
+        std::replace(op->outputs.begin(), op->outputs.end(), node, cache_node);
+        for (auto* next_op : node->outputs) {
+          std::replace(next_op->inputs.begin(), next_op->inputs.end(), node,
+                       cache_node);
+        }
+      }
+    }
+  }
+  // release node of unused var in graph
+  for (auto* node : var_nodes_[var]) {
+    graph->RemoveNode(node);
+  }
+  var_nodes_.at(var).clear();
+}
+// Decides whether `node` may take part in variable reuse.
+//
+// Returns false when the node is not a (non-control) variable, when its desc
+// is persistable, is not a LOD_TENSOR or has an empty shape, when its name is
+// wrapped in '@' characters, or when the name is in skip_set_. Otherwise the
+// first producer op carrying a "force_cpu" attribute decides: the node is
+// reusable only if that attribute is false. Without such a producer the
+// node is reusable.
+bool AnalysisVarPass::NodeCanReused(ir::Node* node) const {
+  if (!node->IsVar() || node->IsCtrlVar()) return false;
+  auto* desc = node->Var();
+  auto type = desc->GetType();
+  if (desc->Persistable() || type != proto::VarType::LOD_TENSOR ||
+      desc->GetShape().empty()) {
+    return false;
+  }
+  // vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad
+  std::string name = node->Name();
+  if (!name.empty() && name.front() == '@' && name.back() == '@') return false;
+  if (skip_set_.count(name)) return false;
+  for (auto* op : node->inputs) {
+    // some op in graph has no op desc; there is no attribute to inspect then,
+    // so skip it instead of dereferencing a null pointer.
+    auto* op_desc = op->Op();
+    if (op_desc == nullptr) continue;
+    if (op_desc->HasAttr("force_cpu")) {
+      // op output force generated in cpu, can not be reused.
+      return !framework::AttrReader(op_desc->GetAttrMap())
+                  .Get<bool>("force_cpu");
+    }
+  }
+  return true;
+}
+// True when at least one attribute of `desc` holds a BlockDesc* or a
+// std::vector<BlockDesc*>.
+bool AnalysisVarPass::OpHasSubBlock(OpDesc* desc) const {
+  const AttributeMap& attrs = desc->GetAttrMap();
+  return std::any_of(
+      attrs.begin(), attrs.end(), [](const AttributeMap::value_type& attr) {
+        const auto& held = attr.second.type();
+        return held == typeid(BlockDesc*) ||             // NOLINT
+               held == typeid(std::vector<BlockDesc*>);  // NOLINT
+      });
+}
+// Returns the op nodes of `graph` (those with an op desc) in an order that
+// respects the data dependencies between ops and, among the ops that are
+// ready at the same time, follows the order of the kAllOpDescs attribute.
+//
+// Requires the graph to carry kAllOpDescs. Ready ops that match no remaining
+// op desc are emitted as a batch once a full scan of the remaining descs
+// found no match. Enforces at the end that every op had all of its
+// dependencies satisfied.
+std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph) {
+  PADDLE_ENFORCE(graph.Has(kAllOpDescs),
+                 "Graph has no attribute of kAllOpDescs.");
+  // 1. get op desc order
+  auto& op_descs = graph.Get<const std::vector<OpDesc*>>(kAllOpDescs);
+  // 2. topology sort order
+  auto nodes = graph.Nodes();
+  std::deque<ir::Node*> ops;
+  FilterVariables(nodes, [&](ir::Node* op) {
+    if (op->IsOp() && op->Op() != nullptr) {
+      ops.emplace_back(op);
+    }
+  });
+  // op -> number of distinct producer ops it still waits for
+  std::unordered_map<ir::Node*, size_t> op_deps;
+  // ops whose dependencies are all satisfied but which are not emitted yet
+  std::list<ir::Node*> ready_ops;
+  // producer op -> ops that consume one of its outputs
+  std::unordered_map<ir::Node*, std::unordered_set<ir::Node*>> pending_ops;
+  for (auto* op : ops) {
+    std::unordered_set<ir::Node*> preceding_op;
+    for (auto* in : op->inputs) {
+      // an input without producer adds no dependency
+      if (in->inputs.empty()) continue;
+      PADDLE_ENFORCE(in->inputs.size() == 1 && in->inputs[0]->IsOp());
+      preceding_op.emplace(in->inputs[0]);
+      pending_ops[in->inputs[0]].emplace(op);
+    }
+    op_deps[op] = preceding_op.size();
+    if (preceding_op.empty()) {
+      ready_ops.emplace_back(op);
+    }
+  }
+  // 3. generated op list based desc order and the topology order
+  std::vector<ir::Node*> ret;
+  std::list<OpDesc*> op_descs_list(op_descs.begin(), op_descs.end());
+  // emits found_node and releases the ops that were waiting for it
+  auto update_by_found_node = [&](ir::Node* found_node) {
+    for (auto* pending_op : pending_ops[found_node]) {
+      if (--op_deps[pending_op] == 0) {
+        ready_ops.emplace_back(pending_op);
+      }
+    }
+    ready_ops.remove(found_node);
+    ret.emplace_back(found_node);
+  };
+  while (!ready_ops.empty()) {
+    bool all_of_ready_op_unmatched = true;
+    for (auto it = op_descs_list.begin(); it != op_descs_list.end();) {
+      auto op_desc = *it;
+      ir::Node* found_node = nullptr;
+      for (auto* op : ready_ops) {
+        if (IsSameDesc(op->Op(), op_desc)) {
+          found_node = op;
+          break;
+        }
+      }
+      // 3.1 op desc deleted by other pass
+      if (found_node == nullptr) {
+        ++it;
+        continue;
+      } else {
+        all_of_ready_op_unmatched = false;
+        it = op_descs_list.erase(it);
+      }
+      update_by_found_node(found_node);
+    }
+    // 3.2 op descs are added by other pass
+    // preceding op non empty means some new op descs are
+    // created, but not contained in return node list.
+    // these new op desc may depend on each other.
+    // iterate over a copy because update_by_found_node mutates ready_ops
+    std::list<ir::Node*> prev_ready_ops(ready_ops);
+    if (all_of_ready_op_unmatched) {
+      for (auto op : prev_ready_ops) {
+        update_by_found_node(op);
+      }
+    }
+  }
+  // use the map's own value_type so that no temporary pair is built per entry
+  PADDLE_ENFORCE(std::all_of(
+      op_deps.begin(), op_deps.end(),
+      [&](const std::unordered_map<ir::Node*, size_t>::value_type& p) {
+        return p.second == 0;
+      }));
+  return ret;
+}
+// Builds the control flow graph of `graph`: orders the ops with
+// SortOpLikeDescOrder() and links them through ConnectNodes().
+ControlFlowGraph::ControlFlowGraph(const ir::Graph& graph) {
+  ops_ = SortOpLikeDescOrder(graph);
+  ConnectNodes();
+}
+// Alternative to ConnectNodes(): derives predecessor/successor links from the
+// producer and consumer ops of each variable node, and records the names of
+// non-control variables read (uses_) and written (defs_) by every op.
+void ControlFlowGraph::BuildCFGGraph() {
+  // FIXME(dzh): same effect with ConnectNodes, but use the control
+  // link to build dependency graph, it goes wrong in transformer.
+  auto is_data_var = [](ir::Node* n) { return n->IsVar() && !n->IsCtrlVar(); };
+  for (ir::Node* op : ops_) {
+    for (ir::Node* in_var : op->inputs) {
+      const auto& producers = in_var->inputs;
+      if (!producers.empty()) {
+        PADDLE_ENFORCE(
+            producers.size() == 1 && producers[0]->IsOp(),
+            "Preceding Op Node of Var Node must be unique");
+        ir::Node* producer = producers[0];
+        // producers without an op desc are not linked
+        if (producer->Op() != nullptr) {
+          predecessors_[op].insert(producer);
+          successors_[producer].insert(op);
+        }
+      }
+      if (is_data_var(in_var)) {
+        uses_[op].insert(in_var->Name());
+      }
+    }
+    for (ir::Node* out_var : op->outputs) {
+      // an output may feed several ops; each of them becomes a successor
+      for (ir::Node* consumer : out_var->outputs) {
+        if (consumer->Op() == nullptr) continue;
+        successors_[op].insert(consumer);
+        predecessors_[consumer].insert(op);
+      }
+      if (is_data_var(out_var)) {
+        defs_[op].insert(out_var->Name());
+      }
+    }
+  }
+}
+// Links the ops of ops_ into a linear chain (each op is the predecessor of
+// the next one in the list) and records, per op, the names of the
+// non-control variables it reads (uses_) and writes (defs_).
+void ControlFlowGraph::ConnectNodes() {
+  for (size_t i = 0; i < ops_.size(); ++i) {
+    auto& op = ops_[i];
+    // the last op has no successor; test the bound explicitly instead of
+    // relying on an out_of_range exception
+    if (i + 1 < ops_.size()) {
+      auto& next_op = ops_[i + 1];
+      successors_[op].insert(next_op);
+      predecessors_[next_op].insert(op);
+    }
+    FilterVariables(op->inputs,
+                    [&](ir::Node* var) { uses_[op].emplace(var->Name()); });
+    FilterVariables(op->outputs,
+                    [&](ir::Node* var) { defs_[op].emplace(var->Name()); });
+  }
+}
+// Computes live_in_ / live_out_ for every op with a backward work-list
+// iteration over the successor/predecessor links:
+//   live_out[op] = union of live_in[s] for every successor s of op
+//   live_in[op]  = use[op] U (live_out[op] - def[op])
+// A variable that an op both reads and writes therefore stays live before
+// that op.
+void ControlFlowGraph::LiveVariableAnalysis() {
+  // NOTE(dzh): variable liveness analysis (a.k.a reversed_ops algorithm)
+  // compute the liveness of for each variable though reversed_ops algorithm.
+  // It iterates the operators from end to begin, compute the live in/live out
+  // variable set for each op, then the diff between in/out will be used for
+  // the variable reuse. For detail refer to
+  // http://www.cs.cornell.edu/courses/cs4120/2013fa/lectures/lec26-fa13.pdf
+  std::list<ir::Node*> work_list(ops_.rbegin(), ops_.rend());
+  while (!work_list.empty()) {
+    ir::Node* op = work_list.front();
+    work_list.pop_front();
+    auto& live_in = live_in_[op];
+    auto& live_out = live_out_[op];
+    // get the live_in calculated before. Empty if first.
+    auto prev_live_in = std::move(live_in);
+    // make the moved-from set explicitly empty before rebuilding it
+    live_in.clear();
+    for (auto& s : successors_[op]) {
+      for (auto& var : live_in_[s]) {
+        live_out.insert(var);
+      }
+    }
+    // live_out minus the variables defined by this op ...
+    const auto& defs = defs_[op];
+    for (auto& var : live_out) {
+      if (defs.count(var) == 0) {
+        live_in.insert(var);
+      }
+    }
+    // ... plus everything this op reads, even if it also writes it
+    for (auto& var : uses_[op]) {
+      live_in.insert(var);
+    }
+    // If the live_in is not changed, then the liveness analysis of
+    // predecessors is completed.
+    //
+    // Otherwise, recalculate the predecessors liveness
+    if (live_in != prev_live_in) {
+      for (auto& pre : predecessors_[op]) {
+        work_list.push_back(pre);
+      }
+    }
+  }
+}
+// Replaces the variable name old_node by new_node in the uses/defs/live-in/
+// live-out sets of every op from position begin_idx to the end of ops_.
+// A begin_idx outside [0, ops_.size()) renames nothing.
+void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node,
+                                           const std::string& new_node,
+                                           int begin_idx) {
+  // Swap the name inside one set, only when the old name is present.
+  auto rename_in = [&](std::set<std::string>* vars) {
+    if (vars->find(old_node) != vars->end()) {
+      vars->erase(old_node);
+      vars->insert(new_node);
+    }
+  };
+  // Compare with `<` rather than `!=`: a negative begin_idx converts to a
+  // huge size_t, and an index past the end would otherwise never meet the
+  // stop condition and read outside ops_.
+  for (size_t i = static_cast<size_t>(begin_idx); i < ops_.size(); ++i) {
+    auto* op = ops_[i];
+    rename_in(&uses_[op]);
+    rename_in(&defs_[op]);
+    rename_in(&live_in_[op]);
+    rename_in(&live_out_[op]);
+  }
+}
+// Returns a copy of the live-in set stored for `op`; enforces that `op` has
+// an entry in live_in_.
+const std::set<std::string> ControlFlowGraph::LiveIn(ir::Node* op) const {
+  const auto found = live_in_.find(op);
+  const bool has_entry = !(found == live_in_.end());
+  PADDLE_ENFORCE(
+      has_entry,
+      string::Sprintf("Expect %s in live_in, but Not Found.", op->Name()));
+  return found->second;
+}
+// Returns a copy of the live-out set stored for `op`; enforces that `op` has
+// an entry in live_out_.
+const std::set<std::string> ControlFlowGraph::LiveOut(ir::Node* op) const {
+  const auto found = live_out_.find(op);
+  const bool has_entry = !(found == live_out_.end());
+  PADDLE_ENFORCE(
+      has_entry,
+      string::Sprintf("Expect %s in live_out, but Not Found.", op->Name()));
+  return found->second;
+}
+// Returns a copy of the set of variable names read by `op`; enforces that
+// `op` has an entry in uses_.
+const std::set<std::string> ControlFlowGraph::Use(ir::Node* op) const {
+  auto it = uses_.find(op);
+  // The message names the container actually searched (uses_), not live_out.
+  PADDLE_ENFORCE(
+      it != uses_.end(),
+      string::Sprintf("Expect %s in uses, but Not Found.", op->Name()));
+  return it->second;
+}
+// Const overload: hands back a copy of the op sequence.
+const std::vector<ir::Node*> ControlFlowGraph::Ops() const {
+  return ops_;
+}
+// Non-const overload: hands back a reference to the stored op sequence.
+std::vector<ir::Node*>& ControlFlowGraph::Ops() {
+  return ops_;
+}
+ir::Node* ControlFlowGraph::GetNodeFromVarName(const std::string& name,
+                                               ir::Node* op) const {
+  // In the ssa-graph, different versions of a variable share one name.
+  // Walk the ops that precede `op` and remember the last output node whose
+  // name matches, i.e. the latest version written before `op`.
+  // The result stays nullptr when no earlier op writes `name`
+  // (such as a data node).
+  ir::Node* latest = nullptr;
+  for (auto it = ops_.begin(); it != ops_.end() && *it != op; ++it) {
+    for (auto& out_var : (*it)->outputs) {
+      if (out_var->Name() != name) continue;
+      latest = out_var;
+    }
+  }
+  return latest;
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
+// Register the pass under the name "analysis_var_pass"; it requires the
+// kGraphNodePool and kAllOpDescs attributes to be present on the graph.
+REGISTER_PASS(analysis_var_pass,
+              paddle::framework::details::AnalysisVarPass)
+    .RequireGraphAttr(paddle::framework::details::kGraphNodePool)
+    .RequireGraphAttr(paddle::framework::details::kAllOpDescs);
--- a/paddle/fluid/framework/details/analysis_var_pass.h
+++ b/paddle/fluid/framework/details/analysis_var_pass.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <algorithm>
+#include <list>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/details/memory_reuse_types.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/pass.h"
+namespace paddle {
+namespace framework {
+namespace details {
+constexpr char kAllOpDescs[] = "all_op_descs";
+std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph);
+// sort op in bfs order
+std::vector<ir::Node*> BFSSortGraphOps(const ir::Graph& graph);
+class ControlFlowGraph;
+// Graph pass declaration. All state is `mutable` because ApplyImpl and the
+// helpers below are const member functions.
+class AnalysisVarPass : public ir::Pass {
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
+ private:
+  // Fills var_nodes_, the per-name list of variable nodes, by version.
+  void InitSSAGraphNodes() const;
+  // Renames `var` to `cache_var` in the program descs.
+  void RenameVarInGraphDesc(const std::string& var,
+                            const std::string& cache_var, size_t idx) const;
+  // Renames `var` to `cache_var` in the ir nodes of `graph`.
+  void RenameVarInGraphNode(const std::string& var,
+                            const std::string& cache_var, size_t idx,
+                            ir::Graph* graph) const;
+  void SubGraphOptimize(OpDesc* op_desc) const;
+  // Tells whether the tensor behind `node` may be reused.
+  bool NodeCanReused(ir::Node* node) const;
+  // Scans sub-blocks and collects their input/output variable names.
+  std::unordered_set<std::string> GetSubBlockVars(
+      const std::unordered_set<ir::Node*>&) const;
+  // Tells whether the op described by `desc` has a sub-block.
+  bool OpHasSubBlock(OpDesc* desc) const;
+  // Pool of reusable nodes; owned by the pass.
+  mutable OrderedNodePairPool pool_;
+  // Control flow graph.
+  mutable std::unique_ptr<ControlFlowGraph> cfg_;
+  // Skip set of variable names.
+  mutable std::unordered_set<std::string> skip_set_;
+  // Variable nodes, keyed by variable name.
+  mutable std::map<std::string, std::vector<ir::Node*>> var_nodes_;
+};
+// Holds an op sequence together with per-op successor/predecessor links and
+// the use/def/live-in/live-out variable-name sets.
+class ControlFlowGraph {
+ public:
+  ControlFlowGraph() = default;
+  // Builds the control flow graph from an IR Graph (parallel executor).
+  explicit ControlFlowGraph(const ir::Graph& graph);
+  void LiveVariableAnalysis();
+  void RenameVarInCFGGraph(const std::string& old_node,
+                           const std::string& new_node, int begin_idx);
+  const std::set<std::string> LiveIn(ir::Node* op) const;
+  const std::set<std::string> LiveOut(ir::Node* op) const;
+  const std::set<std::string> Use(ir::Node* op) const;
+  const std::vector<ir::Node*> Ops() const;
+  std::vector<ir::Node*>& Ops();
+  // For ssa-graph nodes: look up a variable node by name relative to `op`.
+  ir::Node* GetNodeFromVarName(const std::string& name, ir::Node* op) const;
+ private:
+  using NodeListMap = std::unordered_map<ir::Node*, std::set<ir::Node*>>;
+  using VarSetMap = std::map<ir::Node*, std::set<std::string>>;
+  void BuildCFGGraph();
+  void ConnectNodes();
+  // Ops that follow a given op and use its output variables.
+  NodeListMap successors_;
+  // Ops that precede a given op and generate its input variables.
+  NodeListMap predecessors_;
+  // Variables that are live before the op runs.
+  VarSetMap live_in_;
+  // Variables that are live after the op has run.
+  VarSetMap live_out_;
+  // Op inputs.
+  VarSetMap uses_;
+  // Op outputs.
+  VarSetMap defs_;
+  // Op sequence, ordered by topology sort.
+  std::vector<ir::Node*> ops_;
+};
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/analysis_var_pass_test.cc
+++ b/paddle/fluid/framework/details/analysis_var_pass_test.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/details/analysis_var_pass.h"
+#include <algorithm>
+#include <iostream>
+#include <iterator>
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/program_desc.h"
+namespace paddle {
+namespace framework {
+// Test operator whose RunImpl has an empty body.
+class DummyOp : public OperatorBase {
+ public:
+  DummyOp(const std::string& type, const VariableNameMap& inputs,
+          const VariableNameMap& outputs, const AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+ private:
+  // Nothing to execute: neither argument is used.
+  void RunImpl(const Scope&, const platform::Place&) const override {}
+};
+// Proto maker for the test ops: duplicable input "X", output "Out",
+// empty descriptions and comment.
+class SumOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    auto& x_input = AddInput("X", "");
+    x_input.AsDuplicable();
+    AddOutput("Out", "");
+    AddComment("");
+  }
+};
+// Proto maker for the test "assign" op: duplicable input "X", output "Out",
+// empty descriptions and comment.
+class AssignOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    auto& x_input = AddInput("X", "");
+    x_input.AsDuplicable();
+    AddOutput("Out", "");
+    AddComment("");
+  }
+};
+// Gives the first "Out" variable the same type as the first "X" variable.
+class DummyVarTypeInference : public VarTypeInference {
+ public:
+  void operator()(const OpDesc& op_desc, BlockDesc* block) const override {
+    // Read the input type first, then look up the output variable.
+    auto& x_names = op_desc.Input("X");
+    auto* first_input = block->Var(x_names.front());
+    auto input_type = first_input->GetType();
+    auto out_name = op_desc.Output("Out").front();
+    auto* out_var = block->Var(out_name);
+    out_var->SetType(input_type);
+  }
+};
+}  // namespace framework
+}  // namespace paddle
+// Register the test ops "sum", "assign" and "dummy". All of them run DummyOp
+// and use DummyVarTypeInference; "dummy" shares SumOpMaker with "sum".
+REGISTER_OPERATOR(
+    sum, paddle::framework::DummyOp, paddle::framework::SumOpMaker,
+    paddle::framework::DummyVarTypeInference);
+REGISTER_OPERATOR(
+    assign, paddle::framework::DummyOp, paddle::framework::AssignOpMaker,
+    paddle::framework::DummyVarTypeInference);
+REGISTER_OPERATOR(
+    dummy, paddle::framework::DummyOp, paddle::framework::SumOpMaker,
+    paddle::framework::DummyVarTypeInference);
+/*
+  https://en.wikipedia.org/wiki/Live_variable_analysis
+  Create a custom classical dependency graph; the left column is the
+  instruction number.
+  1. a = 1
+  2. b = a
+  3. c = a
+  4. d = b + c
+  5. e = d
+  a--------+
+  |        |
+  b        c
+  |        |
+  d--------+
+  |
+  e
+  Then analyze the liveness range of these variables.
+ */
+namespace paddle {
+namespace framework {
+namespace details {
+// Two op descs are considered the same when their type, inputs and outputs
+// all compare equal.
+static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) {
+  if (!(op1->Type() == op2->Type())) return false;
+  if (!(op1->Inputs() == op2->Inputs())) return false;
+  return op1->Outputs() == op2->Outputs();
+}
+// Builds the program used by the tests in this file:
+//   b = assign(a); c = assign(a); d = sum(b, c); e = assign(d)
+// with a..e declared as LOD_TENSOR variables in block 0.
+inline static ProgramDesc FillProgramDesc() {
+  ProgramDesc prog;
+  for (const char* var_name : {"a", "b", "c", "d", "e"}) {
+    prog.MutableBlock(0)->Var(var_name)->SetType(proto::VarType::LOD_TENSOR);
+  }
+  // Append one op with input slot "X" and output slot "Out" to block 0.
+  auto append_op = [&prog](const std::string& type,
+                           const std::vector<std::string>& x_names,
+                           const std::string& out_name) {
+    auto* op = prog.MutableBlock(0)->AppendOp();
+    op->SetType(type);
+    op->SetInput("X", x_names);
+    op->SetOutput("Out", {out_name});
+  };
+  append_op("assign", {"a"}, "b");
+  append_op("assign", {"a"}, "c");
+  append_op("sum", {"b", "c"}, "d");
+  append_op("assign", {"d"}, "e");
+  return prog;
+}
+// Streams every element of `c` followed by a single space and returns the
+// concatenation (so a non-empty result ends with a space).
+template <typename Container>
+inline static std::string DebugString(const Container& c) {
+  std::stringstream buffer;
+  for (auto it = std::begin(c); it != std::end(c); ++it) {
+    buffer << *it << " ";
+  }
+  return buffer.str();
+}
+// Checks the live-in/live-out sets of the four ops of FillProgramDesc().
+TEST(CFGGraph, IRGraph) {
+  // prepare ir graph
+  auto prog = FillProgramDesc();
+  ir::Graph graph(prog);
+  const std::vector<OpDesc*>* all_op_descs =
+      new std::vector<OpDesc*>(prog.Block(0).AllOps());
+  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
+  ControlFlowGraph cfg(graph);
+  cfg.LiveVariableAnalysis();
+  using VarSet = std::set<std::string>;
+  // Expected {live-in, live-out} per op, in op order:
+  // assign(a->b), assign(a->c), sum(b,c->d), assign(d->e).
+  const std::vector<std::pair<VarSet, VarSet>> expected = {
+      std::make_pair(VarSet{"a"}, VarSet{"a", "b"}),
+      std::make_pair(VarSet{"a", "b"}, VarSet{"b", "c"}),
+      std::make_pair(VarSet{"b", "c"}, VarSet{"d"}),
+      std::make_pair(VarSet{"d"}, VarSet{}),
+  };
+  for (size_t i = 0; i < expected.size(); ++i) {
+    ASSERT_TRUE((expected[i].first == cfg.LiveIn(cfg.Ops()[i])));
+    ASSERT_TRUE((expected[i].second == cfg.LiveOut(cfg.Ops()[i])));
+  }
+}
+// 1. normal test
+// The sorted op nodes of an unmodified graph match the block's op descs.
+TEST(SortOpLikeDescOrder, NormalTest) {
+  auto prog = FillProgramDesc();
+  ir::Graph graph(prog);
+  const std::vector<OpDesc*>* all_op_descs =
+      new std::vector<OpDesc*>(prog.Block(0).AllOps());
+  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
+  auto sorted_ops = SortOpLikeDescOrder(graph);
+  auto expected_descs = prog.Block(0).AllOps();
+  size_t position = 0;
+  for (auto* sorted_op : sorted_ops) {
+    ASSERT_TRUE(IsSameDesc(sorted_op->Op(), expected_descs[position]));
+    ++position;
+  }
+}
+// 2. remove some op_desc
+// Removes the op that writes "e" from both the graph and the expected desc
+// list, then checks that the remaining ops keep their order.
+TEST(SortOpLikeDescOrder, RemoveOpDesc) {
+  auto prog = FillProgramDesc();
+  ir::Graph graph(prog);
+  const std::vector<OpDesc*>* all_op_descs =
+      new std::vector<OpDesc*>(prog.Block(0).AllOps());
+  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
+  auto nodes = graph.Nodes();
+  auto op_descs = prog.Block(0).AllOps();
+  // Locate the op node whose last output is "e".
+  ir::Node* found_node = nullptr;
+  for (auto node : nodes) {
+    if (node->IsOp() && node->outputs.back()->Name() == "e") {
+      found_node = node;
+      break;
+    }
+  }
+  PADDLE_ENFORCE(found_node != nullptr);
+  // Drop the matching desc from the expected list.
+  for (auto it = op_descs.begin(); it != op_descs.end();) {
+    if (IsSameDesc(*it, found_node->Op())) {
+      it = op_descs.erase(it);
+    } else {
+      ++it;
+    }
+  }
+  auto find_node_in_graph = [&](std::string s) {
+    ir::Node* ret = nullptr;
+    for (auto n : graph.Nodes()) {
+      if (n->Name() == s) {
+        ret = n;
+        break;
+      }
+    }
+    PADDLE_ENFORCE(ret != nullptr);
+    return ret;
+  };
+  ir::Node* e = find_node_in_graph("e");
+  ir::Node* d = find_node_in_graph("d");
+  // std::remove only shifts elements; erase is needed to actually unlink the
+  // op, otherwise `d` keeps a pointer to a node the graph is about to remove.
+  d->outputs.erase(
+      std::remove(d->outputs.begin(), d->outputs.end(), found_node),
+      d->outputs.end());
+  graph.RemoveNode(found_node);
+  graph.RemoveNode(e);
+  // other node keeps the same order
+  auto remain_nodes = SortOpLikeDescOrder(graph);
+  for (size_t i = 0; i < remain_nodes.size(); ++i) {
+    auto node = remain_nodes[i];
+    auto op_desc = op_descs[i];
+    ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
+  }
+}
+// 3. add some op_desc
+// Appends a new sum op (b, c -> d1) to the program and the graph while the
+// cached kAllOpDescs still holds the original four descs, then checks the
+// sorted order against the extended desc list.
+TEST(SortOpLikeDescOrder, AddOpDesc) {
+  auto prog = FillProgramDesc();
+  const std::vector<OpDesc*>* all_op_descs =
+      new std::vector<OpDesc*>(prog.Block(0).AllOps());
+  ir::Graph graph(prog);
+  // Returns the first graph node named `s`; enforces that one exists.
+  auto find_node_in_graph = [&](std::string s) {
+    ir::Node* hit = nullptr;
+    for (auto candidate : graph.Nodes()) {
+      if (candidate->Name() != s) continue;
+      hit = candidate;
+      break;
+    }
+    PADDLE_ENFORCE(hit != nullptr);
+    return hit;
+  };
+  // The cached descs differ from the real ones: this mimics an
+  // intermediate pass that modified the program desc.
+  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
+  auto expected_descs = prog.Block(0).AllOps();
+  auto new_desc = prog.MutableBlock(0)->AppendOp();
+  prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
+  new_desc->SetType("sum");
+  new_desc->SetInput("X", {"b", "c"});
+  new_desc->SetOutput("Out", {"d1"});
+  ir::Node* new_op = graph.CreateOpNode(new_desc);
+  ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1"));
+  ir::Node* b = find_node_in_graph("b");
+  ir::Node* c = find_node_in_graph("c");
+  // Wire the new op between its inputs (b, c) and its output (d1).
+  new_op->outputs.emplace_back(d1);
+  new_op->inputs.emplace_back(b);
+  new_op->inputs.emplace_back(c);
+  d1->inputs.emplace_back(new_op);
+  b->outputs.emplace_back(new_op);
+  c->outputs.emplace_back(new_op);
+  expected_descs.insert(expected_descs.begin() + 4, new_desc);
+  auto sorted_ops = SortOpLikeDescOrder(graph);
+  size_t position = 0;
+  for (auto* sorted_op : sorted_ops) {
+    ASSERT_TRUE(IsSameDesc(sorted_op->Op(), expected_descs[position]));
+    ++position;
+  }
+}
+// 4. add and delete some op_desc
+// Removes the original sum op (and what depends on it) from the graph, adds
+// a new sum op (b, c -> d1), and checks the resulting order.
+TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) {
+  auto prog = FillProgramDesc();
+  ir::Graph graph(prog);
+  const std::vector<OpDesc*>* all_op_descs =
+      new std::vector<OpDesc*>(prog.Block(0).AllOps());
+  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
+  auto find_node_in_graph = [&](std::string s) {
+    ir::Node* ret = nullptr;
+    for (auto n : graph.Nodes()) {
+      if (n->Name() == s) {
+        ret = n;
+        break;
+      }
+    }
+    PADDLE_ENFORCE(ret != nullptr);
+    return ret;
+  };
+  // remove sum node
+  auto op_descs = prog.Block(0).AllOps();
+  ir::Node* found_node = nullptr;
+  auto nodes = graph.Nodes();
+  for (auto node : nodes) {
+    if (node->Name() == "sum") {
+      found_node = node;
+      break;
+    }
+  }
+  PADDLE_ENFORCE(found_node != nullptr);
+  for (auto it = op_descs.begin(); it != op_descs.end();) {
+    if (IsSameDesc(*it, found_node->Op())) {
+      it = op_descs.erase(it);
+    } else {
+      ++it;
+    }
+  }
+  {
+    ir::Node* d = find_node_in_graph("d");
+    ir::Node* c = find_node_in_graph("c");
+    ir::Node* e = find_node_in_graph("e");
+    // std::remove only shifts elements; erase is needed to really drop the
+    // pointer to the op that is removed from the graph below.
+    // NOTE(review): the sum op produces "d", so it is presumably listed in
+    // d->inputs rather than d->outputs, and also in b->outputs - confirm
+    // whether those edges should be unlinked here too.
+    d->outputs.erase(
+        std::remove(d->outputs.begin(), d->outputs.end(), found_node),
+        d->outputs.end());
+    c->outputs.erase(
+        std::remove(c->outputs.begin(), c->outputs.end(), found_node),
+        c->outputs.end());
+    ir::Node* pending_op = found_node->outputs[0]->outputs[0];
+    graph.RemoveNode(e);
+    graph.RemoveNode(pending_op);
+    graph.RemoveNode(found_node);
+  }
+  // add node
+  auto op = prog.MutableBlock(0)->AppendOp();
+  prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
+  op->SetType("sum");
+  op->SetInput("X", {"b", "c"});
+  op->SetOutput("Out", {"d1"});
+  {
+    ir::Node* node = graph.CreateOpNode(op);
+    ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1"));
+    ir::Node* b = find_node_in_graph("b");
+    ir::Node* c = find_node_in_graph("c");
+    node->outputs.emplace_back(d1);
+    node->inputs.emplace_back(b);
+    node->inputs.emplace_back(c);
+    b->outputs.emplace_back(node);
+    c->outputs.emplace_back(node);
+  }
+  op_descs.insert(op_descs.begin() + 2, op);
+  // check the order
+  auto mynodes = SortOpLikeDescOrder(graph);
+  for (size_t i = 0; i < mynodes.size(); ++i) {
+    auto node = mynodes[i];
+    auto op_desc = op_descs[i];
+    ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
+  }
+}
+// 5. add and replace some op_desc inplace.
+// Adds a sum op (b, c -> d1), then replaces the assign op that writes "e" by
+// a sum op (d, d1 -> e), and checks the resulting order.
+TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) {
+  auto prog = FillProgramDesc();
+  ir::Graph graph(prog);
+  const std::vector<OpDesc*>* all_op_descs =
+      new std::vector<OpDesc*>(prog.Block(0).AllOps());
+  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
+  auto find_node_in_graph = [&](std::string s) {
+    ir::Node* ret = nullptr;
+    for (auto n : graph.Nodes()) {
+      if (n->Name() == s) {
+        ret = n;
+        break;
+      }
+    }
+    PADDLE_ENFORCE(ret != nullptr);
+    return ret;
+  };
+  auto op_descs = prog.Block(0).AllOps();
+  // add node
+  auto op = prog.MutableBlock(0)->AppendOp();
+  prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
+  op->SetType("sum");
+  op->SetInput("X", {"b", "c"});
+  op->SetOutput("Out", {"d1"});
+  {
+    ir::Node* node = graph.CreateOpNode(op);
+    ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1"));
+    ir::Node* b = find_node_in_graph("b");
+    ir::Node* c = find_node_in_graph("c");
+    node->outputs.emplace_back(d1);
+    node->inputs.emplace_back(b);
+    node->inputs.emplace_back(c);
+    d1->inputs.emplace_back(node);
+    b->outputs.emplace_back(node);
+    c->outputs.emplace_back(node);
+  }
+  op_descs.emplace_back(op);
+  // replace op_desc inplace
+  auto nodes = graph.Nodes();
+  ir::Node* found_node = nullptr;
+  for (auto node : nodes) {
+    if (node->IsOp() && node->Op() && node->Name() == "assign") {
+      if (node->outputs.size() == 1 && node->outputs[0]->Name() == "e") {
+        found_node = node;
+        break;
+      }
+    }
+  }
+  // Same guard as the sibling tests: fail clearly instead of handing a null
+  // node to the graph below.
+  PADDLE_ENFORCE(found_node != nullptr);
+  {
+    ir::Node* d = find_node_in_graph("d");
+    ir::Node* e = find_node_in_graph("e");
+    // std::remove only shifts elements; erase is needed to really unlink the
+    // op, otherwise d/e keep a pointer to the node removed right after.
+    d->outputs.erase(
+        std::remove(d->outputs.begin(), d->outputs.end(), found_node),
+        d->outputs.end());
+    e->inputs.erase(
+        std::remove(e->inputs.begin(), e->inputs.end(), found_node),
+        e->inputs.end());
+    graph.RemoveNode(found_node);
+  }
+  op_descs.erase(op_descs.begin() + 3);
+  auto replace_op = prog.MutableBlock(0)->AppendOp();
+  replace_op->SetType("sum");
+  replace_op->SetInput("X", {"d", "d1"});
+  replace_op->SetOutput("Out", {"e"});
+  {
+    ir::Node* sum2 = graph.CreateOpNode(replace_op);
+    ir::Node* e = find_node_in_graph("e");
+    ir::Node* d = find_node_in_graph("d");
+    ir::Node* d1 = find_node_in_graph("d1");
+    sum2->inputs.emplace_back(d);
+    sum2->inputs.emplace_back(d1);
+    sum2->outputs.emplace_back(e);
+    e->inputs.emplace_back(sum2);
+    d->outputs.emplace_back(sum2);
+    d1->outputs.emplace_back(sum2);
+  }
+  op_descs.emplace_back(replace_op);
+  // compare op order
+  auto graph_nodes = SortOpLikeDescOrder(graph);
+  for (size_t i = 0; i < graph_nodes.size(); ++i) {
+    auto node = graph_nodes[i];
+    auto op_desc = op_descs[i];
+    ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
+  }
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -14,11 +14,16 @@ limitations under the License. */
 #include "paddle/fluid/framework/details/build_strategy.h"
+#include <glog/logging.h>
+#include <memory>
+#include "paddle/fluid/framework/details/memory_reuse_types.h"
 #include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
 #include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
 #include "paddle/fluid/framework/details/reduce_op_handle.h"
 #include "paddle/fluid/framework/details/sequential_execution_pass.h"
 #include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/ir/graph_viz_pass.h"
 namespace paddle {
@@ -69,6 +74,14 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
    }
    VLOG(1) << "CollectiveContext:" << context->String();
+    // NOTE(dzh): memory optimize should be a runtime pass.
+    // However, after multi_devices_pass, VarHandle and OpHandle are
+    // the de facto IR, so any reuse on the Graph is meaningless.
+    // As a side effect, memory optimize cannot foresee the fetched vars,
+    // so the fetch list should be set persistable before calling Run.
+    if (strategy.memory_optimize_) {
+      auto analysis_var_pass = AppendPass("analysis_var_pass");
+    }
    // Convert graph to run on multi-devices.
    auto multi_devices_pass = AppendPass("multi_devices_pass");
    multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy",
@@ -79,8 +92,11 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
    // Add a graph print pass to record a graph with device info.
    if (!strategy_.debug_graphviz_path_.empty()) {
      auto multi_devices_print_pass = AppendPass("multi_devices_print_pass");
-      multi_devices_print_pass->SetNotOwned<const std::string>(
+      const std::string graph_path =
-          "debug_graphviz_path", &strategy_.debug_graphviz_path_);
+          string::Sprintf("%s%s", strategy_.debug_graphviz_path_.c_str(),
+                          "_multi_devices_graph");
+      multi_devices_print_pass->Set<std::string>(kGraphvizPath,
+                                                 new std::string(graph_path));
      multi_devices_print_pass->Set<details::GraphvizSSAGraphPrinter>(
          "graph_printer", new details::GraphvizSSAGraphPrinter);
    }
@@ -127,7 +143,6 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
  CreatePassesFromStrategy(false);
  std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program));
  for (std::shared_ptr<ir::Pass> &pass : pass_builder_->AllPasses()) {
    if (pass->Type() == "multi_devices_pass") {
      pass->Erase("places");
@@ -145,6 +160,17 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
      pass->Erase("nccl_ctxs");
      pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
 #endif
+    } else if (pass->Type() == "analysis_var_pass") {
+      const std::vector<OpDesc *> *all_op_descs =
+          new std::vector<OpDesc *>(main_program.Block(0).AllOps());
+      graph->Set<const std::vector<OpDesc *>>(kAllOpDescs,
+                                              all_op_descs);  // take ownership
+      graph->Set<GraphNodePool>(kGraphNodePool,
+                                new GraphNodePool);  // take ownership
+      pass->Erase(kAllOpDescs);
+      pass->SetNotOwned<const std::vector<OpDesc *>>(kAllOpDescs, all_op_descs);
    } else if (pass->Type() == "sequential_execution_pass") {
      LOG(INFO) << "set enable_sequential_execution:"
                << enable_sequential_execution_;
@@ -166,6 +192,7 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
  }
  return graph;
 }
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
@@ -176,6 +203,7 @@ USE_PASS(multi_batch_merge_pass);
 USE_PASS(multi_devices_pass);
 USE_PASS(multi_devices_check_pass);
 USE_PASS(multi_devices_print_pass);
+USE_PASS(analysis_var_pass);
 USE_PASS(sequential_execution_pass);
 USE_PASS(all_reduce_deps_pass);
 USE_PASS(modify_op_lock_and_record_event_pass);
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -60,8 +60,15 @@ struct BuildStrategy {
    kCustomized = 2,
  };
+  enum class OptimizeStrategy {
+    // To be implemented: brute force, recursively compute unused var names.
+    kBruteForce = 0,
+    kControlFlowGraph = 1,  // use cfg_graph algorithm, faster speed.
+  };
  ReduceStrategy reduce_{ReduceStrategy::kAllReduce};
  GradientScaleStrategy gradient_scale_{GradientScaleStrategy::kCoeffNumDevice};
+  OptimizeStrategy strategy_{OptimizeStrategy::kControlFlowGraph};
  std::string debug_graphviz_path_{""};
@@ -69,6 +76,10 @@ struct BuildStrategy {
  bool enable_data_balance_{false};
+  bool memory_optimize_{false};
+  bool memory_early_delete_{false};
  bool enable_sequential_execution_{false};
  bool fuse_broadcast_op_{false};

--- a/paddle/fluid/framework/details/early_delete_op_handle.h
+++ b/paddle/fluid/framework/details/early_delete_op_handle.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/details/computation_op_handle.h"
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/details/var_handle.h"
+#include "paddle/fluid/framework/garbage_collector.h"
+#include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/tensor.h"
+namespace paddle {
+namespace framework {
+namespace details {
+// Op handle that takes the memory holders out of a fixed list of variables
+// in the local execution scope and hands them to a garbage collector.
+class EarlyDeleteOpHandle : public OpHandleBase {
+ public:
+  // node:  graph node passed on to OpHandleBase.
+  // scope: scope holding the kLocalExecScopeName variable.
+  // place: place used to choose between the CPU and GPU clearing paths.
+  // names: names of the variables whose memory is released by RunImpl.
+  // gc:    garbage collector receiving the released allocations.
+  EarlyDeleteOpHandle(ir::Node* node, const Scope* scope,
+                      const platform::Place& place,
+                      const std::vector<std::string>& names,
+                      GarbageCollector* gc)
+      : OpHandleBase(node),
+        scope_(scope),
+        place_(place),
+        names_(names),
+        gc_(gc) {
+#ifdef PADDLE_WITH_CUDA
+    if (IsStreamGarabageCollector()) {
+      auto gpu_place = boost::get<platform::CUDAPlace>(place);
+      PADDLE_ENFORCE(cudaSetDevice(gpu_place.device));
+      PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
+    }
+#endif
+  }
+  ~EarlyDeleteOpHandle() {
+#ifdef PADDLE_WITH_CUDA
+    if (IsStreamGarabageCollector()) {
+      // Use place_, the place the event was created for in the constructor.
+      // dev_ctx_ is never assigned in this class, so it must not be
+      // dereferenced here.
+      auto gpu_place = boost::get<platform::CUDAPlace>(place_);
+      PADDLE_ENFORCE(cudaSetDevice(gpu_place.device));
+      PADDLE_ENFORCE(cudaEventDestroy(event_));
+    }
+#endif
+  }
+  std::string Name() const override { return "early_delete"; }
+ protected:
+  // Collects the memory holders of every variable in names_ (LoDTensor,
+  // SelectedRows value, or each tensor of a LoDTensorArray) and passes them
+  // to ClearTensors. Enforces that every name exists in the local scope.
+  void RunImpl() override {
+    std::vector<std::shared_ptr<memory::Allocation>> tensors;
+    auto* local_scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope*>();
+    for (auto& var_name : names_) {
+      auto* var = local_scope->FindVar(var_name);
+      PADDLE_ENFORCE(var != nullptr,
+                     string::Sprintf("Local Scope not has var %s", var_name));
+      if (var->IsType<LoDTensor>()) {
+        tensors.emplace_back(var->GetMutable<LoDTensor>()->MoveMemoryHolder());
+      } else if (var->IsType<SelectedRows>()) {
+        tensors.emplace_back(var->GetMutable<SelectedRows>()
+                                 ->mutable_value()
+                                 ->MoveMemoryHolder());
+      } else if (var->IsType<LoDTensorArray>()) {
+        LoDTensorArray* tensor_array = var->GetMutable<LoDTensorArray>();
+        for (auto& tensor : *tensor_array) {
+          tensors.emplace_back(tensor.MoveMemoryHolder());
+        }
+      }
+    }
+    if (!tensors.empty()) {
+      ClearTensors(tensors);
+    }
+  }
+ private:
+  // Dispatches to the CPU or GPU path depending on place_.
+  void ClearTensors(
+      const std::vector<std::shared_ptr<memory::Allocation>>& tensors) {
+    if (platform::is_cpu_place(place_)) {
+      ClearCPUTensors(tensors);
+    } else {
+      ClearGPUTensors(tensors);
+    }
+  }
+  // Adds the allocations to gc_ when it is a CPUGarbageCollector; otherwise
+  // does nothing.
+  void ClearCPUTensors(
+      const std::vector<std::shared_ptr<memory::Allocation>>& tensors) {
+    auto* gc = dynamic_cast<CPUGarbageCollector*>(gc_);
+    if (gc != nullptr) {
+      gc->Add(tensors);
+    }
+  }
+  // With CUDA: adds the allocations to gc_; for a StreamGarbageCollector a
+  // callback first makes the collector stream wait on an event recorded on
+  // the compute stream. Without CUDA the body is empty.
+  void ClearGPUTensors(
+      const std::vector<std::shared_ptr<memory::Allocation>>& tensors) {
+#ifdef PADDLE_WITH_CUDA
+    auto* gc = dynamic_cast<StreamGarbageCollector*>(gc_);
+    if (gc != nullptr) {
+      // NOTE(review): nothing in this class assigns dev_ctx_, so fail with a
+      // clear message instead of dereferencing an unset pointer. The device
+      // context for place_ still has to be wired in - confirm the source.
+      PADDLE_ENFORCE(dev_ctx_ != nullptr,
+                     "EarlyDeleteOpHandle has no CUDA device context.");
+      auto compute_stream = dev_ctx_->stream();
+      auto callback_stream = gc->stream();
+      auto callback_func = [=]() {
+        PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream));
+        PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0));
+      };
+      gc_->Add(tensors, callback_func);
+    } else {
+      gc_->Add(tensors);
+    }
+#endif
+  }
+#ifdef PADDLE_WITH_CUDA
+  // True when gc_ is a StreamGarbageCollector. Only used by CUDA code.
+  bool IsStreamGarabageCollector() const {
+    return dynamic_cast<const StreamGarbageCollector*>(gc_) != nullptr;
+  }
+#endif
+  const Scope* scope_;
+  const platform::Place place_;
+  std::vector<std::string> names_;
+  GarbageCollector* gc_;
+#ifdef PADDLE_WITH_CUDA
+  // Value-initialized so an unset pointer/event is detectable.
+  platform::CUDADeviceContext* dev_ctx_{nullptr};
+  cudaEvent_t event_{};
+#endif
+};
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/memory_early_delete_pass.cc
+++ b/paddle/fluid/framework/details/memory_early_delete_pass.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/details/memory_early_delete_pass.h"
+#include <queue>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/details/memory_reuse_types.h"
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
+#include "paddle/fluid/framework/details/reference_count_pass_helper.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
+namespace paddle {
+namespace framework {
+namespace details {
+// Breadth-first search starting at `var_in`: visits the ops pending on each
+// variable and then the outputs of those ops. Returns the first
+// ComputationOpHandle encountered whose place equals `var_in->place_`, or
+// nullptr when the search runs out of variables.
+static ComputationOpHandle* FindNextComputationOpHandle(VarHandle* var_in) {
+  std::queue<VarHandleBase*> frontier;
+  frontier.push(var_in);
+  while (!frontier.empty()) {
+    VarHandleBase* current = frontier.front();
+    frontier.pop();
+    for (auto* op : current->PendingOps()) {
+      auto* candidate = dynamic_cast<ComputationOpHandle*>(op);
+      if (candidate != nullptr && candidate->GetPlace() == var_in->place_) {
+        return candidate;
+      }
+      for (auto* produced : op->Outputs()) {
+        frontier.push(produced);
+      }
+    }
+  }
+  return nullptr;
+}
+// Attaches EarlyDeleteOpHandle nodes to the graph for the variables listed
+// in the kGraphNodePool attribute.
+//
+// `unlived_vars` maps each pooled variable name to the OpDescs recorded for
+// it in the pool. For every op handle, its inputs and outputs are examined:
+//   * while the variable's op set is non-empty, the current computation op
+//     (if any) is removed from the set and nothing else happens;
+//   * once the set is empty, an early-delete handle is created that takes the
+//     first output of the chosen computation op as its input.
+// When the current op is not a computation op, the next computation op
+// reachable from the variable is used instead; if none exists the variable is
+// skipped.
+//
+// Returns the (mutated) graph.
+std::unique_ptr<ir::Graph> MemoryEarlyDeletePass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  auto& graph_pool = Get<GraphNodePool>(kGraphNodePool);
+  auto& gcs = Get<GarbageCollectorMap>(kGarbageCollector);
+  std::unordered_map<std::string, std::unordered_set<OpDesc*>> unlived_vars;
+  unlived_vars.reserve(graph_pool.size());
+  for (auto& pair : graph_pool) {
+    unlived_vars.insert(std::make_pair(pair.first, pair.second));
+  }
+  auto compare_and_insert_early_delete_op = [&](
+      OpHandleBase* op, const std::vector<VarHandleBase*>& vars) {
+    if (unlived_vars.empty()) return;
+    // unlived vars can be deleted after the last used op has finished.
+    auto* compute_op = dynamic_cast<ComputationOpHandle*>(op);
+    const auto& places = Get<std::vector<platform::Place>>(kAllPlaces);
+    for (auto& var : vars) {
+      auto* var_handle = dynamic_cast<VarHandle*>(var);
+      auto var_name = var->Node()->Name();
+      auto users = unlived_vars.find(var_name);
+      if (users == unlived_vars.end()) continue;
+      if (!users->second.empty()) {
+        // Only the op currently being visited may be crossed off; erasing a
+        // key that is absent is a no-op.
+        if (compute_op != nullptr) {
+          users->second.erase(compute_op->Node()->Op());
+        }
+        continue;
+      }
+      if (var_handle == nullptr || !var_handle->Node()->IsVar() ||
+          var_handle->Node()->IsCtrlVar())
+        continue;
+      // Bind the place only after var_handle is known to be non-null.
+      auto& var_place = var_handle->place_;
+      // shamelessly copied from reference count pass.
+      // Keep the fallback in a per-variable local so that it neither leaks
+      // into later variables nor into the erase branch above.
+      auto* target_op = compute_op;
+      if (target_op == nullptr) {
+        // use next computation op scope
+        target_op = FindNextComputationOpHandle(var_handle);
+      }
+      if (target_op == nullptr) {
+        VLOG(5) << "Skip early delete of " << var_name
+                << ": no computation op found";
+        continue;
+      }
+      auto* early_delete_node =
+          graph->CreateEmptyNode("early_delete", ir::Node::Type::kOperation);
+      GarbageCollector* gc = gcs.at(places[target_op->GetScopeIdx()]).get();
+      auto* early_delete_handle = new EarlyDeleteOpHandle(
+          early_delete_node, target_op->GetScope(), var_place, {var_name}, gc);
+      if (target_op->Outputs().empty()) {
+        // Give the op a control-dependency output to hang the handle on.
+        auto* dep_var = new DummyVarHandle(graph->CreateControlDepVar());
+        target_op->AddOutput(dep_var);
+        graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
+      }
+      early_delete_handle->AddInput(target_op->Outputs().front());
+      VLOG(5) << "Add early delete op " << var_name << " to Operator"
+              << target_op->Name();
+    }
+  };
+  auto all_ops = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
+  for (auto& op : all_ops) {
+    compare_and_insert_early_delete_op(op, op->Inputs());
+    compare_and_insert_early_delete_op(op, op->Outputs());
+  }
+  return graph;
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
+REGISTER_PASS(memory_early_delete_pass,
+              paddle::framework::details::MemoryEarlyDeletePass)
+    .RequireGraphAttr(paddle::framework::details::kGraphNodePool)
+    .RequireGraphAttr(paddle::framework::details::kGarbageCollector);
--- a/paddle/fluid/framework/details/memory_early_delete_pass.h
+++ b/paddle/fluid/framework/details/memory_early_delete_pass.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "paddle/fluid/framework/details/early_delete_op_handle.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/pass.h"
+namespace paddle {
+namespace framework {
+namespace details {
+// Graph pass whose ApplyImpl is defined in memory_early_delete_pass.cc.
+class MemoryEarlyDeletePass : public ir::Pass {
+ protected:
+  // Takes ownership of `graph` and returns a graph after the pass has run.
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
+};
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/memory_reuse_types.cc
+++ b/paddle/fluid/framework/details/memory_reuse_types.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/details/memory_reuse_types.h"
+#include <iostream>
+#include <sstream>
+#include <string>
+namespace paddle {
+namespace framework {
+namespace details {
+// Returns the byte size of the variable behind `n`: the size of its data type
+// multiplied by the absolute value of the product of its shape dimensions
+// (as recorded in the block's VarDesc).
+// The product is accumulated in 64 bits; a plain `int` overflows for shapes
+// with more than INT_MAX elements.
+size_t NodeSizeInBytes(ir::Node* n) {
+  auto* desc = FindVarDescInBlock(n);
+  auto shape = desc->GetShape();
+  size_t type_size = SizeOfType(desc->GetDataType());
+  int64_t size = 1;
+  for (auto& s : shape) {
+    size *= s;
+  }
+  // A negative product comes from a -1 dimension; only the magnitude counts.
+  const int64_t numel = size < 0 ? -size : size;
+  return type_size * static_cast<size_t>(numel);
+}
+// Formats `var` as "<name>[d0,d1,...]". If reading the shape throws, the
+// text after "<name>[" is the fallback message with the name appended.
+std::string DebugStringImpl(VarDesc* var) {
+  std::stringstream out;
+  out << var->Name() << "[";
+  try {
+    const auto dims = var->GetShape();
+    const char* separator = "";
+    for (const auto& dim : dims) {
+      out << separator << dim;
+      separator = ",";
+    }
+    out << "]";
+  } catch (...) {
+    out << "Var has no VarDesc !!! Name:" << var->Name();
+  }
+  return out.str();
+}
+// Formats the VarDesc that FindVarDescInBlock resolves for `var`.
+std::string DebugString(ir::Node* var) {
+  return DebugStringImpl(FindVarDescInBlock(var));
+}
+// return DebugString(var->Var()); }
+// NOTE(dzh): Based on the ir node alone, if a large node has been reused
+// by a smaller node, then the next time it appears in the pool it will
+// have the smaller size. Find the original node shape from the BlockDesc.
+// Looks up the VarDesc carrying `n`'s name in the block of the op that is
+// `n`'s single input. Enforces that `n` is a non-control variable node with
+// exactly one input and that the block contains a variable of that name.
+VarDesc* FindVarDescInBlock(ir::Node* n) {
+  PADDLE_ENFORCE(n->IsVar() && !n->IsCtrlVar() && n->inputs.size() == 1);
+  // The block is taken from the op feeding this variable.
+  BlockDesc* block = n->inputs[0]->Op()->Block();
+  PADDLE_ENFORCE(block->HasVar(n->Name()),
+                 string::Sprintf("Block do not has var %s", n->Name()));
+  return block->FindVar(n->Name());
+}
+// Answers "does lhs fit within rhs?": true only when both nodes agree on
+// whether their first dimension is -1 and lhs is no larger than rhs in bytes.
+struct NodeComparator {
+  bool operator()(ir::Node* lhs, ir::Node* rhs) const {
+    auto* lhs_desc = FindVarDescInBlock(lhs);
+    auto* rhs_desc = FindVarDescInBlock(rhs);
+    const auto lhs_dims = lhs_desc->GetShape();
+    const auto rhs_dims = rhs_desc->GetShape();
+    const bool lhs_dynamic = (lhs_dims[0] == -1);
+    const bool rhs_dynamic = (rhs_dims[0] == -1);
+    if (lhs_dynamic != rhs_dynamic) {
+      // One batch size is fixed and the other is not: never comparable.
+      return false;
+    }
+    return NodeSizeInBytes(lhs) <= NodeSizeInBytes(rhs);
+  }
+};
+// Records that `op` uses `var`. A variable name already in the pool just
+// gains `op` in its op set. Otherwise the list is scanned from the front for
+// the insertion point: entries whose first dimension is -1 stay ahead of the
+// others, and within the same group the scan advances while NodeComparator
+// reports the existing entry as fitting within `var`.
+void OrderedNodePairPool::Insert(ir::Node* var, ir::Node* op) {
+  PADDLE_ENFORCE(var->IsVar() && !var->IsCtrlVar());
+  PADDLE_ENFORCE(op->IsOp());
+  auto known = mark_table_.find(var->Name());
+  if (known != mark_table_.end()) {
+    known->second->second.insert(op);
+    return;
+  }
+  auto* var_desc = FindVarDescInBlock(var);
+  auto var_shape = var_desc->GetShape();
+  int batch_size = static_cast<int>(var_shape[0]);
+  const bool var_dynamic = (batch_size == -1);
+  NodeComparator compare_node;
+  Iter pos = nodes_.begin();
+  for (; pos != nodes_.end(); ++pos) {
+    auto* cache_desc = FindVarDescInBlock(pos->first);
+    int cache_batch_size = cache_desc->GetShape()[0];
+    const bool cache_dynamic = (cache_batch_size == -1);
+    if (cache_dynamic == var_dynamic) {
+      // Same group: stop at the first entry that does not fit within var.
+      if (!compare_node(pos->first, var)) {
+        break;
+      }
+    } else if (!cache_dynamic) {
+      // Reached the fixed-batch group while var has batch size -1.
+      break;
+    }
+    // Otherwise the entry has batch size -1 and var does not: keep scanning.
+  }
+  pos = nodes_.insert(pos,
+                      std::make_pair(var, std::unordered_set<ir::Node*>{op}));
+  mark_table_[var->Name()] = pos;
+}
+// Returns the zero-based position of `var` in the ordered list.
+// `var` must be non-null and already in the pool. The lookup uses find()
+// rather than operator[], which would insert a default-constructed iterator
+// for an unknown name and pass it to std::distance.
+int OrderedNodePairPool::GetIndex(ir::Node* var) {
+  PADDLE_ENFORCE(var != nullptr, "Cannot query the index of a null node.");
+  auto found = mark_table_.find(var->Name());
+  PADDLE_ENFORCE(found != mark_table_.end(),
+                 string::Sprintf("Node %s is not in the pool", var->Name()));
+  return std::distance(nodes_.begin(), found->second);
+}
+// Returns the first pooled variable, in list order, that NodeComparator
+// accepts as able to hold `var`; nullptr when there is none.
+ir::Node* OrderedNodePairPool::NodeMatch(ir::Node* var) const {
+  NodeComparator fits_within;
+  for (const auto& entry : nodes_) {
+    if (fits_within(var, entry.first)) {
+      return entry.first;
+    }
+  }
+  return nullptr;
+}
+// Removes `var` from both the ordered list and the name table. Enforces that
+// the name is present.
+void OrderedNodePairPool::Erase(ir::Node* var) {
+  PADDLE_ENFORCE(mark_table_.count(var->Name()));
+  auto entry = mark_table_.find(var->Name());
+  nodes_.erase(entry->second);
+  mark_table_.erase(entry);
+}
+// Concatenates DebugString() of every pooled variable in list order, each
+// followed by a single space.
+std::string OrderedNodePairPool::ToString() const {
+  std::stringstream out;
+  for (const auto& entry : nodes_) {
+    out << DebugString(entry.first) << " ";
+  }
+  return out.str();
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/memory_reuse_types.h
+++ b/paddle/fluid/framework/details/memory_reuse_types.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <algorithm>
+#include <iostream>
+#include <iterator>
+#include <list>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/ir/graph.h"
+namespace paddle {
+namespace framework {
+namespace details {
+constexpr char kFetchedVars[] = "fetched_vars";
+constexpr char kGraphNodePool[] = "graph_node_pool";
+// NOTE(dzh): A variable and the operators that use it,
+// for the early delete pass.
+// The variable analysis pass is built on ir::Node, which may be released
+// or modified between passes, so we use OpDesc* to mark ops.
+using GraphNodePool = std::vector<
+    std::pair<std::string /*var node*/, std::unordered_set<OpDesc*> /* ops */>>;
+// NOTE(dzh): By default, nodes are sorted in ascending order of byte size.
+// In fluid, -1 means the batch_size is determined at runtime.
+// Nodes whose batch_size is -1 always rank ahead of nodes with a fixed one.
+// For example,
+// node0[-1, 1] node1[-1, 1, 1], node2[1,1], node3[1,1024], ..
+// Erase is O(1); Insert scans the list to find the sorted position.
+// Pool of (variable node, set of op nodes) pairs stored in a std::list,
+// together with a name -> list-iterator table for lookup by variable name.
+class OrderedNodePairPool {
+ public:
+  using NodePair = std::pair<ir::Node*, std::unordered_set<ir::Node*>>;
+  using Iter = typename std::list<NodePair>::iterator;
+  using ConstIter = typename std::list<NodePair>::const_iterator;
+  // Adds `op` to the op set of `var`, creating the entry if needed.
+  void Insert(ir::Node* var, ir::Node* op);
+  // Removes the entry for `var`.
+  void Erase(ir::Node* var);
+  // True when a variable with `var`'s name is in the pool.
+  bool Has(ir::Node* var) { return mark_table_.count(var->Name()); }
+  // Returns a pooled variable matching `var`, or nullptr.
+  ir::Node* NodeMatch(ir::Node* var) const;
+  // map store non-const iterator, can not promise const
+  int GetIndex(ir::Node* var);
+  // pool all node to string
+  std::string ToString() const;
+  // Iteration over the underlying list, in list order.
+  Iter begin() { return nodes_.begin(); }
+  Iter end() { return nodes_.end(); }
+  ConstIter begin() const { return nodes_.begin(); }
+  ConstIter end() const { return nodes_.end(); }
+  // Number of pooled variables.
+  size_t size() const { return nodes_.size(); }
+ private:
+  // for searching.
+  std::unordered_map<std::string, Iter> mark_table_;
+  // node swap pairs. var -> ops dep var
+  std::list<NodePair> nodes_;
+};
+// node memory size in bytes
+size_t NodeSizeInBytes(ir::Node* n);
+std::string DebugString(ir::Node* var);
+// std::string DebugString(VarDesc* var);
+VarDesc* FindVarDescInBlock(ir::Node* n);
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/memory_reuse_types_test.cc
+++ b/paddle/fluid/framework/details/memory_reuse_types_test.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/details/memory_reuse_types.h"
+#include <algorithm>
+#include <iostream>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+namespace paddle {
+namespace framework {
+namespace details {
+// Exercises Insert, Erase, size, GetIndex and NodeMatch of
+// OrderedNodePairPool with eight variables (named "0".."7") of different
+// shapes that all share one dummy producer op.
+TEST(OrderedNodePairPool, Normal) {
+  OrderedNodePairPool pool;
+  std::vector<std::unique_ptr<ir::Node>> nodes;
+  // clang-format off
+  std::vector<std::vector<int64_t>> shapes = {{-1, 10},
+                                              {-1, 20},
+                                              {1, 2},
+                                              {5, 2},
+                                              {10, 20},
+                                              {-1, 2, 5},
+                                              {-1, 1, 5},
+                                              {-1, 1}};
+  // clang-format on
+  const int COUNT = shapes.size();
+  ProgramDesc prog;
+  BlockDesc* block_desc = prog.MutableBlock(0);
+  // A single op serves as the producer (input) of every variable node.
+  auto* op_desc = block_desc->AppendOp();
+  op_desc->SetType("dummy");
+  std::unique_ptr<ir::Node> op = ir::CreateNodeForTest(op_desc);
+  // Create one variable node per shape, named by its index in `shapes`.
+  for (int i = 0; i < COUNT; ++i) {
+    auto desc = block_desc->Var(std::to_string(i));
+    desc->SetShape(shapes[i]);
+    std::unique_ptr<ir::Node> node = ir::CreateNodeForTest(desc);
+    node->inputs.emplace_back(op.get());
+    nodes.emplace_back(std::move(node));
+  }
+  for (auto& node : nodes) {
+    pool.Insert(node.get(), op.get());
+  }
+  // assert its order and interface.
+  std::cout << pool.ToString() << std::endl;
+  // Remove variable "0" ([-1, 10]).
+  pool.Erase(nodes.front().get());
+  std::cout << pool.ToString() << std::endl;
+  ASSERT_EQ(pool.size(), static_cast<size_t>(COUNT - 1));
+  // Variable "7" ([-1, 1]) is expected at the front of the pool.
+  ASSERT_EQ(pool.GetIndex(nodes.back().get()), 0);
+  {
+    // Larger than every pooled variable with batch size -1: no match.
+    auto v1 = block_desc->Var("11");
+    v1->SetShape({-1, 256, 56, 56});
+    std::unique_ptr<ir::Node> node1 = ir::CreateNodeForTest(v1);
+    node1->inputs.emplace_back(op.get());
+    auto* cache = pool.NodeMatch(node1.get());
+    ASSERT_EQ(cache, nullptr);
+  }
+  {
+    auto v2 = block_desc->Var("12");
+    v2->SetShape({-1, 2, 5});
+    std::unique_ptr<ir::Node> node1 = ir::CreateNodeForTest(v2);
+    node1->inputs.emplace_back(op.get());
+    auto* cache = pool.NodeMatch(node1.get());
+    // Matches the 6th shape, [-1,2,5], which sits at pool index 2.
+    ASSERT_EQ(pool.GetIndex(cache), 2);  // match 6:[-1,2,5]
+  }
+  {
+    auto v3 = block_desc->Var("13");
+    v3->SetShape({2, 5});
+    std::unique_ptr<ir::Node> node1 = ir::CreateNodeForTest(v3);
+    node1->inputs.emplace_back(op.get());
+    auto* cache = pool.NodeMatch(node1.get());
+    // Matches the 4th shape, [5,2], which sits at pool index 5.
+    ASSERT_EQ(pool.GetIndex(cache), 5);  // match  4:[5,2]
+  }
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc
@@ -85,4 +85,5 @@ void GraphvizSSAGraphPrinter::Print(const ir::Graph &graph,
 }  // namespace paddle
 REGISTER_PASS(multi_devices_print_pass,
-              paddle::framework::details::SSAGraghBuilderWithPrinter);
+              paddle::framework::details::SSAGraghBuilderWithPrinter)
+    .RequirePassAttr(paddle::framework::details::kGraphvizPath);
--- a/paddle/fluid/framework/details/multi_devices_graph_print_pass.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_print_pass.h
@@ -14,6 +14,7 @@
 #pragma once
+#include <glog/logging.h>
 #include <fstream>
 #include <iosfwd>
 #include <ostream>
@@ -24,6 +25,8 @@ namespace paddle {
 namespace framework {
 namespace details {
+constexpr char kGraphvizPath[] = "debug_graphviz_path";
 class SSAGraphPrinter {
 public:
  virtual ~SSAGraphPrinter() {}
@@ -40,7 +43,7 @@ class SSAGraghBuilderWithPrinter : public ir::Pass {
  std::unique_ptr<ir::Graph> ApplyImpl(
      std::unique_ptr<ir::Graph> graph) const override {
    std::unique_ptr<std::ostream> fout(
-        new std::ofstream(Get<const std::string>("debug_graphviz_path")));
+        new std::ofstream(Get<std::string>(kGraphvizPath)));
    PADDLE_ENFORCE(fout->good());
    Get<GraphvizSSAGraphPrinter>("graph_printer").Print(*graph, *fout);
    return graph;

--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -25,7 +25,7 @@ namespace paddle {
 namespace framework {
 namespace details {
-constexpr char kLocalExecScopeName[] = "@LCOAL_SCOPE@";
+constexpr char kLocalExecScopeName[] = "@LOCAL_SCOPE@";
 // Wraps ir::Node and provide helper utilities.
 // It's responsible for populating necessary fields of ir::Node.

--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -100,7 +100,7 @@ static void DeleteUnusedTensors(
          continue;
        }
        auto* var = scope.FindVar(name);
-        if (var != nullptr) {
+        if (var == nullptr) {
          continue;
        }

--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -44,6 +44,7 @@ pass_library(seqconv_eltadd_relu_fuse_pass inference)
 pass_library(is_test_pass base)
 pass_library(conv_elementwise_add_act_fuse_pass inference)
 pass_library(conv_elementwise_add2_act_fuse_pass inference)
+pass_library(conv_elementwise_add_fuse_pass inference)
 if(WITH_MKLDNN)
    pass_library(mkldnn_placement_pass base)
    pass_library(depthwise_conv_mkldnn_pass base)

--- a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <string>
+#include "paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h"
+#include "paddle/fluid/framework/ir/graph_viz_pass.h"
+namespace paddle {
+namespace framework {
+namespace ir {
+// GET_IR_NODE(n) expands to GET_IR_NODE_FROM_SUBGRAPH(n, n, pattern), so it
+// relies on a variable named `pattern` being visible at the expansion site.
+#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern);
+// GET_NODES applies GET_IR_NODE to each of the six nodes named below.
+#define GET_NODES                    \
+  GET_IR_NODE(conv_op);              \
+  GET_IR_NODE(conv_out);             \
+  GET_IR_NODE(conv_filter);          \
+  GET_IR_NODE(elementwise_add_op);   \
+  GET_IR_NODE(elementwise_add_in_y); \
+  GET_IR_NODE(elementwise_add_out);
+// Replaces every conv2d -> elementwise_add chain matched by
+// patterns::ConvElementwiseadd with a single conv2d_fusion op.
+// The fused op is built from the conv2d op's proto, takes the add's Y input
+// as "Bias", has an empty "ResidualData" input and the "identity"
+// activation, and writes to the add's output variable. The conv2d op, its
+// output variable and the elementwise_add op are then removed from the graph.
+std::unique_ptr<ir::Graph> ConvElementwiseAddFusePass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  const std::string pattern_name = "conv_elementwise_add_fuse";
+  FusePassBase::Init(pattern_name, graph.get());
+  GraphPatternDetector gpd;
+  auto* conv_input_pd = gpd.mutable_pattern()
+                            ->NewNode("x")
+                            ->assert_is_op_input("conv2d", "Input")
+                            ->AsInput();
+  // GET_NODES below refers to this object by the name `pattern`.
+  patterns::ConvElementwiseadd pattern(gpd.mutable_pattern(), pattern_name);
+  pattern(conv_input_pd);
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    GET_NODES;
+    auto base_op_desc = *conv_op->Op()->Proto();
+    const std::string bias_name = elementwise_add_in_y->Name();
+    const std::string output_name = elementwise_add_out->Name();
+    const std::string act_type = "identity";
+    // Describe the fused operator.
+    framework::OpDesc fused_desc(base_op_desc, nullptr);
+    fused_desc.SetType("conv2d_fusion");
+    fused_desc.SetInput("Bias", {bias_name});
+    fused_desc.SetInput("ResidualData", {});
+    fused_desc.SetAttr("activation", act_type);
+    fused_desc.SetOutput("Output", {output_name});
+    fused_desc.SetAttr("is_test", true);
+    fused_desc.SetAttr("use_cudnn", false);
+    fused_desc.Flush();
+    // Materialize the fused operator as a graph node.
+    auto* fused_op = graph->CreateOpNode(&fused_desc);
+    // Wire it to the surviving variable nodes.
+    PADDLE_ENFORCE(subgraph.count(conv_input_pd));
+    auto* conv_in_node = subgraph.at(conv_input_pd);
+    IR_NODE_LINK_TO(conv_in_node, fused_op);          // Input
+    IR_NODE_LINK_TO(conv_filter, fused_op);           // Filter
+    IR_NODE_LINK_TO(elementwise_add_in_y, fused_op);  // Bias
+    IR_NODE_LINK_TO(fused_op, elementwise_add_out);   // Output
+    // Drop the nodes the fused op supersedes.
+    GraphSafeRemoveNodes(graph.get(), {conv_op, conv_out, elementwise_add_op});
+  };
+  gpd(graph.get(), handler);
+  return graph;
+}
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+REGISTER_PASS(conv_elementwise_add_fuse_pass,
+              paddle::framework::ir::ConvElementwiseAddFusePass);
--- a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+namespace paddle {
+namespace framework {
+namespace ir {
+// Fuse pass whose ApplyImpl is defined in conv_elementwise_add_fuse_pass.cc.
+class ConvElementwiseAddFusePass : public FusePassBase {
+ public:
+  virtual ~ConvElementwiseAddFusePass() {}
+ protected:
+  // Marked `override` so a signature drift from the base class is caught at
+  // compile time.
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
+};
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/ir/graph.cc
+++ b/paddle/fluid/framework/ir/graph.cc
@@ -162,7 +162,10 @@ void Graph::ResolveHazard(
          (*it_new)->inputs.empty() ? nullptr : (*it_new)->inputs[0];
      const auto &read_ops = (*it_old)->outputs;
-      PADDLE_ENFORCE(write_op, "The write_op should not be empty.");
+      PADDLE_ENFORCE(
+          write_op,
+          string::Sprintf("The write_op of var %s should not be empty.",
+                          (*it_new)->Name()));
      // Add write after write dependence
      ir::Node *upstream_op =

--- a/paddle/fluid/framework/ir/graph_helper.cc
+++ b/paddle/fluid/framework/ir/graph_helper.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <fstream>
 #include <iosfwd>
 #include <ostream>
+#include <unordered_map>
 #include <unordered_set>
 DEFINE_string(print_sub_graph_dir, "",
@@ -121,7 +122,7 @@ std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
 }
 size_t GraphNum(const Graph &graph) {
-  std::unordered_set<ir::Node *> nodes = graph.Nodes();
+  std::unordered_set<ir::Node *> nodes(graph.Nodes());
  std::unordered_set<ir::Node *> visited_nodes;
  visited_nodes.reserve(nodes.size());
  std::deque<ir::Node *> q_nodes;

--- a/paddle/fluid/framework/ir/graph_helper.h
+++ b/paddle/fluid/framework/ir/graph_helper.h
@@ -24,6 +24,7 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 namespace ir {
 // Test if the graph contains circle.
 bool HasCircle(const Graph &graph);

--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -17,7 +17,6 @@
 #include <string>
 #include <vector>
-#include "graph_pattern_detector.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/framework/ir/graph_traits.h"
@@ -1210,6 +1209,33 @@ PDNode *patterns::ConvElementwiseadd2Act::operator()(PDNode *conv_in) {
  return act_out;
 }
+// Builds the pattern
+//   (conv_in, conv_filter) -> conv2d -> conv_out
+//   (conv_out, elementwise_add_in_y) -> elementwise_add -> elementwise_add_out
+// and returns the elementwise_add output node.
+PDNode *patterns::ConvElementwiseadd::operator()(PDNode *conv_in) {
+  conv_in->AsInput();
+  // conv2d and its operands.
+  auto *conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d");
+  auto *conv_out = pattern->NewNode(conv_out_repr())
+                       ->assert_is_op_output("conv2d")
+                       ->assert_is_op_input("elementwise_add", "X")
+                       ->AsIntermediate();
+  auto *conv_filter = pattern->NewNode(conv_filter_repr())
+                          ->assert_is_op_input("conv2d", "Filter")
+                          ->AsInput();
+  // elementwise_add and its operands.
+  auto *elementwise_add_op = pattern->NewNode(elementwise_add_op_repr())
+                                 ->assert_is_op("elementwise_add");
+  auto *elementwise_add_in_y = pattern->NewNode(elementwise_add_in_y_repr())
+                                   ->assert_is_op_input("elementwise_add", "Y")
+                                   ->AsInput();
+  auto *elementwise_add_out = pattern->NewNode(elementwise_add_out_repr())
+                                  ->assert_is_op_output("elementwise_add")
+                                  ->AsOutput();
+  // Edges, added in the same order as before.
+  conv_op->LinksFrom({conv_in, conv_filter});
+  conv_out->LinksFrom({conv_op});
+  elementwise_add_op->LinksFrom({conv_out, elementwise_add_in_y});
+  elementwise_add_op->LinksTo({elementwise_add_out});
+  return elementwise_add_out;
+}
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -716,6 +716,24 @@ struct ConvElementwiseadd2Act : public PatternBase {
  PATTERN_DECL_NODE(act_out);
 };
+// Conv + ElementwiseAdd
+// This pattern should be used after ConvElementwiseadd2Act or
+// ConvElementwiseadd pass
+struct ConvElementwiseadd : public PatternBase {
+  ConvElementwiseadd(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "conv_elementwiseadd") {}
+  PDNode* operator()(PDNode* conv_in);
+  PATTERN_DECL_NODE(conv_op);
+  PATTERN_DECL_NODE(conv_out);
+  PATTERN_DECL_NODE(conv_filter);
+  PATTERN_DECL_NODE(elementwise_add_op);
+  PATTERN_DECL_NODE(elementwise_add_in_y);
+  PATTERN_DECL_NODE(elementwise_add_out);
+};
 }  // namespace patterns
 // Link two ir::Nodes from each other.

--- a/paddle/fluid/framework/ir/node.cc
+++ b/paddle/fluid/framework/ir/node.cc
@@ -30,6 +30,14 @@ std::unique_ptr<Node> CreateNodeForTest(const std::string &name,
  return std::unique_ptr<Node>(new Node(name, type));
 }
+// Test helper: builds a Node from `var_desc` via Node's VarDesc constructor.
+std::unique_ptr<Node> CreateNodeForTest(VarDesc *var_desc) {
+  return std::unique_ptr<Node>(new Node(var_desc));
+}
+// Test helper: builds a Node from `op_desc` via Node's OpDesc constructor.
+std::unique_ptr<Node> CreateNodeForTest(OpDesc *op_desc) {
+  return std::unique_ptr<Node>(new Node(op_desc));
+}
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ir/node.h
+++ b/paddle/fluid/framework/ir/node.h
@@ -18,7 +18,6 @@ limitations under the License. */
 #include <typeindex>
 #include <typeinfo>
 #include <vector>
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/var_desc.h"
 #include "paddle/fluid/platform/macros.h"
@@ -125,6 +124,8 @@ class Node {
  friend class Graph;
  friend std::unique_ptr<Node> CreateNodeForTest(const std::string& name,
                                                 Node::Type type);
+  friend std::unique_ptr<Node> CreateNodeForTest(VarDesc* var_desc);
+  friend std::unique_ptr<Node> CreateNodeForTest(OpDesc* op_desc);
  explicit Node(const std::string& name, Type type)
      : name_(name), var_desc_(nullptr), op_desc_(nullptr), type_(type) {}
@@ -152,7 +153,9 @@ class Node {
 std::unique_ptr<Node> CreateNodeForTest(const std::string& name,
                                        Node::Type type);
+std::unique_ptr<Node> CreateNodeForTest(VarDesc* var_desc);
+std::unique_ptr<Node> CreateNodeForTest(OpDesc* op_desc);
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ngraph_operator.cc
+++ b/paddle/fluid/framework/ngraph_operator.cc
@@ -278,7 +278,8 @@ std::shared_ptr<ngraph::runtime::Backend> NgraphEngine::backend_ =
    ngraph::runtime::Backend::create("CPU");
 void NgraphEngine::GetNgInputShape(std::shared_ptr<OperatorBase> op) {
-  op->RuntimeInferShape(scope_, place_);
+  RuntimeContext ctx(op->Inputs(), op->Outputs(), scope_);
+  op->RuntimeInferShape(scope_, place_, ctx);
  for (auto& var_name_item : op->Inputs()) {
    for (auto& var_name : var_name_item.second) {
      auto* var = scope_.FindVar(var_name);

--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -137,6 +137,23 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
  }
 }
+// Resolves every input and output variable name through scope.FindVar and
+// stores the results, in name order, under the same slot keys in `inputs`
+// and `outputs`. Whatever FindVar returns is stored as is.
+RuntimeContext::RuntimeContext(const VariableNameMap& innames,
+                               const VariableNameMap& outnames,
+                               const Scope& scope) {
+  for (const auto& slot : innames) {
+    auto& resolved = inputs[slot.first];
+    for (const auto& var_name : slot.second) {
+      resolved.push_back(scope.FindVar(var_name));
+    }
+  }
+  for (const auto& slot : outnames) {
+    auto& resolved = outputs[slot.first];
+    for (const auto& var_name : slot.second) {
+      resolved.push_back(scope.FindVar(var_name));
+    }
+  }
+}
 void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
  VLOG(4) << place << " " << DebugStringEx(&scope);
  if (platform::is_gpu_place(place)) {
@@ -412,11 +429,48 @@ bool ExecutionContext::HasOutput(const std::string& name) const {
  return var != nullptr;
 }
+// Returns the single variable bound to input slot `name` in the runtime
+// context, or nullptr when the slot is unknown or holds no variable.
+// Enforces that the slot holds at most one variable.
+const Variable* ExecutionContext::InputVar(const std::string& name) const {
+  auto it = ctx_.inputs.find(name);
+  if (it != ctx_.inputs.end()) {
+    PADDLE_ENFORCE_LE(it->second.size(), 1UL,
+                      "Operator %s's input %s should contain only one variable.",
+                      op_.Type(), name);
+    if (!it->second.empty()) {
+      return it->second[0];
+    }
+  }
+  return nullptr;
+}
+// Scope-based lookup: maps input slot `name` to a variable name through the
+// operator, then searches the scope. Returns nullptr for the empty-variable
+// placeholder name.
+const Variable* ExecutionContext::LegacyInputVar(
+    const std::string& name) const {
+  const auto var_name = op_.Input(name);
+  if (var_name == kEmptyVarName) {
+    return nullptr;
+  }
+  return scope_.FindVar(var_name);
+}
+// Returns the single variable bound to output slot `name` in the runtime
+// context, or nullptr when the slot is unknown or holds no variable.
+// Enforces that the slot holds at most one variable.
+Variable* ExecutionContext::OutputVar(const std::string& name) const {
+  auto it = ctx_.outputs.find(name);
+  if (it != ctx_.outputs.end()) {
+    PADDLE_ENFORCE_LE(it->second.size(), 1UL,
+                      "Operator %s's output %s should contain only one variable.",
+                      op_.Type(), name);
+    if (!it->second.empty()) {
+      return it->second[0];
+    }
+  }
+  return nullptr;
+}
+// Scope-based lookup: maps output slot `name` to a variable name through the
+// operator, then searches the scope. Returns nullptr for the empty-variable
+// placeholder name.
+Variable* ExecutionContext::LegacyOutputVar(const std::string& name) const {
+  const auto var_name = op_.Output(name);
+  if (var_name == kEmptyVarName) {
+    return nullptr;
+  }
+  return scope_.FindVar(var_name);
+}
 template <>
 const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const {
  return Input<LoDTensor>(name);
 }
+template <>
+const Tensor* ExecutionContext::LegacyInput<Tensor>(
+    const std::string& name) const {
+  return LegacyInput<LoDTensor>(name);
+}
 template <>
 const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
    const std::string& name) const {
@@ -441,6 +495,11 @@ Tensor* ExecutionContext::Output<Tensor>(const std::string& name) const {
  return Output<LoDTensor>(name);
 }
+template <>
+Tensor* ExecutionContext::LegacyOutput<Tensor>(const std::string& name) const {
+  return LegacyOutput<LoDTensor>(name);
+}
 template <>
 std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
    const std::string& name) const {
@@ -477,23 +536,22 @@ bool OpSupportGPU(const std::string& op_type) {
 class RuntimeInferShapeContext : public InferShapeContext {
 public:
-  RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope)
+  RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope,
-      : op_(op), scope_(scope) {}
+                           const RuntimeContext& ctx)
+      : op_(op), scope_(scope), ctx_(ctx) {}
  bool HasInput(const std::string& name) const override {
    // has only one input
-    const auto& ins = op_.Inputs();
+    const auto& ins = ctx_.inputs;
    auto it = ins.find(name);
    if (it == ins.end()) {
      return false;
    }
    const auto& in = it->second;
-    if (in.size() == 0 || in[0] == kEmptyVarName) {
+    if (in.size() == 0) return false;
-      return false;
-    }
    PADDLE_ENFORCE_EQ(in.size(), 1UL,
                      "Input %s should not have more than one inputs", name);
-    return scope_.FindVar(in[0]) != nullptr;
+    return in[0] != nullptr;
  }
  bool HasOutput(const std::string& name) const override {
@@ -678,6 +736,7 @@ class RuntimeInferShapeContext : public InferShapeContext {
 private:
  const OperatorBase& op_;
  const Scope& scope_;
+  const RuntimeContext& ctx_;
 };
 static void CheckTensorNANOrInf(const std::string& name,
@@ -696,15 +755,15 @@ static void CheckTensorNANOrInf(const std::string& name,
 }
 void OperatorWithKernel::RuntimeInferShape(const Scope& scope,
-                                           const platform::Place& place) const {
+                                           const platform::Place& place,
-  RuntimeInferShapeContext infer_shape_ctx(*this, scope);
+                                           const RuntimeContext& ctx) const {
+  RuntimeInferShapeContext infer_shape_ctx(*this, scope, ctx);
  this->InferShape(&infer_shape_ctx);
 }
 void OperatorWithKernel::RunImpl(const Scope& scope,
                                 const platform::Place& place) const {
-  RuntimeInferShapeContext infer_shape_ctx(*this, scope);
+  RuntimeContext ctx(Inputs(), Outputs(), scope);
-  this->InferShape(&infer_shape_ctx);
  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
  auto* dev_ctx = pool.Get(place);
@@ -718,15 +777,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
  OpKernelMap& kernels = kernels_iter->second;
-  // TODO(dzhwinter) : kernel fallback mechanism will be added when all the
+  auto expected_kernel_key = this->GetExpectedKernelType(
-  // transform functions are ready.
+      ExecutionContext(*this, scope, *dev_ctx, ctx));
-  // for (auto& candidate : kKernelPriority) {
-  //   Do selection
-  // }
-  auto expected_kernel_key =
-      this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx));
  VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
  auto kernel_iter = kernels.find(expected_kernel_key);
@@ -748,7 +800,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
  // do data transformScope &transfer_scope;
  std::vector<std::string> transfered_inplace_vars;
  auto* transfer_scope =
-      TryTransferData(scope, expected_kernel_key, &transfered_inplace_vars);
+      PrepareData(scope, expected_kernel_key, &transfered_inplace_vars, &ctx);
  // exec scope is the scope that kernel actually executed on.
  const Scope& exec_scope =
@@ -758,7 +810,11 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
    dev_ctx = pool.Get(expected_kernel_key.place_);
  }
-  kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx));
+  RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, ctx);
+  this->InferShape(&infer_shape_ctx);
+  // TODO(panyx0718): ExecutionContext should only depend on RuntimeContext
+  // not Scope. Imperative mode only pass inputs and get outputs.
+  kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx, ctx));
  if (!transfered_inplace_vars.empty()) {
    // there is inplace variable has been transfered.
@@ -782,6 +838,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
    }
  }
 }
 void OperatorWithKernel::TransferInplaceVarsBack(
    const Scope& scope, const std::vector<std::string>& inplace_vars,
    const Scope& transfer_scope) const {
@@ -797,13 +854,19 @@ void OperatorWithKernel::TransferInplaceVarsBack(
  }
 }
-Scope* OperatorWithKernel::TryTransferData(
+Scope* OperatorWithKernel::PrepareData(
    const Scope& scope, const OpKernelType& expected_kernel_key,
-    std::vector<std::string>* transfered_inplace_vars) const {
+    std::vector<std::string>* transfered_inplace_vars,
+    RuntimeContext* ctx) const {
  Scope* new_scope = nullptr;
  for (auto& var_name_item : Inputs()) {
-    for (auto& var_name : var_name_item.second) {
+    std::vector<Variable*>& input_vars = ctx->inputs[var_name_item.first];
+    for (size_t i = 0; i < var_name_item.second.size(); ++i) {
+      auto& var_name = var_name_item.second[i];
      auto* var = scope.FindVar(var_name);
+      input_vars[i] = var;
      // Only tensor can be tranfer to another device.
      if (var == nullptr || !VarIsTensor(*var)) {
        continue;
@@ -851,6 +914,7 @@ Scope* OperatorWithKernel::TryTransferData(
      }
      auto* trans_var = new_scope->Var(var_name);
+      input_vars[i] = trans_var;
      Tensor out;
      TransformData(expected_kernel_key, kernel_type_for_var, *tensor_in, &out);

--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -70,6 +70,15 @@ Tensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var);
 class OperatorBase;
 class ExecutionContext;
+class RuntimeContext {  // variables of one operator, looked up from a scope by name
+ public:
+  RuntimeContext(const VariableNameMap& innames,
+                 const VariableNameMap& outnames, const Scope& scope);
+  VariableValueMap inputs;  // input slot name -> Variable* per listed name
+  VariableValueMap outputs;  // output slot name -> Variable* per listed name
+};
 /**
 * OperatorBase has the basic elements that Net will call to do computation.
 * Only CreateOperator from OpRegistry will new Operator directly. User
@@ -129,7 +138,8 @@ class OperatorBase {
  void SetIsCalledByExecutor(bool x) { run_by_executor_ = x; }
  virtual void RuntimeInferShape(const Scope& scope,
-                                 const platform::Place& place) const {}
+                                 const platform::Place& place,
+                                 const RuntimeContext& ctx) const {}
 protected:
  std::string type_;
@@ -156,8 +166,9 @@ class OperatorBase {
 class ExecutionContext {
 public:
  ExecutionContext(const OperatorBase& op, const Scope& scope,
-                   const platform::DeviceContext& device_context)
+                   const platform::DeviceContext& device_context,
-      : op_(op), scope_(scope), device_context_(device_context) {}
+                   const RuntimeContext& ctx)
+      : op_(op), scope_(scope), device_context_(device_context), ctx_(ctx) {}
  const OperatorBase& op() const { return op_; }
@@ -180,15 +191,9 @@ class ExecutionContext {
    return op_.Outputs(name).size();
  }
-  const Variable* InputVar(const std::string& name) const {
+  const Variable* InputVar(const std::string& name) const;
-    auto ipt = op_.Input(name);
-    return ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
-  }
-  Variable* OutputVar(const std::string& name) const {
+  Variable* OutputVar(const std::string& name) const;
-    auto opt = op_.Output(name);
-    return opt == kEmptyVarName ? nullptr : scope_.FindVar(opt);
-  }
  const std::vector<const Variable*> MultiInputVar(
      const std::string& name) const {
@@ -227,6 +232,22 @@ class ExecutionContext {
    return var == nullptr ? nullptr : var->GetMutable<T>();
  }
+  // Typed read-only view of the variable that LegacyInputVar(name) finds;
+  // nullptr when that lookup yields no variable.
+  template <typename T>
+  const T* LegacyInput(const std::string& name) const {
+    const Variable* found = LegacyInputVar(name);
+    if (found == nullptr) {
+      return nullptr;
+    }
+    return &found->Get<T>();
+  }
+  // Typed mutable view of the variable that LegacyOutputVar(name) finds;
+  // nullptr when that lookup yields no variable.
+  template <typename T>
+  T* LegacyOutput(const std::string& name) const {
+    Variable* found = LegacyOutputVar(name);
+    if (found == nullptr) {
+      return nullptr;
+    }
+    return found->GetMutable<T>();
+  }
+  const Variable* LegacyInputVar(const std::string& name) const;
+  Variable* LegacyOutputVar(const std::string& name) const;
  template <typename T>
  const std::vector<const T*> MultiInput(const std::string& name) const {
    auto names = op_.Inputs(name);
@@ -286,11 +307,16 @@ class ExecutionContext {
  const OperatorBase& op_;
  const Scope& scope_;
  const platform::DeviceContext& device_context_;
+  const RuntimeContext& ctx_;
 };
 template <>
 const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const;
+template <>
+const Tensor* ExecutionContext::LegacyInput<Tensor>(
+    const std::string& name) const;
 template <>
 const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
    const std::string& name) const;
@@ -298,6 +324,9 @@ const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
 template <>
 Tensor* ExecutionContext::Output<Tensor>(const std::string& name) const;
+template <>
+Tensor* ExecutionContext::LegacyOutput<Tensor>(const std::string& name) const;
 template <>
 std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
    const std::string& name) const;
@@ -350,8 +379,8 @@ class OperatorWithKernel : public OperatorBase {
    OpInfoMap::Instance().Get(Type()).infer_shape_(ctx);
  }
-  void RuntimeInferShape(const Scope& scope,
+  void RuntimeInferShape(const Scope& scope, const platform::Place& place,
-                         const platform::Place& place) const override;
+                         const RuntimeContext& ctx) const override;
 protected:
  virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const;
@@ -371,9 +400,10 @@ class OperatorWithKernel : public OperatorBase {
   *
   * * transfered_inplace_vars is a output vector.
   */
-  Scope* TryTransferData(
+  Scope* PrepareData(const Scope& scope,
-      const Scope& scope, const OpKernelType& expected_kernel_key,
+                     const OpKernelType& expected_kernel_key,
-      std::vector<std::string>* transfered_inplace_vars) const;
+                     std::vector<std::string>* transfered_inplace_vars,
+                     RuntimeContext* ctx) const;
  void TransferInplaceVarsBack(const Scope& scope,
                               const std::vector<std::string>& inplace_vars,

--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/framework/parallel_executor.h"
+#include <algorithm>
 #include <string>
 #include <tuple>
 #include <vector>
@@ -93,6 +94,7 @@ class ParallelExecutorPrivate {
    }
  }
+  BuildStrategy build_strategy_;
  std::vector<platform::Place> places_;
  std::vector<Scope *> local_scopes_;
  Scope *global_scope_;  // not owned
@@ -169,6 +171,14 @@ std::unique_ptr<ir::Graph> ParallelExecutorPrivate::PrepareGCAndRefCnts(
    eager_deletion_pass->SetNotOwned(details::kAllPlaces, &places_);
    graph = eager_deletion_pass->Apply(std::move(graph));
    VLOG(10) << "EagerDeletionPass Applied";
+    // The early-delete pass is optional and driven by the build strategy.
+    if (build_strategy_.memory_early_delete_) {
+      auto early_delete_pass =
+          ir::PassRegistry::Instance().Get("memory_early_delete_pass");
+      early_delete_pass->SetNotOwned(details::kGarbageCollector, &gcs_);
+      graph = early_delete_pass->Apply(std::move(graph));
+      // Log inside the branch so the message is only emitted when the pass
+      // really ran; outside the branch it was printed even when skipped.
+      VLOG(10) << "MemoryEarlyDeletePass Applied.";
+    }
  }
  return graph;
@@ -189,6 +199,7 @@ ParallelExecutor::ParallelExecutor(
    : member_(new ParallelExecutorPrivate(places)) {
  member_->global_scope_ = scope;
  member_->use_cuda_ = exec_strategy.use_cuda_;
+  member_->build_strategy_ = build_strategy;
  member_->use_all_reduce_ =
      build_strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce;
@@ -245,7 +256,6 @@ ParallelExecutor::ParallelExecutor(
      build_strategy.Apply(main_program, member_->places_, loss_var_name,
                           params, member_->local_scopes_, member_->use_cuda_);
 #endif
  auto max_memory_size = GetEagerDeletionThreshold();
  if (max_memory_size >= 0) {
    graph = member_->PrepareGCAndRefCnts(std::move(graph),
@@ -280,10 +290,12 @@ ParallelExecutor::ParallelExecutor(
  if (exec_strategy.type_ == ExecutionStrategy::kDefault) {
    member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
-        exec_strategy, member_->local_scopes_, places, std::move(graph)));
+        exec_strategy, member_->local_scopes_, member_->places_,
+        std::move(graph)));
  } else {
    member_->executor_.reset(new details::FastThreadedSSAGraphExecutor(
-        exec_strategy, member_->local_scopes_, places, std::move(graph)));
+        exec_strategy, member_->local_scopes_, member_->places_,
+        std::move(graph)));
  }
  member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor(
@@ -423,5 +435,6 @@ ParallelExecutor::~ParallelExecutor() {
 }  // namespace framework
 }  // namespace paddle
+USE_PASS(memory_early_delete_pass);
 USE_PASS(reference_count_pass);
 USE_PASS(eager_deletion_pass);
--- a/paddle/fluid/framework/tensor_test.cc
+++ b/paddle/fluid/framework/tensor_test.cc
@@ -74,6 +74,22 @@ TEST(Tensor, MutableData) {
    p2 = src_tensor.mutable_data<float>(framework::make_ddim({2, 2}),
                                        platform::CPUPlace());
    EXPECT_EQ(p1, p2);
+    float* p3 = nullptr;
+    float* p4 = nullptr;
+    // set src_tensor a different type but smaller size.
+    // memory block is supposed to be unchanged.
+    auto* tmp = src_tensor.mutable_data<uint8_t>(framework::make_ddim({2, 2}),
+                                                 platform::CPUPlace());
+    p3 = reinterpret_cast<float*>(tmp);
+    EXPECT_EQ(p1, p3);
+    // set src_tensor a different type but bigger size.
+    // memory block is supposed to be changed.
+    auto* tmp2 = src_tensor.mutable_data<double>(
+        framework::make_ddim({2, 2, 3}), platform::CPUPlace());
+    p4 = reinterpret_cast<float*>(tmp2);
+    EXPECT_NE(p1, p4);
  }
  // Not sure if it's desired, but currently, Tensor type can be changed.
  {

--- a/paddle/fluid/framework/type_defs.h
+++ b/paddle/fluid/framework/type_defs.h
@@ -28,8 +28,11 @@ class OperatorBase;
 class OpDesc;
 class InferShapeContext;
 class BlockDesc;
+class Variable;
 using VariableNameMap = std::map<std::string, std::vector<std::string>>;
+// TODO(panyx0718): Replace vector with something like gtl::Vector.
+using VariableValueMap = std::map<std::string, std::vector<Variable*>>;
 // The order should be as same as framework.proto
 using Attribute =

--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -63,7 +63,6 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
 void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
                                            Graph *graph) const {
  auto *op_desc = node->Op();
-  static int counter{0};
  auto &subgraph = *Agent(node).subgraph();
  PADDLE_ENFORCE(!subgraph.empty());
@@ -192,8 +191,6 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
          block_desc.Proto()->SerializeAsString());
  SetAttr(op_desc->Proto(), "max_batch_size", Get<int>("max_batch_size"));
  SetAttr(op_desc->Proto(), "workspace_size", Get<int>("workspace_size"));
-  SetAttr(op_desc->Proto(), "engine_uniq_key",
-          "trt-" + std::to_string(counter++));
  SetAttr(op_desc->Proto(), "parameters", ExtractParameters(graph->Nodes()));
  SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping);
 }

--- a/paddle/fluid/inference/api/paddle_pass_builder.h
+++ b/paddle/fluid/inference/api/paddle_pass_builder.h
@@ -122,6 +122,7 @@ class GpuPassStrategy : public PassStrategy {
        "conv_bn_fuse_pass",                    //
        "conv_elementwise_add_act_fuse_pass",   //
        "conv_elementwise_add2_act_fuse_pass",  //
+        "conv_elementwise_add_fuse_pass",       //
    });
  }

--- a/paddle/fluid/inference/tensorrt/convert/op_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h
@@ -103,6 +103,7 @@ class OpConverter {
  void ConvertBlock(const framework::proto::BlockDesc& block,
                    const std::unordered_set<std::string>& parameters,
                    const framework::Scope& scope, TensorRTEngine* engine) {
+    std::unique_lock<std::mutex> lk(mut_);
    for (int i = 0; i < block.ops_size(); i++) {
      const auto& op = block.ops(i);
      ConvertOp(op, parameters, scope, engine);
@@ -125,6 +126,7 @@ class OpConverter {
  std::unordered_map<std::string, OpConverter*> converters_;
  // fluid inference scope
  framework::Scope* scope_{nullptr};
+  std::mutex mut_;
 };
 }  // namespace tensorrt

--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -30,6 +30,13 @@ function(inference_analysis_api_test_with_fake_data target install_dir filename
        ARGS --infer_model=${install_dir}/model)
 endfunction()
+function(inference_analysis_api_test_with_refer_result target install_dir filename)
+    inference_analysis_test(${target} SRCS ${filename}  # helper defined outside this hunk
+        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
+        ARGS --infer_model=${install_dir}/model --infer_data=${install_dir}/data.txt  # model dir and input file under install_dir
+             --refer_result=${install_dir}/result.txt)  # reference result file handed to the test binary
+endfunction()
 # RNN1
 if(NOT APPLE AND WITH_MKLML)
    set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1")
@@ -83,14 +90,21 @@ set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr")
 if (NOT EXISTS ${OCR_INSTALL_DIR})
    inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Focr.tar.gz")
 endif()
-inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc)
+inference_analysis_api_test_with_refer_result(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc)
+# mobilenet with transpose op
+set(MOBILENET_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet")
+if (NOT EXISTS ${MOBILENET_INSTALL_DIR})
+    inference_download_and_uncompress(${MOBILENET_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Fmobilenet.tar.gz")
+endif()
+inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose ${MOBILENET_INSTALL_DIR} analyzer_vis_tester.cc)
 # resnet50
 inference_analysis_api_test_with_fake_data(test_analyzer_resnet50
  "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz")
 # mobilenet with depthwise_conv op
-inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet
+inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_conv
  "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz")
 # anakin

--- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
@@ -93,18 +93,20 @@ void profile(bool use_mkldnn = false) {
  SetInput(&input_slots_all);
  TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
                 input_slots_all, &outputs, FLAGS_num_threads);
  if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
-    const float ocr_result_data[] = {
+    // Compare the first output tensor element-wise against the first record
+    // of the --refer_result file.
+    std::string line;
-        5.273636460856323538e-08, 3.296741795111302054e-07,
+    std::ifstream file(FLAGS_refer_result);
+    // Abort with a readable message when the reference file is missing,
+    // rather than parsing an empty line below.
+    CHECK(file.is_open()) << "Cannot open reference result file: "
+                          << FLAGS_refer_result;
-        1.873261190610264748e-08, 3.403730275408634043e-08,
+    std::getline(file, line);
-        3.383312474625199684e-08};
+    auto refer = ProcessALine(line);
-    PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
+    file.close();
-    size_t size = GetSize(outputs[0]);
-    PADDLE_ENFORCE_GT(size, 0);
+    // front() on an empty vector is undefined behaviour; fail loudly instead.
+    CHECK(!outputs.empty()) << "Predictor returned no output tensors";
+    auto &output = outputs.front();
-    float *result = static_cast<float *>(outputs[0].data.data());
+    size_t numel = output.data.length() / PaddleDtypeSize(output.dtype);
-    for (size_t i = 0; i < std::min(5UL, size); i++) {
+    CHECK_EQ(numel, refer.data.size());
-      EXPECT_NEAR(result[i], ocr_result_data[i], 1e-3);
+    for (size_t i = 0; i < numel; ++i) {
+      CHECK_LT(
+          fabs(static_cast<float *>(output.data.data())[i] - refer.data[i]),
+          1e-5);
    }
  }
 }

--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -36,6 +36,7 @@
 DEFINE_string(model_name, "", "model name");
 DEFINE_string(infer_model, "", "model path");
 DEFINE_string(infer_data, "", "data file");
+DEFINE_string(refer_result, "", "reference result for comparison");
 DEFINE_int32(batch_size, 1, "batch size.");
 DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
 DEFINE_bool(test_all_data, false, "Test the all dataset in data file.");

--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -64,9 +64,7 @@ endif()
 set(COMMON_OP_DEPS ${OP_HEADER_DEPS})
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor)
-if (NOT WIN32)
+set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
-    set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
-endif()
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions)
 if (WITH_GPU)

--- a/paddle/fluid/operators/beam_search_decode_op.cc
+++ b/paddle/fluid/operators/beam_search_decode_op.cc
@@ -122,7 +122,8 @@ class BeamSearchDecodeOp : public framework::OperatorBase {
    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
    auto& dev_ctx = *pool.Get(dev_place);
-    framework::ExecutionContext ctx(*this, scope, dev_ctx);
+    framework::RuntimeContext run_ctx(Inputs(), Outputs(), scope);
+    framework::ExecutionContext ctx(*this, scope, dev_ctx, run_ctx);
    const LoDTensorArray* ids = ctx.Input<LoDTensorArray>("Ids");
    const LoDTensorArray* scores = ctx.Input<LoDTensorArray>("Scores");

--- a/paddle/fluid/operators/controlflow/parallel_do_op.cc
+++ b/paddle/fluid/operators/controlflow/parallel_do_op.cc
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include <vector>
-#include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/threadpool.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"
-namespace paddle {
-namespace operators {
-static constexpr char kInputs[] = "inputs";
-static constexpr char kParameters[] = "parameters";
-static constexpr char kPlaces[] = "places";
-static constexpr char kOutputs[] = "outputs";
-static constexpr char kParallelScopes[] = "parallel_scopes";
-static constexpr char kParallelBlock[] = "sub_block";
-static constexpr char kUseNCCL[] = "use_nccl";
-using LoDTensor = framework::LoDTensor;
-using SelectedRows = framework::SelectedRows;
-static void SplitTensorAndMoveTensorToScopes(
-    const framework::Scope &scope, std::vector<framework::Scope *> *sub_scopes,
-    const std::vector<platform::Place> &places,
-    const std::vector<std::string> &names) {
-  size_t num_sub_scopes = 0;
-  for (auto &argu : names) {
-    const auto &tensor =
-        detail::Ref(scope.FindVar(argu),
-                    "Cannot find variable %s in the parent scope", argu)
-            .Get<LoDTensor>();
-    auto lod_tensors = tensor.SplitLoDTensor(places);
-    for (auto &lod : lod_tensors) {
-      VLOG(3) << lod.dims();
-    }
-    if (num_sub_scopes == 0) {
-      num_sub_scopes = lod_tensors.size();
-    } else {
-      PADDLE_ENFORCE_EQ(num_sub_scopes, lod_tensors.size());
-    }
-    PADDLE_ENFORCE_NE(num_sub_scopes, 0);
-    if (sub_scopes->size() == 0) {
-      sub_scopes->reserve(num_sub_scopes);
-      for (size_t i = 0; i < num_sub_scopes; ++i) {
-        sub_scopes->emplace_back(&scope.NewScope());
-      }
-    }
-    for (size_t i = 0; i < lod_tensors.size(); ++i) {
-      *detail::Ref(sub_scopes->at(i)->Var(argu),
-                   "Cannot find variable in the sub-scope", argu)
-           .GetMutable<LoDTensor>() = lod_tensors[i];
-    }
-  }
-}
-inline void CopyOrShare(const framework::Variable &src,
-                        const platform::Place &dst_place,
-                        framework::Variable *dst) {
-  if (src.IsType<LoDTensor>()) {
-    if (src.Get<LoDTensor>().place() == dst_place) {
-      dst->GetMutable<LoDTensor>()->ShareDataWith(src.Get<LoDTensor>());
-      dst->GetMutable<LoDTensor>()->set_lod(src.Get<LoDTensor>().lod());
-    } else {
-      TensorCopy(src.Get<LoDTensor>(), dst_place, dst->GetMutable<LoDTensor>());
-    }
-  } else if (src.IsType<SelectedRows>()) {
-    auto &src_sr = src.Get<SelectedRows>();
-    auto *dst_sr = dst->GetMutable<SelectedRows>();
-    dst_sr->set_height(src_sr.height());
-    if (src_sr.value().place() == dst_place) {
-      dst_sr->mutable_value()->ShareDataWith(src_sr.value());
-      dst_sr->set_rows(src_sr.rows());
-    } else {
-      TensorCopy(src_sr.value(), dst_place, dst_sr->mutable_value());
-    }
-  } else {
-    PADDLE_THROW("Expect LoDTensor/SelectedRows, get %s", src.Type().name());
-  }
-}
-void WaitOnPlace(const platform::Place place) {
-  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-  auto &dev_ctx = *pool.Get(place);
-  dev_ctx.Wait();
-}
-void WaitOnPlaces(const std::vector<platform::Place> places) {
-  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-  for (auto &place : places) {
-    auto &dev_ctx = *pool.Get(place);
-    dev_ctx.Wait();
-  }
-}
-class ParallelDoOp : public framework::OperatorBase {
- public:
-  ParallelDoOp(const std::string &type,
-               const framework::VariableNameMap &inputs,
-               const framework::VariableNameMap &outputs,
-               const framework::AttributeMap &attrs)
-      : framework::OperatorBase(type, inputs, outputs, attrs) {}
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &place) const override {
-    // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(place);
-    auto *block = Attr<framework::BlockDesc *>(kParallelBlock);
-    auto *program = block->Program();
-    auto &places = scope.FindVar(Input(kPlaces))->Get<platform::PlaceList>();
-    auto &sub_scopes = *scope.FindVar(Output(kParallelScopes))
-                            ->GetMutable<std::vector<framework::Scope *>>();
-    // split input
-    SplitTensorAndMoveTensorToScopes(scope, &sub_scopes, places,
-                                     Inputs(kInputs));
-    // copy parameter
-    for (auto &param : Inputs(kParameters)) {
-      PADDLE_ENFORCE(scope.FindVar(param)->IsType<LoDTensor>(),
-                     "Only support parameter type as LoDTensor");
-      auto &src = scope.FindVar(param)->Get<LoDTensor>();
-      auto *sub_scope0 = sub_scopes[0];
-      auto *dst0 = sub_scope0->Var(param)->GetMutable<LoDTensor>();
-      dst0->ShareDataWith(src);
-      for (size_t i = 1; i < sub_scopes.size(); ++i) {
-        auto &place = places[i];
-        auto *sub_scope = sub_scopes[i];
-        auto *dst = sub_scope->Var(param)->GetMutable<LoDTensor>();
-        framework::TensorCopy(src, place, dst);
-      }
-    }
-    WaitOnPlaces(places);
-    std::vector<std::future<void>> workers;
-    workers.reserve(places.size());
-    for (size_t place_idx = 0; place_idx < sub_scopes.size(); ++place_idx) {
-      auto &place = places[place_idx];
-      auto *cur_scope = sub_scopes[place_idx];
-      workers.emplace_back(framework::Async([program, cur_scope, place, block] {
-        framework::Executor executor(place);
-        executor.Run(*program, cur_scope, block->ID(),
-                     false /*create_local_scope*/);
-      }));
-    }
-    for (auto &worker : workers) {
-      worker.wait();
-    }
-    WaitOnPlaces(places);
-    // merge output
-    for (auto &o_name : Outputs(kOutputs)) {
-      std::vector<const framework::LoDTensor *> lod_tensors;
-      lod_tensors.reserve(sub_scopes.size());
-      for (auto *sub_scope : sub_scopes) {
-        lod_tensors.emplace_back(&sub_scope->FindVar(o_name)->Get<LoDTensor>());
-      }
-      auto *lod_tensor_to_be_merged =
-          scope.FindVar(o_name)->GetMutable<LoDTensor>();
-      lod_tensor_to_be_merged->MergeLoDTensor(lod_tensors, dev_ctx.GetPlace());
-    }
-    WaitOnPlaces(places);
-  }
-};
-class ParallelDoOpProtoMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput(kInputs, "").AsDuplicable();
-    AddInput(kParameters, "").AsDuplicable();
-    AddInput(kPlaces, "");
-    AddOutput(kOutputs, "").AsDuplicable();
-    AddOutput(kParallelScopes, "");
-    AddAttr<framework::BlockDesc *>(kParallelBlock, "");
-    AddAttr<bool>(kUseNCCL, "true if we use nccl on backward")
-        .SetDefault(false);
-    AddComment(R"DOC(
-ParallelDo Operator.
-)DOC");
-  }
-};
-class ParallelDoGradOp : public framework::OperatorBase {
- public:
-  ParallelDoGradOp(const std::string &type,
-                   const framework::VariableNameMap &inputs,
-                   const framework::VariableNameMap &outputs,
-                   const framework::AttributeMap &attrs)
-      : framework::OperatorBase(type, inputs, outputs, attrs) {}
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &place) const override {
-    auto *block = Attr<framework::BlockDesc *>(kParallelBlock);
-    auto *program = block->Program();
-    auto &sub_scopes = scope.FindVar(Input(kParallelScopes))
-                           ->Get<std::vector<framework::Scope *>>();
-    auto &places = scope.FindVar(Input(kPlaces))->Get<platform::PlaceList>();
-    // feed output@grad
-    SplitTensorAndMoveTensorToScopes(
-        scope, const_cast<std::vector<framework::Scope *> *>(&sub_scopes),
-        places, Inputs(framework::GradVarName(kOutputs)));
-    WaitOnPlaces(places);
-    // exe run
-    std::vector<std::future<void>> workers;
-    for (size_t i = 0; i < sub_scopes.size(); ++i) {
-      auto &place = places[i];
-      auto *cur_scope = sub_scopes[i];
-      // execute
-      workers.emplace_back(framework::Async([program, cur_scope, place, block] {
-        framework::Executor executor(place);
-        executor.Run(*program, cur_scope, block->ID(),
-                     false /*create_local_scope*/);
-      }));
-    }
-    for (auto &worker : workers) {
-      worker.wait();
-    }
-    WaitOnPlaces(places);
-    // NCCL allreduce op will be added by backward,
-    // so no need to explicitly accumulate grad
-    if (!(Attr<bool>(kUseNCCL))) {
-      AccumulateGrad(scope, place, sub_scopes, places);
-    } else {
-      for (auto &place : places) {
-        PADDLE_ENFORCE(platform::is_gpu_place(place),
-                       "NCCL only supports cuda place");
-      }
-    }
-    for (auto &s : Outputs(framework::GradVarName(kParameters))) {
-      if (s == framework::kEmptyVarName) {
-        continue;
-      }
-      VLOG(3) << "Moving " << s;
-      CopyOrShare(*sub_scopes[0]->FindVar(s), place, scope.FindVar(s));
-    }
-    WaitOnPlaces(places);
-  }
-  void AccumulateGrad(const framework::Scope &scope,
-                      const platform::Place &place,
-                      const std::vector<framework::Scope *> &sub_scopes,
-                      const platform::PlaceList &places) const {
-    for (auto &s : Outputs(framework::GradVarName(kParameters))) {
-      if (s == framework::kEmptyVarName) {
-        continue;
-      }
-      VLOG(3) << "Accumulating " << s;
-      if (s == framework::kEmptyVarName) continue;
-      std::string tmp_name;
-      auto *tmp = sub_scopes[0]->Var(&tmp_name);
-      for (size_t i = 1; i < sub_scopes.size(); ++i) {
-        CopyOrShare(*sub_scopes[i]->FindVar(s), places[0], tmp);
-        WaitOnPlaces(places);
-        auto sum_op = framework::OpRegistry::CreateOp(
-            "sum", {{"X", {s, tmp_name}}}, {{"Out", {s}}},
-            framework::AttributeMap{{"use_mkldnn", {false}}});
-        VLOG(10) << sum_op->DebugStringEx(sub_scopes[0]);
-        sum_op->Run(*sub_scopes[0], places[0]);
-        WaitOnPlace(places[0]);
-      }
-      CopyOrShare(*sub_scopes[0]->FindVar(s), place, scope.FindVar(s));
-    }
-    WaitOnPlaces(places);
-  }
-};
-std::ostream &operator<<(std::ostream &sout,
-                         const std::vector<std::string> &strs) {
-  std::copy(strs.begin(), strs.end(),
-            std::ostream_iterator<std::string>(sout, ","));
-  return sout;
-}
-class ParallelDoGradOpDescMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
- protected:
-  virtual std::unique_ptr<framework::OpDesc> Apply() const {
-    auto *grad = new framework::OpDesc();
-    grad->SetType("parallel_do_grad");
-    for (auto &input_param : this->InputNames()) {
-      VLOG(3) << input_param;
-      grad->SetInput(input_param, this->Input(input_param));
-      if (input_param != kPlaces) {
-        grad->SetOutput(framework::GradVarName(input_param),
-                        this->InputGrad(input_param, false));
-      }
-    }
-    auto *g_block = this->grad_block_[0];
-    // All variable name that needed by gradient operators
-    std::unordered_set<std::string> all_inputs_in_grad_blocks;
-    for (size_t i = 0; i < g_block->OpSize(); ++i) {
-      auto *op = g_block->Op(i);
-      for (auto &var_name : op->InputArgumentNames()) {
-        all_inputs_in_grad_blocks.insert(var_name);
-      }
-    }
-    for (auto &output_param : this->OutputNames()) {
-      if (output_param == kParallelScopes) {
-        grad->SetInput(output_param, this->Output(output_param));
-        grad->SetInput(framework::GradVarName(output_param),
-                       this->Output(output_param));
-      } else {
-        grad->SetInput(output_param, this->Output(output_param));
-        std::vector<std::string> og_names;
-        for (auto &og_name : this->OutputGrad(output_param)) {
-          if (all_inputs_in_grad_blocks.count(og_name) != 0) {
-            // there are some gradient operators who need the OG. So make this
-            // OG as an input of parallel.do
-            og_names.push_back(og_name);
-          }
-          // else, there is no operator who need the OG. Do not use this OG as
-          // an input
-        }
-        grad->SetInput(framework::GradVarName(output_param), og_names);
-      }
-    }
-    grad->SetInput("Communicator", {"nccl_com__do_not_change_"});
-    grad->SetAttrMap(this->Attrs());
-    grad->SetBlockAttr(kParallelBlock, grad_block_[0]);
-    return std::unique_ptr<framework::OpDesc>(grad);
-  }
-};
-class ParallelDoGradOpShapeInference : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInputs(kParameters));
-    PADDLE_ENFORCE(ctx->HasInputs(kInputs));
-    PADDLE_ENFORCE(ctx->HasInputs(kOutputs));
-    ctx->SetOutputsDim(framework::GradVarName(kParameters),
-                       ctx->GetInputsDim(kParameters));
-    auto i_dims = ctx->GetInputsDim(kInputs);
-    auto ig_names = ctx->Outputs(framework::GradVarName(kInputs));
-    for (size_t i = 0; i < ig_names.size(); ++i) {
-      auto &ig_name = ig_names[i];
-      if (ig_name == framework::kEmptyVarName) {
-        continue;
-      }
-      ctx->SetDims({ig_name}, {i_dims[i]});
-    }
-    auto p_dims = ctx->GetInputsDim(kParameters);
-    auto pg_names = ctx->Outputs(framework::GradVarName(kParameters));
-    for (size_t i = 0; i < pg_names.size(); ++i) {
-      auto &pg_name = pg_names[i];
-      if (pg_name == framework::kEmptyVarName) {
-        continue;
-      }
-      ctx->SetDims({pg_name}, {p_dims[i]});
-    }
-  }
-};
-class ParallelDoGradOpVarTypeInference : public framework::VarTypeInference {
- public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const override {
-    framework::BlockDesc *sub_block =
-        boost::get<framework::BlockDesc *>(op_desc.GetAttr(kParallelBlock));
-    for (auto &out_vars : op_desc.Outputs()) {
-      for (auto &out_var : out_vars.second) {
-        auto &var = block->FindRecursiveOrCreateVar(out_var);
-        auto sub_var = sub_block->FindRecursiveOrCreateVar(out_var);
-        if (sub_var.GetType() != var.GetType()) {
-          var.SetType(sub_var.GetType());
-        }
-      }
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-REGISTER_OPERATOR(parallel_do, paddle::operators::ParallelDoOp,
-                  paddle::operators::ParallelDoOpProtoMaker,
-                  paddle::operators::ParallelDoGradOpDescMaker);
-REGISTER_OPERATOR(parallel_do_grad, paddle::operators::ParallelDoGradOp,
-                  paddle::operators::ParallelDoGradOpShapeInference,
-                  paddle::operators::ParallelDoGradOpVarTypeInference);
--- a/paddle/fluid/operators/hierarchical_sigmoid_op.h
+++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h
@@ -150,19 +150,27 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
                                                       label.data<int64_t>()));
    }
-    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
+    // softrelu derivative
-    auto pre_out_mat = EigenMatrix<T>::From(pre_out);
-    auto pre_out_grad_mat = EigenMatrix<T>::From(pre_out_grad);
-    auto out_grad_mat = EigenMatrix<T>::From(out_grad);
-    Eigen::array<int, 2> bcast{1, static_cast<int>(pre_out_grad.dims()[1])};
+    auto blas = math::GetBlas<DeviceContext, T>(ctx);
-    // softrelu derivative
+    auto* pre_out_grad_data = pre_out_grad.data<T>();
-    pre_out_grad_mat.device(place) =
+    auto* pre_out_data = pre_out.data<T>();
-        static_cast<T>(1.0) - static_cast<T>(1.0) / pre_out_mat.exp();
+    auto n = pre_out.numel();
+    blas.VEXP(n, pre_out_data, pre_out_grad_data);
+    blas.VINV(n, pre_out_grad_data, pre_out_grad_data);
+    for (int64_t i = 0; i < n; ++i) {
+      pre_out_grad_data[i] = 1.0 - pre_out_grad_data[i];
+    }
    bit_code->Sub(&pre_out_grad);  // the gradient of clip(w * x + b)
-    pre_out_grad_mat.device(place) =
+    auto* out_grad_data = out_grad.data<T>();
-        pre_out_grad_mat * out_grad_mat.broadcast(bcast);
+    int64_t dim0 = pre_out_grad.dims()[0];
+    int64_t dim1 = pre_out_grad.dims()[1];
+    for (int64_t i = 0; i < dim0; ++i) {
+      T tmp = out_grad_data[i];
+      blas.SCAL(dim1, tmp, pre_out_grad_data + i * dim1);
+    }
    // TODO(guosheng): multiply pre_out_grad with subgradient of clipping to
    // be consistent with the clipping in forward.

--- a/paddle/fluid/operators/huber_loss_op.cc
+++ b/paddle/fluid/operators/huber_loss_op.cc
@@ -124,8 +124,9 @@ REGISTER_OPERATOR(huber_loss, ops::HuberLossOp, ops::HuberLossOpMaker<float>,
                  paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(huber_loss_grad, ops::HuberLossGradOp);
 REGISTER_OP_CPU_KERNEL(
-    huber_loss,
+    huber_loss, ops::HuberLossKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::HuberLossKernel<paddle::platform::CPUDeviceContext, float>);
+    ops::HuberLossKernel<paddle::platform::CPUDeviceContext, double>);
 REGISTER_OP_CPU_KERNEL(
    huber_loss_grad,
-    ops::HuberLossGradKernel<paddle::platform::CPUDeviceContext, float>);
+    ops::HuberLossGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::HuberLossGradKernel<paddle::platform::CPUDeviceContext, double>);
--- a/paddle/fluid/operators/math/blas.h
+++ b/paddle/fluid/operators/math/blas.h
@@ -181,6 +181,9 @@ class Blas {
              const framework::Tensor& mat_b, const MatDescriptor& dim_b,
              T alpha, framework::Tensor* mat_out, T beta) const;
+  template <typename T>
+  void VINV(int n, const T* a, T* y) const;
 private:
  const DeviceContext& context_;
 };
@@ -282,6 +285,11 @@ class BlasT : private Blas<DeviceContext> {
    Base()->template BatchedGEMM<T>(args...);
  }
+  template <typename... ARGS>
+  void VINV(ARGS... args) const {
+    Base()->template VINV<T>(args...);
+  }
 private:
  const Blas<DeviceContext>* Base() const {
    return static_cast<const Blas<DeviceContext>*>(this);

--- a/paddle/fluid/operators/math/blas_impl.h
+++ b/paddle/fluid/operators/math/blas_impl.h
@@ -118,6 +118,11 @@ struct CBlas<float> {
  static void VPOW(ARGS... args) {
    platform::dynload::vsPowx(args...);
  }
+  template <typename... ARGS>
+  static void VINV(ARGS... args) {
+    platform::dynload::vsInv(args...);
+  }
 };
 template <>
@@ -213,6 +218,11 @@ struct CBlas<double> {
  static void VPOW(ARGS... args) {
    platform::dynload::vdPowx(args...);
  }
+  template <typename... ARGS>
+  static void VINV(ARGS... args) {
+    platform::dynload::vdInv(args...);
+  }
 };
 #else
@@ -603,6 +613,17 @@ void Blas<DeviceContext>::MatMul(const framework::Tensor &mat_a,
        dim_a.stride_, dim_b.stride_);
  }
 }
+template <typename DeviceContext>
+template <typename T>
+void Blas<DeviceContext>::VINV(int n, const T *a, T *y) const {
+#ifdef PADDLE_WITH_MKLML
+  CBlas<T>::VINV(n, a, y);
+#else
+  for (int i = 0; i < n; ++i) {
+    y[i] = 1.0 / a[i];
+  }
+#endif
+}
 }  // namespace math
 }  // namespace operators

--- a/paddle/fluid/operators/math/matrix_bit_code.cc
+++ b/paddle/fluid/operators/math/matrix_bit_code.cc
@@ -14,218 +14,380 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/matrix_bit_code.h"
 #include <iostream>
+#include <map>
 namespace paddle {
 namespace operators {
 namespace math {
 template <typename T>
-void MatrixBitCodeFunctor<T>::Add(const framework::Tensor& vec,
+struct MatrixBitCodeFunctorAdd : public boost::static_visitor<void> {
-                                  framework::Tensor* tmat) {
+  const framework::Tensor &vec_;
-  size_t batch_size = tmat->dims()[0];
+  framework::Tensor *tmat_;
-  size_t width = tmat->dims()[1];
-  for (size_t i = 0; i < batch_size; ++i) {
+  MatrixBitCodeFunctorAdd(const framework::Tensor &vec, framework::Tensor *tmat)
-    auto code = code_table_->get_code(i);
+      : vec_(vec), tmat_(tmat) {}
-    int code_length = code->get_length();
-    for (int j = 0; j < code_length; ++j) {
+  template <typename CodeTable>
-      size_t index = code->calc_index(j);
+  void operator()(const CodeTable &code_table) {
-      tmat->data<T>()[i * width + j] += vec.data<T>()[index];
+    size_t batch_size = tmat_->dims()[0];
+    size_t width = tmat_->dims()[1];
+    auto *tmat_data = tmat_->data<T>();
+    auto *vec_data = vec_.data<T>();
+    for (size_t i = 0; i < batch_size; ++i) {
+      auto code = code_table.get_code(i);
+      int code_length = code.get_length();
+      for (int j = 0; j < code_length; ++j) {
+        size_t index = code.calc_index(j);
+        tmat_data[i * width + j] += vec_data[index];
+      }
    }
  }
+};
+template <typename T>
+void MatrixBitCodeFunctor<T>::Add(const framework::Tensor &vec,
+                                  framework::Tensor *tmat) {
+  MatrixBitCodeFunctorAdd<T> func(vec, tmat);
+  code_table_.apply_visitor(func);
 }
 template <typename T>
-void MatrixBitCodeFunctor<T>::AddGrad(const framework::Tensor& tmat,
+struct MatrixBitCodeFunctorAddGrad : public boost::static_visitor<void> {
-                                      framework::Tensor* vec) {
+  const framework::Tensor &tmat_;
-  size_t batch_size = tmat.dims()[0];
+  framework::Tensor *vec_;
-  size_t width = tmat.dims()[1];
+  MatrixBitCodeFunctorAddGrad(const framework::Tensor &tmat,
-  for (size_t i = 0; i < batch_size; ++i) {
+                              framework::Tensor *vec)
-    auto code = code_table_->get_code(i);
+      : tmat_(tmat), vec_(vec) {}
-    int code_length = code->get_length();
-    for (int j = 0; j < code_length; ++j) {
+  template <typename CodeTable>
-      size_t index = code->calc_index(j);
+  void operator()(const CodeTable &table) {
-      vec->data<T>()[index] += tmat.data<T>()[i * width + j];
+    size_t batch_size = tmat_.dims()[0];
+    size_t width = tmat_.dims()[1];
+    auto *vec_data = vec_->data<T>();
+    auto *tmat_data = tmat_.data<T>();
+    for (size_t i = 0; i < batch_size; ++i) {
+      auto code = table.get_code(i);
+      int code_length = code.get_length();
+      for (int j = 0; j < code_length; ++j) {
+        size_t index = code.calc_index(j);
+        vec_data[index] += tmat_data[i * width + j];
+      }
    }
  }
+};
+template <typename T>
+void MatrixBitCodeFunctor<T>::AddGrad(const framework::Tensor &tmat,
+                                      framework::Tensor *vec) {
+  MatrixBitCodeFunctorAddGrad<T> func(tmat, vec);
+  code_table_.apply_visitor(func);
 }
 template <typename T>
-void MatrixBitCodeFunctor<T>::AddGrad(const framework::Tensor& tmat,
+struct MatrixBitCodeFunctorSelectedRowsAddGrad
-                                      framework::SelectedRows* vec) {
+    : public boost::static_visitor<void> {
-  size_t batch_size = tmat.dims()[0];
+  const framework::Tensor &tmat_;
-  size_t width = tmat.dims()[1];
+  framework::SelectedRows *vec_;
-  for (size_t i = 0; i < batch_size; ++i) {
-    auto code = code_table_->get_code(i);
+  MatrixBitCodeFunctorSelectedRowsAddGrad(const framework::Tensor &tmat,
-    int code_length = code->get_length();
+                                          framework::SelectedRows *vec)
-    for (int j = 0; j < code_length; ++j) {
+      : tmat_(tmat), vec_(vec) {}
-      size_t index = code->calc_index(j);
-      int64_t row_index = vec->GetIndexFromId(static_cast<int64_t>(index));
+  template <typename CodeTable>
-      vec->mutable_value()->data<T>()[row_index] +=
+  void operator()(const CodeTable &code_table) {
-          tmat.data<T>()[i * width + j];
+    size_t batch_size = tmat_.dims()[0];
+    size_t width = tmat_.dims()[1];
+    auto *vec_data = vec_->mutable_value()->template data<T>();
+    auto *tmat_data = tmat_.data<T>();
+    for (size_t i = 0; i < batch_size; ++i) {
+      auto code = code_table.get_code(i);
+      int code_length = code.get_length();
+      for (int j = 0; j < code_length; ++j) {
+        size_t index = code.calc_index(j);
+        int64_t row_index = vec_->GetIndexFromId(static_cast<int64_t>(index));
+        vec_data[row_index] += tmat_data[i * width + j];
+      }
    }
  }
+};
+template <typename T>
+void MatrixBitCodeFunctor<T>::AddGrad(const framework::Tensor &tmat,
+                                      framework::SelectedRows *vec) {
+  MatrixBitCodeFunctorSelectedRowsAddGrad<T> func(tmat, vec);
+  code_table_.apply_visitor(func);
 }
 template <typename T>
-void MatrixBitCodeFunctor<T>::Sum(const framework::Tensor& tmat,
+struct MatrixBitCodeFunctorSum : public boost::static_visitor<void> {
-                                  framework::Tensor* sum, T scale_sum) {
+  const framework::Tensor &tmat_;
-  size_t num_samples = tmat.dims()[0];
+  framework::Tensor *sum_;
-  size_t o_width = tmat.dims()[1];
+  T scale_sum_;
-  for (size_t i = 0; i < num_samples; ++i) {
-    T sm = static_cast<T>(0.0);
+  MatrixBitCodeFunctorSum(const framework::Tensor &tmat, framework::Tensor *sum,
-    auto code = code_table_->get_code(i);
+                          T scale_sum)
-    int code_length = code->get_length();
+      : tmat_(tmat), sum_(sum), scale_sum_(scale_sum) {}
-    for (int j = 0; j < code_length; ++j) {
-      if (code->calc_bit(j)) {
+  template <typename CodeTable>
-        // calc_bit starts from right most bit, while data in tmat[i] is in the
+  void operator()(const CodeTable &code_table) {
-        // reverse order.
+    size_t num_samples = tmat_.dims()[0];
-        sm += tmat.data<T>()[i * o_width + j];
+    size_t o_width = tmat_.dims()[1];
+    auto *tmat_data = tmat_.data<T>();
+    auto *sum_data = sum_->data<T>();
+    for (size_t i = 0; i < num_samples; ++i) {
+      T sm = static_cast<T>(0.0);
+      auto code = code_table.get_code(i);
+      int code_length = code.get_length();
+      for (int j = 0; j < code_length; ++j) {
+        if (code.calc_bit(j)) {
+          // calc_bit starts from the rightmost bit,
+          // while the data in tmat[i] is in
+          // the reverse order.
+          sm += tmat_data[i * o_width + j];
+        }
      }
+      sum_data[i] = scale_sum_ * sm;
    }
-    sum->data<T>()[i] = scale_sum * sm;
  }
+};
+template <typename T>
+void MatrixBitCodeFunctor<T>::Sum(const framework::Tensor &tmat,
+                                  framework::Tensor *sum, T scale_sum) {
+  MatrixBitCodeFunctorSum<T> func(tmat, sum, scale_sum);
+  code_table_.apply_visitor(func);
 }
 template <typename T>
-void MatrixBitCodeFunctor<T>::Mul(framework::Tensor* tmat,
+struct MatrixBitCodeFunctorMul : public boost::static_visitor<void> {
-                                  const framework::Tensor& weight,
+  framework::Tensor *tmat_;
-                                  const framework::Tensor& input) {
+  const framework::Tensor &weight_;
-  auto blas =
+  const framework::Tensor &input_;
-      GetBlas<platform::CPUDeviceContext, T>(platform::CPUDeviceContext());
-  size_t num_samples = tmat->dims()[0];
+  MatrixBitCodeFunctorMul(framework::Tensor *tmat,
-  size_t tmat_width = tmat->dims()[1];
+                          const framework::Tensor &weight,
-  size_t input_width = input.dims()[1];
+                          const framework::Tensor &input)
-  size_t weight_width = weight.dims()[1];
+      : tmat_(tmat), weight_(weight), input_(input) {}
-  auto tmat_value = tmat->data<T>();
-  auto weight_value = weight.data<T>();
+  template <typename CodeTable>
-  auto input_value = input.data<T>();
+  void operator()(const CodeTable &code_table) {
-  for (size_t i = 0; i < num_samples; ++i) {
+    auto blas =
-    auto code = code_table_->get_code(i);
+        GetBlas<platform::CPUDeviceContext, T>(platform::CPUDeviceContext());
-    int code_length = code->get_length();
+    size_t num_samples = tmat_->dims()[0];
-    const T* input_row = input_value + input_width * i;
+    size_t tmat_width = tmat_->dims()[1];
-    for (int j = 0; j < code_length; ++j) {
+    size_t input_width = input_.dims()[1];
-      size_t index = code->calc_index(j);
+    size_t weight_width = weight_.dims()[1];
-      const T* weight_row = weight_value + weight_width * index;
+    auto tmat_value = tmat_->data<T>();
-      T sum = static_cast<T>(0.0);
+    auto weight_value = weight_.data<T>();
-      sum = blas.DOT(input_width, weight_row, input_row);
+    auto input_value = input_.data<T>();
-      tmat_value[i * tmat_width + j] += sum;
+    for (size_t i = 0; i < num_samples; ++i) {
+      auto code = code_table.get_code(i);
+      int code_length = code.get_length();
+      const T *input_row = input_value + input_width * i;
+      for (int j = 0; j < code_length; ++j) {
+        size_t index = code.calc_index(j);
+        const T *weight_row = weight_value + weight_width * index;
+        T sum = blas.DOT(input_width, weight_row, input_row);
+        tmat_value[i * tmat_width + j] += sum;
+      }
    }
  }
+};
+template <typename T>
+void MatrixBitCodeFunctor<T>::Mul(framework::Tensor *tmat,
+                                  const framework::Tensor &weight,
+                                  const framework::Tensor &input) {
+  MatrixBitCodeFunctorMul<T> func(tmat, weight, input);
+  code_table_.apply_visitor(func);
 }
+template <typename T, size_t N>
+class ReservedVector : public std::vector<T> {
+ public:
+  ReservedVector() { this->reserve(N); }
+};
 template <typename T>
-void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor& tmat,
+struct MatrixBitCodeFunctorMulGradWeight : public boost::static_visitor<void> {
-                                            framework::Tensor* weight,
+  const framework::Tensor &tmat_;
-                                            const framework::Tensor& input) {
+  framework::Tensor *weight_;
-  auto blas =
+  const framework::Tensor &input_;
-      GetBlas<platform::CPUDeviceContext, T>(platform::CPUDeviceContext());
+  MatrixBitCodeFunctorMulGradWeight(const framework::Tensor &tmat,
-  size_t num_samples = tmat.dims()[0];
+                                    framework::Tensor *weight,
-  size_t input_width = input.dims()[1];
+                                    const framework::Tensor &input)
-  size_t tmat_width = tmat.dims()[1];
+      : tmat_(tmat), weight_(weight), input_(input) {}
-  size_t weight_width = weight->dims()[1];
+  template <typename CodeTable>
-  auto tmat_value = tmat.data<T>();
+  void operator()(const CodeTable &code_table) {
-  auto weight_value = weight->data<T>();
+    auto blas =
-  auto input_value = input.data<T>();
+        GetBlas<platform::CPUDeviceContext, T>(platform::CPUDeviceContext());
+    size_t num_samples = tmat_.dims()[0];
-  std::unordered_map<int, std::vector<std::pair<T, const T*>>> ops;
+    size_t input_width = input_.dims()[1];
+    size_t tmat_width = tmat_.dims()[1];
-  for (size_t i = 0; i < num_samples; ++i) {
+    size_t weight_width = weight_->dims()[1];
-    auto code = code_table_->get_code(i);
+    auto tmat_value = tmat_.data<T>();
-    int code_length = code->get_length();
+    auto weight_value = weight_->data<T>();
-    const T* input_value_row = input_value + input_width * i;
+    auto input_value = input_.data<T>();
-    const T* tmat_row = tmat_value + i * tmat_width;
-    for (int j = 0; j < code_length; ++j) {
+    std::map<int, ReservedVector<std::pair<T, const T *>, 8u>> ops;
-      ops[code->calc_index(j)].emplace_back(tmat_row[j], input_value_row);
+    for (size_t i = 0; i < num_samples; ++i) {
+      auto code = code_table.get_code(i);
+      int code_length = code.get_length();
+      const T *input_value_row = input_value + input_width * i;
+      const T *tmat_row = tmat_value + i * tmat_width;
+      for (int j = 0; j < code_length; ++j) {
+        ops[code.calc_index(j)].emplace_back(tmat_row[j], input_value_row);
+      }
    }
-  }
+    for (auto &op : ops) {
-  for (auto& op : ops) {
+      auto &op_in_row = op.second;
-    auto& op_in_row = op.second;
+      for (auto &pair : op_in_row) {
-    for (auto& pair : op_in_row) {
+        auto &scale = pair.first;
-      auto& scale = pair.first;
+        auto *input_row = pair.second;
-      auto* input_row = pair.second;
+        T *weight_row = weight_value + op.first * weight_width;
-      T* weight_row = weight_value + op.first * weight_width;
+        blas.AXPY(input_width, scale, input_row, weight_row);
-      blas.AXPY(input_width, scale, input_row, weight_row);
+      }
    }
  }
+};
+template <typename T>
+void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor &tmat,
+                                            framework::Tensor *weight,
+                                            const framework::Tensor &input) {
+  MatrixBitCodeFunctorMulGradWeight<T> func(tmat, weight, input);
+  code_table_.apply_visitor(func);
 }
 template <typename T>
-void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor& tmat,
+struct MatrixBitCodeFunctorMulGradWeightSR
-                                            framework::SelectedRows* weight,
+    : public boost::static_visitor<void> {
-                                            const framework::Tensor& input) {
+  const framework::Tensor &tmat_;
-  auto blas =
+  framework::SelectedRows *weight_;
-      GetBlas<platform::CPUDeviceContext, T>(platform::CPUDeviceContext());
+  const framework::Tensor &input_;
-  size_t num_samples = tmat.dims()[0];
-  size_t input_width = input.dims()[1];
+  MatrixBitCodeFunctorMulGradWeightSR(const framework::Tensor &tmat,
-  size_t tmat_width = tmat.dims()[1];
+                                      framework::SelectedRows *weight,
-  size_t weight_width = weight->value().dims()[1];
+                                      const framework::Tensor &input)
-  auto tmat_value = tmat.data<T>();
+      : tmat_(tmat), weight_(weight), input_(input) {}
-  auto weight_value = weight->mutable_value()->data<T>();
-  auto input_value = input.data<T>();
+  template <typename CodeTable>
+  void operator()(const CodeTable &code_table) {
-  std::unordered_map<int, std::vector<std::pair<T, const T*>>> ops;
+    auto blas =
-  ops.reserve(weight->rows().size());
+        GetBlas<platform::CPUDeviceContext, T>(platform::CPUDeviceContext());
+    size_t num_samples = tmat_.dims()[0];
-  for (size_t i = 0; i < num_samples; ++i) {
+    size_t input_width = input_.dims()[1];
-    auto code = code_table_->get_code(i);
+    size_t tmat_width = tmat_.dims()[1];
-    int code_length = code->get_length();
+    size_t weight_width = weight_->value().dims()[1];
-    const T* input_value_row = input_value + input_width * i;
+    auto tmat_value = tmat_.data<T>();
-    const T* tmat_row = tmat_value + i * tmat_width;
+    auto weight_value = weight_->mutable_value()->data<T>();
-    for (int j = 0; j < code_length; ++j) {
+    auto input_value = input_.data<T>();
-      ops[code->calc_index(j)].emplace_back(tmat_row[j], input_value_row);
+    std::unordered_map<int, std::vector<std::pair<T, const T *>>> ops;
+    ops.reserve(weight_->rows().size());
+    for (size_t i = 0; i < num_samples; ++i) {
+      auto code = code_table.get_code(i);
+      int code_length = code.get_length();
+      const T *input_value_row = input_value + input_width * i;
+      const T *tmat_row = tmat_value + i * tmat_width;
+      for (int j = 0; j < code_length; ++j) {
+        ops[code.calc_index(j)].emplace_back(tmat_row[j], input_value_row);
+      }
    }
-  }
-  for (auto& row : weight->rows()) {
+    for (auto &row : weight_->rows()) {
-    auto& op_in_row = ops[row];
+      auto &op_in_row = ops[row];
-    for (auto& pair : op_in_row) {
+      for (auto &pair : op_in_row) {
-      auto& scale = pair.first;
+        auto &scale = pair.first;
-      auto* input_row = pair.second;
+        auto *input_row = pair.second;
-      blas.AXPY(input_width, scale, input_row, weight_value);
+        blas.AXPY(input_width, scale, input_row, weight_value);
+      }
+      weight_value += weight_width;
    }
-    weight_value += weight_width;
  }
+};
+template <typename T>
+void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor &tmat,
+                                            framework::SelectedRows *weight,
+                                            const framework::Tensor &input) {
+  MatrixBitCodeFunctorMulGradWeightSR<T> func(tmat, weight, input);
+  code_table_.apply_visitor(func);
 }
 template <typename T>
-void MatrixBitCodeFunctor<T>::MulGradError(const framework::Tensor& tmat,
+struct MatrixBitCodeFunctorMulGradError : public boost::static_visitor<void> {
-                                           const framework::Tensor& weight,
+  const framework::Tensor &tmat_;
-                                           framework::Tensor* input) {
+  const framework::Tensor &weight_;
-  size_t num_samples = tmat.dims()[0];
+  framework::Tensor *input_;
-  size_t tmat_width = tmat.dims()[1];
-  size_t input_width = input->dims()[1];
+  MatrixBitCodeFunctorMulGradError(const framework::Tensor &tmat,
-  size_t weight_width = weight.dims()[1];
+                                   const framework::Tensor &weight,
-  auto tmat_value = tmat.data<T>();
+                                   framework::Tensor *input)
-  auto weight_value = weight.data<T>();
+      : tmat_(tmat), weight_(weight), input_(input) {}
-  auto input_value = input->data<T>();
+  template <typename CodeTable>
+  void operator()(const CodeTable &code_table) {
-  for (size_t i = 0; i < num_samples; ++i) {
+    size_t num_samples = tmat_.dims()[0];
-    auto code = code_table_->get_code(i);
+    size_t tmat_width = tmat_.dims()[1];
-    int code_length = code->get_length();
+    size_t input_width = input_->dims()[1];
-    for (int j = 0; j < code_length; ++j) {
+    size_t weight_width = weight_.dims()[1];
-      size_t index = code->calc_index(j);
+    auto tmat_value = tmat_.data<T>();
+    auto weight_value = weight_.data<T>();
-      for (size_t k = 0; k < input_width; ++k) {
+    auto input_value = input_->data<T>();
-        input_value[input_width * i + k] +=
-            tmat_value[i * tmat_width + j] *
+    for (size_t i = 0; i < num_samples; ++i) {
-            weight_value[weight_width * index + k];
+      auto code = code_table.get_code(i);
+      int code_length = code.get_length();
+      for (int j = 0; j < code_length; ++j) {
+        size_t index = code.calc_index(j);
+        for (size_t k = 0; k < input_width; ++k) {
+          input_value[input_width * i + k] +=
+              tmat_value[i * tmat_width + j] *
+              weight_value[weight_width * index + k];
+        }
      }
    }
  }
+};
+template <typename T>
+void MatrixBitCodeFunctor<T>::MulGradError(const framework::Tensor &tmat,
+                                           const framework::Tensor &weight,
+                                           framework::Tensor *input) {
+  MatrixBitCodeFunctorMulGradError<T> func(tmat, weight, input);
+  code_table_.apply_visitor(func);
 }
 template <typename T>
-void MatrixBitCodeFunctor<T>::Sub(framework::Tensor* tmat) {
+struct MatrixBitCodeFunctorSub : public boost::static_visitor<void> {
-  size_t num_samples = tmat->dims()[0];
+  framework::Tensor *tmat_;
-  size_t o_width = tmat->dims()[1];
-  for (size_t i = 0; i < num_samples; ++i) {
+  explicit MatrixBitCodeFunctorSub(framework::Tensor *tmat) : tmat_(tmat) {}
-    auto code = code_table_->get_code(i);
-    int code_length = code->get_length();
+  template <typename CodeTable>
-    for (int j = 0; j < code_length; ++j) {
+  void operator()(const CodeTable &code_table) {
-      if (code->calc_bit(j)) {
+    size_t num_samples = tmat_->dims()[0];
-        tmat->data<T>()[i * o_width + j] -= 1;
+    size_t o_width = tmat_->dims()[1];
+    auto *tmat_data = tmat_->data<T>();
+    for (size_t i = 0; i < num_samples; ++i) {
+      auto code = code_table.get_code(i);
+      int code_length = code.get_length();
+      for (int j = 0; j < code_length; ++j) {
+        if (code.calc_bit(j)) {
+          tmat_data[i * o_width + j] -= 1;
+        }
      }
    }
  }
+};
+template <typename T>
+void MatrixBitCodeFunctor<T>::Sub(framework::Tensor *tmat) {
+  MatrixBitCodeFunctorSub<T> func(tmat);
+  code_table_.apply_visitor(func);
 }
 template class MatrixBitCodeFunctor<float>;

--- a/paddle/fluid/operators/math/matrix_bit_code.h
+++ b/paddle/fluid/operators/math/matrix_bit_code.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#include <map>
 #include <unordered_map>
 #include <utility>
 #include <vector>
@@ -22,6 +23,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/variant.h"
 #if defined(_WIN32)
 #include <intrin.h>
@@ -98,24 +100,7 @@ inline int clz(const T& value) {
 inline size_t FindLastSet(size_t x) { return sizeof(size_t) * 8 - clz(x); }
 #endif  // !_WIN32
-// set a code interface to create multiple code
+class SimpleCode {
-class Code {
- public:
-  virtual ~Code() {}
-  virtual size_t calc_index(int bit) const = 0;
-  virtual bool calc_bit(int bit) const = 0;
-  virtual int get_length() const = 0;
-};
-// set a CodeTable interface to create multiple code table
-class CodeTable {
- public:
-  virtual std::unique_ptr<Code> get_code(int64_t code) const = 0;
-  virtual size_t size() const = 0;
-  virtual int get_max_code_length() const = 0;
-  virtual ~CodeTable() {}
-};
-class SimpleCode : public Code {
 public:
  SimpleCode(size_t code, size_t num_classes, const int64_t* ids)
      : c_(static_cast<size_t>(ids[code]) + num_classes) {}
@@ -137,16 +122,16 @@ class SimpleCode : public Code {
 };
 template <typename T>
-class CustomCode : public Code {
+class CustomCode {
 public:
  CustomCode(const framework::Tensor& ptable, const framework::Tensor& pcode,
-             const int64_t* ids, int index)
+             const int64_t* ids, int index) {
-      : ids_(ids), index_(index) {
+    seq_len_ = ptable.dims()[1];
-    ptable_ = ptable.Slice(index, index + 1);
+    ptable_data_ = ptable.data<T>() + seq_len_ * index;
-    pcode_ = pcode.Slice(index, index + 1);
+    pcode_data_ = pcode.data<T>() + seq_len_ * index;
  }
  /**
-   * Here the id of root shoud be 1 rather than 0, thus the encoding of class c
+   * Here the id of root should be 1 rather than 0, thus the encoding of class c
   * is `c + num_classes` and all siblings can get the same weight indice using
   * prefixes.
   * Weight index is the prefixes of encoding, thus leave out the right most
@@ -154,36 +139,37 @@ class CustomCode : public Code {
   * Binary classification path is the suffixes of encoding, thus leave out the
   * left most bit in calc_bit.
   */
-  size_t calc_index(int bit) const { return ptable_.data<T>()[bit]; }
+  size_t calc_index(int bit) const { return ptable_data_[bit]; }
-  bool calc_bit(int bit) const { return pcode_.data<T>()[bit]; }
+  bool calc_bit(int bit) const { return pcode_data_[bit]; }
-  int get_length() const {
-    int length = 0;
-    for (int i = 0; i < static_cast<int>(ptable_.dims()[1]); i++) {
+  // NOTE: this function is not thread-safe.
-      if (ptable_.data<T>()[i] >= 0) {
+  int get_length() const {
-        length++;
+    if (length_ < 0) {
-      } else {
+      auto len = seq_len_;
-        return length;
+      length_ =
-      }
+          static_cast<int>(std::find_if(ptable_data_, ptable_data_ + len,
+                                        [](const T& val) { return val < 0; }) -
+                           ptable_data_);
    }
-    return length;
+    return length_;
  }
 private:
-  framework::Tensor ptable_;
+  int64_t seq_len_;
-  framework::Tensor pcode_;
+  const T* ptable_data_;
-  const int64_t* ids_;
+  const T* pcode_data_;
-  const int index_;
+  mutable int length_{-1};
 };
-class SimpleCodeTable : public CodeTable {
+class SimpleCodeTable {
 public:
  SimpleCodeTable(size_t num_classes, const int64_t* ids)
      : num_classes_(num_classes), ids_(ids) {}
-  std::unique_ptr<Code> get_code(int64_t code) const {
-    std::unique_ptr<Code> coder(new SimpleCode(code, num_classes_, ids_));
+  SimpleCode get_code(int64_t code) const {
-    return coder;
+    return SimpleCode(code, num_classes_, ids_);
  }
  size_t size() const { return num_classes_; }
  int get_max_code_length() const { return FindLastSet(num_classes_ - 1); }
@@ -193,15 +179,14 @@ class SimpleCodeTable : public CodeTable {
 };
 template <typename T>
-class CustomCodeTable : public CodeTable {
+class CustomCodeTable {
 public:
  CustomCodeTable(const framework::Tensor& ptable,
                  const framework::Tensor& pcode, const int64_t* ids)
      : ptable_(ptable), pcode_(pcode), ids_(ids) {}
-  std::unique_ptr<Code> get_code(int64_t code) const {
+  CustomCode<T> get_code(int64_t code) const {
-    std::unique_ptr<Code> coder(new CustomCode<T>(ptable_, pcode_, ids_, code));
+    return CustomCode<T>(ptable_, pcode_, ids_, code);
-    return coder;
  }
  size_t size() const { return static_cast<size_t>(ptable_.dims()[1]); }
@@ -215,19 +200,21 @@ class CustomCodeTable : public CodeTable {
  const int64_t* ids_;
 };
+using CodeTable = boost::variant<SimpleCodeTable, CustomCodeTable<int64_t>>;
 template <typename T>
 class MatrixBitCodeFunctor {
 public:
  MatrixBitCodeFunctor(size_t num_classes, const int64_t* ids)
      : num_classes_(num_classes),
        ids_(ids),
-        code_table_(new SimpleCodeTable(num_classes, ids)) {}
+        code_table_(SimpleCodeTable(num_classes, ids)) {}
  MatrixBitCodeFunctor(const framework::Tensor& ptable,
                       const framework::Tensor& pcode, const int64_t* ids)
      : num_classes_(static_cast<size_t>(ptable.dims()[1])),
        ids_(ids),
-        code_table_(new CustomCodeTable<int64_t>(ptable, pcode, ids)) {}
+        code_table_(CustomCodeTable<int64_t>(ptable, pcode, ids)) {}
  /* For j < code_length
       tmat(i, j) += vec(0, index(i, j))
  */
@@ -277,7 +264,7 @@ class MatrixBitCodeFunctor {
  size_t num_classes_;
  const int64_t* ids_;
-  std::unique_ptr<CodeTable> code_table_;
+  CodeTable code_table_;
 };
 }  // namespace math
 }  // namespace operators

--- a/paddle/fluid/operators/optimizers/adam_op.cc
+++ b/paddle/fluid/operators/optimizers/adam_op.cc
@@ -109,6 +109,11 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
                   "(float, default 1.0e-8) "
                   "Constant for numerical stability")
        .SetDefault(1.0e-8f);
+    AddAttr<bool>(
+        "lazy_mode",
+        "(bool, default false) "
+        "only update the parameter that has gradient in sparse update")
+        .SetDefault(false);
    AddComment(R"DOC(
 Adam Optimizer.

--- a/paddle/fluid/operators/optimizers/adam_op.h
+++ b/paddle/fluid/operators/optimizers/adam_op.h
@@ -177,12 +177,13 @@ struct SparseAdamFunctor {
  const int64_t* rows_;
  int64_t row_numel_;
  int64_t row_count_;
+  bool lazy_mode_;
  SparseAdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow,
                    const T* beta2_pow, const T* mom1, T* mom1_out,
                    const T* mom2, T* mom2_out, const T* lr, const T* grad,
                    const T* param, T* param_out, const int64_t* rows,
-                    int64_t row_numel, int64_t row_count)
+                    int64_t row_numel, int64_t row_count, bool lazy_mode)
      : beta1_(beta1),
        beta2_(beta2),
        epsilon_(epsilon),
@@ -198,13 +199,10 @@ struct SparseAdamFunctor {
        param_out_(param_out),
        rows_(rows),
        row_numel_(row_numel),
-        row_count_(row_count) {}
+        row_count_(row_count),
+        lazy_mode_(lazy_mode) {}
-  inline HOSTDEVICE void operator()(size_t i) const {
-    auto row_idx =
-        math::BinarySearch<int64_t>(rows_, row_count_, i / row_numel_);
-    T g = row_idx >= 0 ? grad_[row_idx * row_numel_ + i % row_numel_] : 0;
+  inline HOSTDEVICE void adam_update(size_t i, T g) const {
    // The following code is the same as dense
    T mom1 = moment1_[i];
    T mom2 = moment2_[i];
@@ -225,6 +223,17 @@ struct SparseAdamFunctor {
    moment2_out_[i] = mom2;
    param_out_[i] = p;
  }
+  // Per-element entry point. Searches rows_ for the row that owns element i;
+  // a non-negative result indexes the matching gradient row. Elements whose
+  // row is absent are skipped entirely in lazy mode, and otherwise updated
+  // with a zero gradient.
+  inline HOSTDEVICE void operator()(size_t i) const {
+    const auto found =
+        math::BinarySearch<int64_t>(rows_, row_count_, i / row_numel_);
+    const bool row_present = found >= 0;
+    if (!row_present && lazy_mode_) return;
+    T grad_value = 0;
+    if (row_present) {
+      grad_value = grad_[found * row_numel_ + i % row_numel_];
+    }
+    adam_update(i, grad_value);
+  }
 };
 template <typename DeviceContext, typename T>
@@ -240,6 +249,7 @@ class AdamOpKernel : public framework::OpKernel<T> {
    using paddle::framework::LoDTensor;
    using paddle::operators::detail::Ref;
+    bool lazy_mode = ctx.Attr<bool>("lazy_mode");
    T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
    T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
    T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
@@ -351,11 +361,23 @@ class AdamOpKernel : public framework::OpKernel<T> {
          mom2_out.template mutable_data<T>(ctx.GetPlace()),
          lr.template data<T>(), grad_data, param.template data<T>(),
          param_out.template mutable_data<T>(ctx.GetPlace()), rows, row_numel,
-          grad_merge.rows().size());
+          grad_merge.rows().size(), lazy_mode);
-      platform::ForRange<DeviceContext> for_range(
+      VLOG(3) << "lazy_mode :" << lazy_mode;
-          static_cast<const DeviceContext&>(ctx.device_context()),
+      if (lazy_mode && platform::is_cpu_place(ctx.GetPlace())) {
-          param.numel());
+        size_t row_count = grad_merge.rows().size();
-      for_range(functor);
+        std::vector<int64_t> cpu_rows(grad_merge.rows());
+        for (size_t row_index = 0; row_index < row_count; ++row_index) {
+          for (size_t offset = 0; offset < row_numel; ++offset) {
+            size_t i = cpu_rows[row_index] * row_numel + offset;
+            functor.adam_update(i, grad_data[row_index * row_numel + offset]);
+          }
+        }
+      } else {
+        platform::ForRange<DeviceContext> for_range(
+            static_cast<const DeviceContext&>(ctx.device_context()),
+            param.numel());
+        for_range(functor);
+      }
    } else {
      PADDLE_THROW("Variable type not supported by adam_op");
    }

--- a/paddle/fluid/operators/tensorrt/CMakeLists.txt
+++ b/paddle/fluid/operators/tensorrt/CMakeLists.txt
 op_library(tensorrt_engine_op DEPS tensorrt_engine tensorrt_converter)
-file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(tensorrt_engine);\n")
+file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(tensorrt_engine);\n")
 nv_test(test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc
  DEPS tensorrt_engine_op
  analysis)
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc
@@ -21,8 +21,6 @@
 namespace paddle {
-DEFINE_int32(tensorrt_engine_batch_size, 1, "the batch_size of TensorRT");
 namespace operators {
 class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -31,7 +29,6 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("Xs", "A list of inputs.").AsDuplicable();
    AddOutput("Ys", "A list of outputs").AsDuplicable();
    AddAttr<std::string>("subgraph", "the subgraph.");
-    AddAttr<std::string>("engine_uniq_key", "unique key for the TRT engine.");
    AddAttr<int>("max_batch_size", "the maximum batch size.");
    AddAttr<int>("workspace_size", "the workspace size.");
    AddComment("TensorRT engine operator.");
@@ -50,6 +47,6 @@ class TensorRTEngineInferVarType : public framework::VarTypeInference {
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(tensorrt_engine, ops::TensorRTEngineOp,
-                  ops::TensorRTEngineOpMaker, ops::TensorRTEngineOpMaker);
+                  ops::TensorRTEngineOpMaker);
 #endif  // PADDLE_WITH_CUDA
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cu.cc
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cu.cc
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/tensorrt/tensorrt_engine_op.h"
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    tensorrt_engine,
-    ops::TensorRTEngineKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::TensorRTEngineKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::TensorRTEngineKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::TensorRTEngineKernel<paddle::platform::CUDADeviceContext, int64_t>);
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
@@ -27,8 +27,6 @@
 namespace paddle {
-DECLARE_int32(tensorrt_engine_batch_size);
 namespace operators {
 using FluidDT = framework::proto::VarType_Type;
@@ -49,7 +47,7 @@ TRT_DT FluidDataType2TRT(FluidDT type) {
  return TRT_DT::kINT32;
 }
-nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t>& shape) {
+nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) {
  PADDLE_ENFORCE_GT(shape.size(), 1UL,
                    "TensorRT' tensor input requires at least 2 dimensions");
  PADDLE_ENFORCE_LE(shape.size(), 4UL,
@@ -63,128 +61,119 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t>& shape) {
 }  // namespace // NOLINT
 using inference::Singleton;
-using inference::tensorrt::TRT_EngineManager;
+using inference::tensorrt::TensorRTEngine;
+class TensorRTEngineOp : public framework::OperatorBase {
+ private:
+  std::vector<std::string> input_names_;
+  std::unordered_set<std::string> param_names_;
+  mutable std::unique_ptr<TensorRTEngine> trt_engine_;
+  int max_batch_size_;
+  int workspace_size_;
-class TensorRTEngineOp : public framework::OperatorWithKernel {
 public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
+  TensorRTEngineOp(const std::string &type,
+                   const framework::VariableNameMap &inputs,
+                   const framework::VariableNameMap &outputs,
+                   const framework::AttributeMap &attrs)
+      : framework::OperatorBase(type, inputs, outputs, attrs) {
+    input_names_ = Inputs("Xs");
+    max_batch_size_ = Attr<int>("max_batch_size");
+    workspace_size_ = Attr<int>("workspace_size");
+    auto params = Attr<std::vector<std::string>>("parameters");
+    for (const auto &param : params) {
+      param_names_.insert(param);
+    }
+  }
 protected:
-  void InferShape(framework::InferShapeContext* ctx) const override {}
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
-  framework::OpKernelType GetExpectedKernelType(
+    RunTrt(scope, dev_place);
-      const framework::ExecutionContext& ctx) const override {
-    auto input0 = ctx.Inputs("Xs").front();
-    framework::OpKernelType kt = framework::OpKernelType(
-        ctx.scope().FindVar(input0)->GetMutable<framework::LoDTensor>()->type(),
-        ctx.GetPlace());
-    return kt;
  }
-};
-template <typename DeviceContext, typename T>
+  void RunTrt(const framework::Scope &scope,
-class TensorRTEngineKernel : public framework::OpKernel<T> {
+              const platform::Place &dev_place) const {
- public:
+    int runtime_batch = 1;
-  void Compute(const framework::ExecutionContext& context) const override {
+    if (trt_engine_.get() == nullptr) {
-    auto engine_name = context.Attr<std::string>("engine_uniq_key");
+      trt_engine_.reset(new TensorRTEngine(
-    int max_batch_size = context.Attr<int>("max_batch_size");
+          max_batch_size_, workspace_size_, nullptr,
-    if (!Singleton<TRT_EngineManager>::Global().HasEngine(engine_name)) {
+          boost::get<platform::CUDAPlace>(dev_place).device));
-      Prepare(context);
+      Prepare(scope, dev_place, trt_engine_.get());
    }
-    auto* engine = Singleton<TRT_EngineManager>::Global().Get(engine_name);
-    auto input_names = context.op().Inputs("Xs");
+    auto *engine = trt_engine_.get();
-    PADDLE_ENFORCE(!input_names.empty(), "should pass more than one inputs");
+    PADDLE_ENFORCE(!input_names_.empty(), "should pass more than one inputs");
-    PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size, max_batch_size);
    std::vector<std::string> output_maps =
-        context.Attr<std::vector<std::string>>("output_name_mapping");
+        Attr<std::vector<std::string>>("output_name_mapping");
-    auto params = context.Attr<std::vector<std::string>>("parameters");
-    std::unordered_set<std::string> parameters;
-    for (const auto& param : params) {
-      parameters.insert(param);
-    }
    // Convert input tensor from fluid to engine.
-    for (const auto& x : context.Inputs("Xs")) {
+    for (const auto &x : Inputs("Xs")) {
-      if (parameters.count(x)) continue;
+      if (param_names_.count(x)) continue;
      // convert input and copy to TRT engine's buffer
-      auto& t = inference::analysis::GetFromScope<framework::LoDTensor>(
+      auto &t =
-          context.scope(), x);
+          inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
+      auto t_shape = framework::vectorize(t.dims());
+      runtime_batch = t_shape[0];
      if (platform::is_cpu_place(t.place())) {
-        engine->SetInputFromCPU(x, static_cast<const void*>(t.data<void>()),
+        engine->SetInputFromCPU(x, static_cast<const void *>(t.data<void>()),
                                t.memory_size());
      } else {
-        engine->SetInputFromGPU(x, static_cast<const void*>(t.data<void>()),
+        engine->SetInputFromGPU(x, static_cast<const void *>(t.data<void>()),
                                t.memory_size());
      }
    }
+    PADDLE_ENFORCE_LE(runtime_batch, max_batch_size_);
    // Execute the engine.
-    PADDLE_ENFORCE_GT(FLAGS_tensorrt_engine_batch_size, 0);
+    engine->Execute(runtime_batch);
-    engine->Execute(FLAGS_tensorrt_engine_batch_size);
    // Convert output tensor from engine to fluid
    int output_index = 0;
    VLOG(4) << "TensorRT Engine Op Outputs:";
-    for (const auto& y : context.Outputs("Ys")) {
+    for (const auto &y : Outputs("Ys")) {
      VLOG(4) << y;
      // convert output and copy to fluid.
-      nvinfer1::ITensor* trt_t = engine->GetITensor(output_maps[output_index]);
+      nvinfer1::ITensor *trt_t = engine->GetITensor(output_maps[output_index]);
      auto dims = trt_t->getDimensions();
      // Use the output ITensor's dims to reshape the Fluid Tensor.
      // The ITensor doesn't contain the batch size dim.
      std::vector<int> ddim;
-      ddim.push_back(FLAGS_tensorrt_engine_batch_size);
+      ddim.push_back(runtime_batch);
      for (int i = 0; i < dims.nbDims; i++) {
        ddim.push_back(dims.d[i]);
      }
-      auto* fluid_v = context.scope().FindVar(y);
+      auto *fluid_v = scope.FindVar(y);
      PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y);
-      auto* fluid_t = fluid_v->GetMutable<framework::LoDTensor>();
+      auto *fluid_t = fluid_v->GetMutable<framework::LoDTensor>();
      fluid_t->Resize(framework::make_ddim(ddim));
-      // TODO(Superjomn) find some way to determine which device to output the
-      // tensor.
-      // if (platform::is_cpu_place(fluid_t->place())) {
      // TODO(Superjomn) change this float to dtype size.
-      auto size = inference::analysis::AccuDims(dims.d, dims.nbDims) *
+      auto size =
-                  FLAGS_tensorrt_engine_batch_size;
+          inference::analysis::AccuDims(dims.d, dims.nbDims) * runtime_batch;
      engine->GetOutputInGPU(
          output_maps[output_index],
          fluid_t->mutable_data<float>(platform::CUDAPlace(
-              boost::get<platform::CUDAPlace>(context.GetPlace()).device)),
+              boost::get<platform::CUDAPlace>(dev_place).device)),
          size * sizeof(float));
      output_index += 1;
    }
    cudaStreamSynchronize(*engine->stream());
  }
- protected:
+  void Prepare(const framework::Scope &scope, const platform::Place &dev_place,
-  void Prepare(const framework::ExecutionContext& context) const {
+               TensorRTEngine *engine) const {
    VLOG(4) << "Prepare engine";
-    // Get the ProgramDesc and pass to convert.
    framework::proto::BlockDesc block_desc;
-    block_desc.ParseFromString(context.Attr<std::string>("subgraph"));
+    block_desc.ParseFromString(Attr<std::string>("subgraph"));
-    int max_batch_size = context.Attr<int>("max_batch_size");
-    int workspace_size = context.Attr<int>("workspace_size");
-    auto params = context.Attr<std::vector<std::string>>("parameters");
-    std::unordered_set<std::string> parameters;
-    for (const auto& param : params) {
-      parameters.insert(param);
-    }
    std::vector<std::string> output_maps =
-        context.Attr<std::vector<std::string>>("output_name_mapping");
+        Attr<std::vector<std::string>>("output_name_mapping");
-    // TODO(Superjomn) replace this with a different stream
-    auto* engine = Singleton<TRT_EngineManager>::Global().Create(
-        max_batch_size, workspace_size, nullptr /*engine hold its own stream*/,
-        context.Attr<std::string>("engine_uniq_key"),
-        boost::get<platform::CUDAPlace>(context.GetPlace()).device);
    engine->InitNetwork();
@@ -192,39 +181,33 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
    VLOG(4) << "parsed var size " << block.AllVars().size();
    // Add inputs
    VLOG(4) << "declare inputs";
-    for (auto& input : context.Inputs("Xs")) {
+    for (auto &input : Inputs("Xs")) {
-      if (parameters.count(input)) continue;
+      if (param_names_.count(input)) continue;
      VLOG(4) << "declare input " << input;
-      auto* var = block.FindVar(input);
+      auto &t =
+          inference::analysis::GetFromScope<framework::LoDTensor>(scope, input);
+      auto t_shape = framework::vectorize(t.dims());
+      auto *var = block.FindVar(input);
      // TensorRT engine need to create parameters. The parameter's description
      // should be set in
      PADDLE_ENFORCE(var, "no variable called %s", input);
      PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
                        "TensorRT engine only takes LoDTensor as input");
-      auto shape = var->GetShape();
-      // For the special batch_size placeholder -1, drop it and pass the real
-      // shape of data.
-      // TODO(Superjomn) fix this with batch broadcast, or it can't handle
-      // variational batch size.
-      if (shape[0] == -1) {
-        shape[0] = FLAGS_tensorrt_engine_batch_size;
-      }
      engine->DeclareInput(
          input, FluidDataType2TRT(
                     var->Proto()->type().lod_tensor().tensor().data_type()),
-          Vec2TRT_Dims(shape));
+          Vec2TRT_Dims(t_shape));
    }
    inference::Singleton<inference::tensorrt::OpConverter>::Global()
-        .ConvertBlock(block_desc, parameters, context.scope(), engine);
+        .ConvertBlock(block_desc, param_names_, scope, engine);
    // Add outputs
-    for (auto& output : output_maps) {
+    for (auto &output : output_maps) {
-      if (!engine->HasDeclared(output)) {
+      engine->DeclareOutput(output);
-        engine->DeclareOutput(output);
-      }
    }
    engine->FreezeNetwork();
  }
 };

--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
@@ -24,8 +24,7 @@ limitations under the License. */
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
-USE_CUDA_ONLY_OP(tensorrt_engine);
+USE_NO_KERNEL_OP(tensorrt_engine);
 namespace paddle {
 namespace operators {

--- a/paddle/fluid/operators/transpose_mkldnn_op.cc
+++ b/paddle/fluid/operators/transpose_mkldnn_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include "paddle/fluid/framework/data_layout_transform.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/memory/malloc.h"
+#include "paddle/fluid/platform/mkldnn_reuse.h"
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+using framework::DataLayout;
+// MKL-DNN kernel registered for both the transpose and transpose2 ops. The
+// permutation is expressed as two blocked memory descriptors over the same
+// logical dims (one for the source, one whose strides follow `axis`) and is
+// carried out by an mkldnn::reorder. Inference only: Compute enforces
+// is_test == true.
+template <typename T>
+class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
+ public:
+  // Reads "X", "axis" and "is_test" from ctx and writes the permuted tensor
+  // to "Out". Fails the enforce checks when not on CPUPlace or when is_test
+  // is false.
+  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
+                   "It must use CPUPlace.");
+    const bool is_test = ctx.Attr<bool>("is_test");
+    PADDLE_ENFORCE(
+        is_test == true,
+        "TransposeMKLDNN works only for inference! Set is_test = True");
+    auto& dev_ctx =
+        ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
+    const auto& mkldnn_engine = dev_ctx.GetEngine();
+    std::vector<int> axis = ctx.Attr<std::vector<int>>("axis");
+    int ndims = axis.size();
+    auto* input = ctx.Input<Tensor>("X");
+    auto* output = ctx.Output<Tensor>("Out");
+    const T* input_data = input->data<T>();
+    // With a single axis there is nothing to permute; alias the input buffer.
+    if (ndims == 1) {
+      output->ShareDataWith(*input);
+      return;
+    }
+    // Identity permutation 0..ndims-1, used to describe an untransposed source.
+    std::vector<int> nchw_axis(ndims, 0);
+    for (size_t i = 0; i < nchw_axis.size(); ++i) {
+      nchw_axis[i] = i;
+    }
+    std::vector<int> nchw_tz = paddle::framework::vectorize2int(input->dims());
+    // Keep the input's own format when it is not plain nchw; otherwise
+    // describe the source with the identity blocked layout.
+    auto src_md =
+        input->format() != mkldnn::memory::format::nchw
+            ? platform::MKLDNNMemDesc(nchw_tz, platform::MKLDNNGetDataType<T>(),
+                                      input->format())
+            : Axis2MemoryDesc(nchw_tz, nchw_axis);
+    this->TransposeKernel(ctx.GetPlace(), Axis2MemoryDesc(nchw_tz, axis),
+                          src_md, output, input_data, nchw_tz, mkldnn_engine);
+  }
+
+ protected:
+  // Builds a blocked memory descriptor with logical dims nchw_tz whose strides
+  // are assigned in the order given by axis.
+  // NOTE(review): data_type is hard-coded to f32 although the class is
+  // templated on T; the kernel is only registered for float in this file.
+  mkldnn::memory::desc Axis2MemoryDesc(const std::vector<int>& nchw_tz,
+                                       const std::vector<int>& axis) const {
+    mkldnn_memory_desc_t mem_fmt;
+    mem_fmt.primitive_kind = mkldnn_memory;
+    mem_fmt.ndims = axis.size();
+    for (size_t i = 0; i < nchw_tz.size(); ++i) {
+      mem_fmt.dims[i] = nchw_tz[i];  // logical dimensions (nchw format,
+                                     // regardless physical layout)
+    }
+    mem_fmt.data_type = mkldnn_f32;
+    mem_fmt.format = mkldnn_blocked;
+    unsigned int total_stride = 1;
+    // Walk i from last to first: dimension axis[i] receives the running
+    // stride, which is then multiplied by that dimension's extent.
+    for (int i = nchw_tz.size() - 1; i >= 0; --i) {
+      mem_fmt.layout_desc.blocking.padding_dims[i] =
+          nchw_tz[i];  // logical dimensions (nchw format, regardless physical
+                       // layout)
+      mem_fmt.layout_desc.blocking.block_dims[i] = 1;
+      mem_fmt.layout_desc.blocking.offset_padding_to_data[i] = 0;  // no offset
+      mem_fmt.layout_desc.blocking.strides[0][axis[i]] = total_stride;
+      mem_fmt.layout_desc.blocking.strides[1][axis[i]] = 1;
+      total_stride *= nchw_tz[axis[i]];
+    }
+    mem_fmt.layout_desc.blocking.offset_padding = 0;  // no initial offset
+    return mem_fmt;
+  }
+
+  // Allocates the output buffer sized for md_o and runs an eager reorder from
+  // data_i (described by md_i) into it. nchw_dims is currently unused.
+  void TransposeKernel(platform::Place place, mkldnn::memory::desc md_o,
+                       mkldnn::memory::desc md_i, Tensor* output,
+                       const T* data_i, const std::vector<int>& nchw_dims,
+                       const mkldnn::engine& eng) const {
+    // Make Memory primitive descriptors
+    auto mpd_o = mkldnn::memory::primitive_desc(md_o, eng);
+    auto mpd_i = mkldnn::memory::primitive_desc(md_i, eng);
+    auto data_o = output->mutable_data<T>(
+        place, paddle::memory::Allocator::kDefault, mpd_o.get_size());
+    // mkldnn::memory takes a mutable handle even for a source that is only
+    // read, so constness is dropped explicitly here.
+    auto src = mkldnn::memory(mpd_i, const_cast<T*>(data_i));
+    auto dst = mkldnn::memory(mpd_o, data_o);
+    auto r = mkldnn::reorder(src, dst);
+    mkldnn::stream(mkldnn::stream::kind::eager).submit({r}).wait();
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+REGISTER_OP_KERNEL(transpose2, MKLDNN, ::paddle::platform::CPUPlace,
+                   ops::TransposeMKLDNNOpKernel<float>);
+REGISTER_OP_KERNEL(transpose, MKLDNN, ::paddle::platform::CPUPlace,
+                   ops::TransposeMKLDNNOpKernel<float>);
--- a/paddle/fluid/operators/transpose_op.cc
+++ b/paddle/fluid/operators/transpose_op.cc
@@ -16,6 +16,10 @@ limitations under the License. */
 #include <string>
 #include <vector>
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
 namespace paddle {
 namespace operators {
@@ -53,11 +57,32 @@ class TransposeOp : public framework::OperatorWithKernel {
    }
    ctx->SetOutputDim("Out", out_dims);
  }
+ protected:
+  // Picks the kernel for this op. Starts from the plain library with the
+  // layout parsed from the "data_format" attribute; when built with
+  // PADDLE_WITH_MKLDNN and CanMKLDNNBeUsed(ctx) holds, both library and
+  // layout are switched to MKLDNN.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    framework::LibraryType library = framework::LibraryType::kPlain;
+    const std::string format_name = ctx.Attr<std::string>("data_format");
+    framework::DataLayout layout = framework::StringToDataLayout(format_name);
+#ifdef PADDLE_WITH_MKLDNN
+    // library is still kPlain here, so only MKLDNN availability matters.
+    if (platform::CanMKLDNNBeUsed(ctx)) {
+      library = framework::LibraryType::kMKLDNN;
+      layout = framework::DataLayout::kMKLDNN;
+    }
+#endif
+    const auto data_type = ctx.Input<Tensor>("X")->type();
+    return framework::OpKernelType(data_type, ctx.GetPlace(), layout, library);
+  }
 };
 class TransposeOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
+    AddAttr<bool>("is_test",
+                  "(bool, default false) Set to true for inference only, false "
+                  "for training. Some layers may run faster when this is true.")
+        .SetDefault(false);
    AddInput(
        "X",
        "(Tensor) The input tensor, tensors with rank up to 6 are supported.");
@@ -67,6 +92,16 @@ class TransposeOpMaker : public framework::OpProtoAndCheckerMaker {
        "(vector<int>) A list of values, and the size of the list should be "
        "the same with the input tensor rank. This operator permutes the input "
        "tensor's axes according to the values given.");
+    AddAttr<bool>("use_mkldnn",
+                  "(bool, default false) Only used in mkldnn kernel")
+        .SetDefault(false);
+    AddAttr<std::string>(
+        "data_format",
+        "(string, default NCHW) Only used in "
+        "An optional string from: \"NHWC\", \"NCHW\". "
+        "Defaults to \"NHWC\". Specify the data format of the output data, "
+        "the input will be transformed automatically. ")
+        .SetDefault("AnyLayout");
    AddComment(R"DOC(
 Transpose Operator.
@@ -144,8 +179,18 @@ class Transpose2Op : public TransposeOp {
 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
+    // Start from the plain library and the layout named by "data_format";
+    // switch both to MKLDNN when the build has PADDLE_WITH_MKLDNN and
+    // CanMKLDNNBeUsed(ctx) holds.
+    framework::LibraryType library_{framework::LibraryType::kPlain};
-                                   ctx.device_context());
+    std::string data_format = ctx.Attr<std::string>("data_format");
+    framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
+#ifdef PADDLE_WITH_MKLDNN
+    if (library_ == framework::LibraryType::kPlain &&
+        platform::CanMKLDNNBeUsed(ctx)) {
+      library_ = framework::LibraryType::kMKLDNN;
+      layout_ = framework::DataLayout::kMKLDNN;
+    }
+#endif
+    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
+                                   ctx.GetPlace(), layout_, library_);
  }
 };

--- a/paddle/fluid/platform/dynload/CMakeLists.txt
+++ b/paddle/fluid/platform/dynload/CMakeLists.txt
@@ -16,9 +16,7 @@ if (CUPTI_FOUND)
    list(APPEND CUDA_SRCS cupti.cc)
 endif(CUPTI_FOUND)
 nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader)
-if (NOT WIN32)
 cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
-endif(NOT WIN32)
 if (WITH_MKLML)
    cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml)
 endif()

--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
@@ -34,7 +34,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
 #define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name)                            \
  struct DynLoad__##__name {                                               \
    template <typename... Args>                                            \
-    auto operator()(Args... args) -> decltype(__name(args...)) {           \
+    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {       \
      using cudnn_func = decltype(&::__name);                              \
      std::call_once(cudnn_dso_flag, []() {                                \
        cudnn_dso_handle = paddle::platform::dynload::GetCUDNNDsoHandle(); \

--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
@@ -201,6 +201,8 @@ void* GetCurandDsoHandle() {
 void* GetWarpCTCDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
  return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.dylib");
+#elif defined(_WIN32)
+  return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "warpctc.dll");
 #else
  return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.so");
 #endif

--- a/paddle/fluid/platform/dynload/dynamic_loader.h
+++ b/paddle/fluid/platform/dynload/dynamic_loader.h
@@ -18,6 +18,12 @@ namespace paddle {
 namespace platform {
 namespace dynload {
+// DECLARE_TYPE(name, args...) yields the trailing return type used by the
+// DynLoad__<name>::operator() wrappers in the dynload headers.
+// Outside Windows it is the exact type of the call expression name(args...).
+// On Windows the type is deduced from the function body via decltype(auto)
+// (a C++14 feature) -- NOTE(review): presumably a workaround for an MSVC
+// limitation with decltype in this position; confirm before relying on it.
+#ifndef _WIN32
+#define DECLARE_TYPE(__name, ...) decltype(__name(__VA_ARGS__))
+#else
+#define DECLARE_TYPE(__name, ...) decltype(auto)
+#endif
 void* GetCublasDsoHandle();
 void* GetCUDNNDsoHandle();
 void* GetCUPTIDsoHandle();

--- a/paddle/fluid/platform/dynload/mklml.h
+++ b/paddle/fluid/platform/dynload/mklml.h
@@ -34,7 +34,7 @@ extern void* mklml_dso_handle;
 #define DYNAMIC_LOAD_MKLML_WRAP(__name)                                    \
  struct DynLoad__##__name {                                               \
    template <typename... Args>                                            \
-    auto operator()(Args... args) -> decltype(__name(args...)) {           \
+    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {       \
      using mklmlFunc = decltype(&::__name);                               \
      std::call_once(mklml_dso_flag, []() {                                \
        mklml_dso_handle = paddle::platform::dynload::GetMKLMLDsoHandle(); \
@@ -82,6 +82,8 @@ extern void* mklml_dso_handle;
  __macro(vdSqr);                   \
  __macro(vsPowx);                  \
  __macro(vdPowx);                  \
+  __macro(vsInv);                   \
+  __macro(vdInv);                   \
  __macro(MKL_Set_Num_Threads)
 MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP);

--- a/paddle/fluid/platform/dynload/tensorrt.h
+++ b/paddle/fluid/platform/dynload/tensorrt.h
@@ -33,7 +33,7 @@ extern void* tensorrt_dso_handle;
 #define DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP(__name)                      \
  struct DynLoad__##__name {                                            \
    template <typename... Args>                                         \
-    auto operator()(Args... args) -> decltype(__name(args...)) {        \
+    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {    \
      using tensorrt_func = decltype(__name(args...)) (*)(Args...);     \
      std::call_once(tensorrt_dso_flag, []() {                          \
        tensorrt_dso_handle =                                           \

--- a/paddle/fluid/platform/dynload/warpctc.h
+++ b/paddle/fluid/platform/dynload/warpctc.h
@@ -34,7 +34,7 @@ extern void* warpctc_dso_handle;
 #define DYNAMIC_LOAD_WARPCTC_WRAP(__name)                                      \
  struct DynLoad__##__name {                                                   \
    template <typename... Args>                                                \
-    auto operator()(Args... args) -> decltype(__name(args...)) {               \
+    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {           \
      using warpctcFunc = decltype(&::__name);                                 \
      std::call_once(warpctc_dso_flag, []() {                                  \
        warpctc_dso_handle = paddle::platform::dynload::GetWarpCTCDsoHandle(); \

--- a/paddle/fluid/platform/port.h
+++ b/paddle/fluid/platform/port.h
@@ -55,7 +55,6 @@ static void *dlsym(void *handle, const char *symbol_name) {
 static void *dlopen(const char *filename, int flag) {
  std::string file_name(filename);
-  file_name.replace(0, file_name.size() - 1, '/', '\\');
  HMODULE hModule = LoadLibrary(file_name.c_str());
  if (!hModule) {
    throw std::runtime_error(file_name + " not found.");

--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -960,6 +960,14 @@ All parameter, weight, gradient are variables in Paddle.
          R"DOC(The type is BOOL, fuse_elewise_add_act_ops indicate whether
                     to fuse elementwise_add_op and activation_op,
                     it may make the execution faster. Default False)DOC")
+      .def_property(
+          "memory_optimize",
+          [](const BuildStrategy &self) { return self.memory_optimize_; },
+          [](BuildStrategy &self, bool b) { self.memory_optimize_ = b; })
+      .def_property(
+          "memory_early_delete",
+          [](const BuildStrategy &self) { return self.memory_early_delete_; },
+          [](BuildStrategy &self, bool b) { self.memory_early_delete_ = b; })
      .def("_finalize_strategy_and_create_passes",
           [](BuildStrategy &self) -> std::shared_ptr<ir::PassBuilder> {
             return self.CreatePassesFromStrategy(true);

--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -102,6 +102,13 @@ def __bootstrap__():
    import sys
    import os
    import platform
+    if os.name == 'nt':
+        third_lib_path = os.path.abspath(os.path.dirname(
+            __file__)) + os.sep + '..' + os.sep + 'libs'
+        os.environ['path'] += ';' + third_lib_path
+        sys.path.append(third_lib_path)
    from . import core
    in_test = 'unittest' in sys.modules
@@ -128,13 +135,12 @@ def __bootstrap__():
        'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size",
        'eager_delete_tensor_gb', 'fast_eager_deletion_mode',
        'allocator_strategy', 'reader_queue_speed_test_mode',
-        'print_sub_graph_dir', 'pe_profile_fname'
+        'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir'
    ]
    if 'Darwin' not in sysstr:
        read_env_flags.append('use_pinned_memory')
    if os.name != 'nt':
-        read_env_flags.append('warpctc_dir')
        read_env_flags.append('cpu_deterministic')
    if core.is_compiled_with_dist():
@@ -150,7 +156,7 @@ def __bootstrap__():
        read_env_flags += [
            'fraction_of_gpu_memory_to_use', 'cudnn_deterministic',
            'enable_cublas_tensor_op_math', 'conv_workspace_size_limit',
-            'cudnn_exhaustive_search', 'selected_gpus'
+            'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus'
        ]
    core.init_gflags([sys.argv[0]] +

--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
@@ -249,69 +249,6 @@ def serialize_op_decs(op_desc):
    return proto.__str__()
-def _callback_lookup_(op):
-    """
-    Only used in _append_backward_ops_
-    Build and returns a callback function for certain op. For example
-    parallel_do:           AllReduce
-    :param op:
-    :return: callback function
-    """
-    if op.type == 'parallel_do' and op.attr('use_nccl'):
-        all_vars = op.block.vars
-        param_names = set(op.input('parameters'))
-        param_names = [
-            name for name in param_names
-            if all_vars[name].stop_gradient is False
-        ]
-        param_grad_names = [n + "@GRAD" for n in param_names]
-        class ParallelDoCallBack(object):
-            def __init__(self, param_grad_names, parallel_scopes_name):
-                self.has_inserted_nccl_init = False
-                self.param_grad_names = param_grad_names
-                self.parallel_scopes_name = parallel_scopes_name
-            def __call__(self, block, context):
-                if not self.has_inserted_nccl_init:
-                    op_desc = _create_op_desc_(
-                        "ncclInit",
-                        {"parallel_scopes": self.parallel_scopes_name},
-                        {"Communicator": ['nccl_com__do_not_change_']}, {})
-                    block.program.global_block().desc.append_op().copy_from(
-                        op_desc)
-                    self.has_inserted_nccl_init = True
-                current_op_desc = context["__current_op_desc__"]
-                for o_param in current_op_desc.output_names():
-                    for o_argu in current_op_desc.output(o_param):
-                        if o_argu in self.param_grad_names:
-                            allreduce_out_name = o_argu + "__nccl_all_reduce__"
-                            op_desc = _create_op_desc_(
-                                "ncclReduce",
-                                {
-                                    "X": [o_argu],
-                                    "Communicator":
-                                    ['nccl_com__do_not_change_']
-                                },
-                                {"Out": [allreduce_out_name]},
-                                {"reduction": "ncclSum",
-                                 "root": 0}, )
-                            block.desc.append_op().copy_from(op_desc)
-                            op_desc = _create_op_desc_(
-                                "assign", {"X": [allreduce_out_name]},
-                                {"Out": [o_argu]}, {})
-                            block.desc.append_op().copy_from(op_desc)
-        return ParallelDoCallBack(param_grad_names,
-                                  op.output("parallel_scopes"))
-    else:
-        return None
 def _append_backward_ops_(block,
                          ops,
                          target_block,
@@ -349,17 +286,8 @@ def _append_backward_ops_(block,
            sub_block = program.block(op._block_attr_id("sub_block"))
            grad_sub_block = program._create_block()
            grad_sub_block._set_forward_block_idx(sub_block.idx)
-            cb = _callback_lookup_(op)
+            _append_backward_ops_(sub_block, sub_block.ops, grad_sub_block,
-            if cb is not None:
+                                  no_grad_dict, grad_to_var, callbacks)
-                if callbacks is None:
-                    new_callbacks = [cb]
-                else:
-                    new_callbacks = callbacks + [_callback_lookup_(op)]
-                _append_backward_ops_(sub_block, sub_block.ops, grad_sub_block,
-                                      no_grad_dict, grad_to_var, new_callbacks)
-            else:
-                _append_backward_ops_(sub_block, sub_block.ops, grad_sub_block,
-                                      no_grad_dict, grad_to_var, callbacks)
            program._rollback()
            grad_sub_block_list.append(grad_sub_block.desc)
@@ -424,9 +352,6 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
        # infer_shape and infer_type
        op_desc.infer_var_type(block.desc)
        op_desc.infer_shape(block.desc)
-        # ncclInit dones't need to set data_type
-        if op_desc.type() == 'ncclInit':
-            continue
        for arg in op_desc.output_arg_names():
            if arg in new_vars:
                _infer_var_data_type_(arg, block)

--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -16,6 +16,7 @@ from __future__ import print_function
 import collections
 import contextlib
+import os
 import re
 import six
 import sys
@@ -27,11 +28,18 @@ from .proto import framework_pb2
 try:
    from . import core
 except ImportError as e:
-    raise ImportError(
+    if os.name == 'nt':
-        """NOTE: You may need to run \"export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH\"
+        raise ImportError(
-    if you encounters \"libmkldnn.so not found\" errors. If you have python
+            """NOTE: You may need to run \"set PATH=c:\python27\lib:%PATH%\"
-    installed in other directory, replace \"/usr/local/lib\" with your own
+        if you encounters \"mkldnn.dll not found\" errors. If you have python
-    directory. The original error is: \n""" + cpt.get_exception_message(e))
+        installed in other directory, replace \"c:\python27\lib" with your own
+        directory. The original error is: \n""" + cpt.get_exception_message(e))
+    else:
+        raise ImportError(
+            """NOTE: You may need to run \"export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH\"
+        if you encounters \"libmkldnn.so not found\" errors. If you have python
+        installed in other directory, replace \"/usr/local/lib\" with your own
+        directory. The original error is: \n""" + cpt.get_exception_message(e))
 except Exception as e:
    raise e
 from . import unique_name
@@ -563,8 +571,8 @@ class Operator(object):
    OP_WITHOUT_KERNEL_SET = {
        'feed', 'fetch', 'save', 'load', 'recurrent', 'go',
        'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv',
-        'listen_and_serv', 'parallel_do', 'save_combine', 'load_combine',
+        'listen_and_serv', 'save_combine', 'load_combine', 'ncclInit', 'select',
-        'ncclInit', 'select', 'checkpoint_notify', 'gen_nccl_id'
+        'checkpoint_notify', 'gen_nccl_id'
    }
    def __init__(self,

--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -226,156 +226,6 @@ class BlockGuard(object):
        return True
-class ParallelDo(object):
-    """
-    ParallelDo is used to represent multi-thread data parallel processing.
-    Its vanilla implementation can be shown as the following (:math:`|` means
-    single thread and :math:`||||` means multiple threads)
-    .. code-block:: text
-      In the forward pass
-        |      Split input onto different devices
-        |      Copy parameter onto different devices
-        ||||   Compute forward pass in parallel
-        |      Merge output from different devices
-      In the backward pass
-        |      Split output@grad onto different devices
-        ||||   Compute backward pass in parallel
-        |      accumulate param@grad from different devices to the first device
-        |      Merge input@grad from different devices
-        |      Copy param@grad to the place of parallel_do_op
-    Examples:
-    .. code-block:: python
-      images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
-      label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-      # ParallelDo version & Single-thread version
-      if thread_num > 1:
-          places = fluid.layers.get_places(thread_num)
-          pd = fluid.layers.control_flow.ParallelDo(places)
-          with pd.do():
-              images = pd.read_input(images)
-              label = pd.read_input(label)
-              predict = cnn_model(images)
-              cost = fluid.layers.cross_entropy(input=predict, label=label)
-              avg_cost = fluid.layers.mean(x=cost)
-              pd.write_output(avg_cost)
-          avg_cost = pd()
-          avg_cost = fluid.layers.mean(avg_cost)
-      else:
-          predict = cnn_model(images)
-          cost = fluid.layers.cross_entropy(input=predict, label=label)
-          avg_cost = fluid.layers.mean(x=cost)
-    .. warning::
-       It will be soon deprecated, please use ParallelExecutor instead.
-    """
-    def __init__(self, places, use_nccl=False, name=None):
-        warnings.warn(
-            "API ParallelDo is deprecated since 0.15.0. Please use ParallelExecutor instead.",
-            Warning)
-        self.helper = LayerHelper("parallel_do", name=name)
-        self.inputs = []
-        self.places = places
-        self.outputs = []
-        self.status = StaticRNN.BEFORE_RNN_BLOCK
-        self.use_nccl = use_nccl
-    def do(self):
-        return BlockGuardWithCompletion(self)
-    def parent_block(self):
-        prog = self.helper.main_program
-        parent_idx = prog.current_block().parent_idx
-        assert parent_idx >= 0
-        parent_block = prog.block(parent_idx)
-        return parent_block
-    def __call__(self, *args, **kwargs):
-        if self.status != StaticRNN.AFTER_RNN_BLOCK:
-            raise ValueError("RNN output can only be retrieved after rnn block")
-        if len(self.outputs) == 0:
-            raise ValueError("RNN has no output")
-        elif len(self.outputs) == 1:
-            return self.outputs[0]
-        else:
-            return self.outputs
-    def read_input(self, var):
-        self.inputs.append(var)
-        return var
-    def write_output(self, var):
-        self.outputs.append(var)
-    def get_parameters(self):
-        main_program = self.helper.main_program
-        current_block = main_program.current_block()
-        parent_block = self.parent_block()
-        local_inputs = set()
-        params = list()
-        for var in self.inputs:
-            local_inputs.add(var.name)
-        for op in current_block.ops:
-            for iname in op.input_names:
-                for in_var_name in op.input(iname):
-                    if in_var_name not in local_inputs:
-                        params.append(in_var_name)
-            for oname in op.output_names:
-                for out_var_name in op.output(oname):
-                    local_inputs.add(out_var_name)
-        params = list(set(params))
-        return [parent_block.var(name) for name in params]
-    def _complete_op(self):
-        main_program = self.helper.main_program
-        current_block = main_program.current_block()
-        parent_block = self.parent_block()
-        step_scope = parent_block.create_var(
-            type=core.VarDesc.VarType.STEP_SCOPES)
-        self.outputs = [
-            parent_block.create_var(
-                name=o.name,
-                shape=o.shape,
-                dtype=o.dtype,
-                lod_level=o.lod_level,
-                persistable=o.persistable,
-                stop_gradient=o.stop_gradient) for o in self.outputs
-        ]
-        inputs = [parent_block.var(i.name) for i in self.inputs]
-        outputs = [parent_block.var(o.name) for o in self.outputs]
-        parent_block.append_op(
-            type='parallel_do',
-            inputs={
-                'inputs': inputs,
-                'parameters': self.get_parameters(),
-                'places': self.places
-            },
-            outputs={'outputs': outputs,
-                     'parallel_scopes': [step_scope]},
-            attrs={'sub_block': current_block,
-                   'use_nccl': self.use_nccl})
 class BlockGuardWithCompletion(BlockGuard):
    """
    BlockGuardWithCompletion class.
@@ -384,9 +234,8 @@ class BlockGuardWithCompletion(BlockGuard):
    """
    def __init__(self, rnn):
-        if not (isinstance(rnn, StaticRNN) or isinstance(rnn, ParallelDo)):
+        if not isinstance(rnn, StaticRNN):
-            raise TypeError(
+            raise TypeError("BlockGuardWithCompletion takes a StaticRNN")
-                "BlockGuardWithCompletion takes a StaticRNN or ParallelDo")
        super(BlockGuardWithCompletion, self).__init__(rnn.helper.main_program)
        self.rnn = rnn

--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -177,6 +177,7 @@ __all__ = [
    'lstm',
    'psroi_pool',
    'teacher_student_sigmoid_loss',
+    'huber_loss',
 ]
 kIgnoreIndex = -100
@@ -498,7 +499,7 @@ def lstm(input,
    If Device is GPU, This op will use cudnn LSTM implementation
    A four-gate Long Short-Term Memory network with no peephole connections.
-    In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1, 
+    In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1,
    the cell input ct-1 and the previous layer input xt given matrices W, R and biases bW, bR from the following equations:
    $$ i_t = \\sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i) $$
@@ -525,19 +526,19 @@ def lstm(input,
    - $\tilde{c_t}$ is also called candidate hidden state,
      which is computed based on the current input and the previous hidden state.
-    Where sigmoid is the sigmoid operator: sigmoid(x) = 1 / (1 + e^-x), * represents a point-wise multiplication, 
+    Where sigmoid is the sigmoid operator: sigmoid(x) = 1 / (1 + e^-x), * represents a point-wise multiplication,
    X represensts a matrix multiplication
    Args:
        input (Variable): LSTM input tensor, shape MUST be ( seq_len x batch_size x input_size )
-        init_h(Variable): The initial hidden state of the LSTM                       
+        init_h(Variable): The initial hidden state of the LSTM
                       This is a tensor with shape ( num_layers x batch_size x hidden_size)
                       if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size)
        init_c(Variable): The initial cell state of the LSTM.
                       This is a tensor with shape ( num_layers x batch_size x hidden_size )
                       if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size)
-        max_len (int): max length of LSTM. the first dim of input tensor CAN NOT greater than max_len 
+        max_len (int): max length of LSTM. the first dim of input tensor CAN NOT greater than max_len
        hidden_size (int): hidden size of the LSTM
        num_layers (int): total layers number of the LSTM
        dropout_prob(float|0.0): dropout prob, dropout ONLY work between rnn layers, NOT between time steps
@@ -556,10 +557,10 @@ def lstm(input,
                         if is_bidirec set to True, shape will be ( seq_len x batch_sze x hidden_size*2)
        last_h(Tensor): the hidden state of the last step of LSTM
                        shape is ( num_layers x batch_size x hidden_size )
-                        if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size)                     
+                        if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size)
        last_c(Tensor): the cell state of the last step of LSTM
                        shape is ( num_layers x batch_size x hidden_size )
-                        if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size)                     
+                        if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size)
    Examples:
@@ -4659,7 +4660,7 @@ def ctc_greedy_decoder(input, blank, name=None):
                      [0.5, 0.1, 0.3, 0.1]]
        input.lod = [[4, 4]]
        Computation:
        step1: Apply argmax to first input sequence which is input.data[0:4]. Then we get:
@@ -4692,7 +4693,7 @@ def ctc_greedy_decoder(input, blank, name=None):
    Returns:
        Variable: CTC greedy decode result which is a 2-D tensor with shape [Lp, 1].
                  'Lp' is the sum if all output sequences' length. If all the sequences
-                  in result were empty, the result LoDTensor will be [-1] with 
+                  in result were empty, the result LoDTensor will be [-1] with
                  LoD [[]] and dims [1, 1].
    Examples:
@@ -5046,7 +5047,7 @@ def hsigmoid(input,
    """
    The hierarchical sigmoid operator is used to accelerate the training
    process of language model. This operator organizes the classes into a
-    complete binary tree, or you can use is_custom to pass your own tree to 
+    complete binary tree, or you can use is_custom to pass your own tree to
    implement hierarchical. Each leaf node represents a class(a word) and each
    internal node acts as a binary classifier. For each word there's a unique
    path from root to it's leaf node, hsigmoid calculate the cost for each
@@ -5062,7 +5063,7 @@ def hsigmoid(input,
        2. build a dict to store word_id -> word's leaf to root path, we call it path_table.
        3. build a dict to store word_id -> code of word's leaf to root path, we call it path_code. Code
         means label of each binary classification, using 1 indicate true, 0 indicate false.
-        4. now, each word should has its path and code along the path, you can pass a batch of path and code 
+        4. now, each word should has its path and code along the path, you can pass a batch of path and code
        related to the same batch of inputs.
@@ -5072,8 +5073,8 @@ def hsigmoid(input,
            and :math:`D` is the feature size.
        label (Variable): The tensor variable contains labels of training data.
            It's a tensor with shape is :math:`[N \\times 1]`.
-        num_classes: (int), The number of classes, must not be less than 2. with default tree this has to be set, 
+        num_classes: (int), The number of classes, must not be less than 2. with default tree this has to be set,
-            it should never be None under is_custom=False, but while is_custom is true, it should be non leaf num 
+            it should never be None under is_custom=False, but while is_custom is true, it should be non leaf num
            which indicates the num of classes using by binary classify.
        param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
             of hsigmoid. If it is set to None or one attribute of ParamAttr, hsigmoid
@@ -5086,15 +5087,15 @@ def hsigmoid(input,
             is not set, the bias is initialized zero. Default: None.
        name (str|None): A name for this layer(optional). If set None, the layer
             will be named automatically. Default: None.
-        path_table: (Variable|None) this variable can store each batch of samples' path to root, 
+        path_table: (Variable|None) this variable can store each batch of samples' path to root,
            it should be in leaf -> root order
-            path_table should have the same shape with path_code, and for each sample i path_table[i] indicates a np.array like 
+            path_table should have the same shape with path_code, and for each sample i path_table[i] indicates a np.array like
-            structure and each element in this array is indexes in parent nodes' Weight Matrix. 
+            structure and each element in this array is indexes in parent nodes' Weight Matrix.
-        path_code:  (Variable|None) this variable can store each batch of samples' code, 
+        path_code:  (Variable|None) this variable can store each batch of samples' code,
            each code consist with every code of parent nodes. it should be in leaf -> root order
-        is_custom: (bool|False)using user defined binary tree instead of default complete binary tree, if costum is 
+        is_custom: (bool|False)using user defined binary tree instead of default complete binary tree, if costum is
             set you need to set path_table/path_code/num_classes, otherwise num_classes should be set
-        is_sparse: (bool|False)using sparse update instead of dense update, if set, the gradient 
+        is_sparse: (bool|False)using sparse update instead of dense update, if set, the gradient
             of W and input will be sparse.
    Returns:
@@ -9419,3 +9420,51 @@ def psroi_pool(input,
            'pooled_width': pooled_width
        })
    return out
+def huber_loss(input, label, delta):
+    """
+    Huber loss is a loss function used in robust regression.
+    Huber loss can evaluate the fitness of input to label.
+    Different from MSE loss, Huber loss is more robust for outliers.
+    When the absolute difference between input and label is larger than delta
+    .. math::
+        huber\\_loss = delta * |label - input| - 0.5 * delta * delta
+    When the absolute difference between input and label is not larger than delta
+    .. math::
+        huber\\_loss = 0.5 * (label - input) * (label - input)
+    Args:
+        input (Variable): This input is a probability computed by the previous operator.
+                          The first dimension is batch size, and the last dimension is 1.
+        label (Variable): The ground truth whose first dimension is batch size
+                          and last dimension is 1.
+        delta (float): The parameter of huber loss, which controls
+                       the range of outliers
+    Returns:
+        huber\\_loss (Variable): The huber loss with shape [batch_size, 1].
+    Examples:
+        .. code-block:: python
+            predictions = fluid.layers.softmax(x)
+            loss = fluid.layers.huber_loss(input=predictions, label=label, delta=1.0)
+    """
+    helper = LayerHelper('huber_loss', **locals())
+    residual = helper.create_variable_for_type_inference(
+        dtype=helper.input_dtype())
+    out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
+    helper.append_op(
+        type='huber_loss',
+        inputs={'X': input,
+                'Y': label},
+        outputs={'Out': out,
+                 'Residual': residual},
+        attrs={'delta': delta})
+    return out
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -641,9 +641,14 @@ class AdamOptimizer(Optimizer):
        beta1 (float): The exponential decay rate for the 1st moment estimates.
        beta2 (float): The exponential decay rate for the 2nd moment estimates.
        epsilon (float): a small float value for numerical stability.
-        regularization: A Regularizer, such as
+        regularization: A Regularizer, such as fluid.regularizer.L2DecayRegularizer.
-                        fluid.regularizer.L2DecayRegularizer.
        name: A optional name prefix.
+        lazy_mode (bool, default False): The official Adam algorithm has two moving-average accumulators,
+        and the accumulators are updated at every step. Every element of the two moving averages is updated
+        in both dense mode and sparse mode. If the size of the parameter is very large, then the update
+        may be very slow. The lazy mode only updates the elements that have a gradient in the current
+        mini-batch, so it will be much faster. But this mode has different semantics from the
+        original Adam algorithm and may lead to different results.
    Examples:
        .. code-block:: python
@@ -663,7 +668,8 @@ class AdamOptimizer(Optimizer):
                 beta2=0.999,
                 epsilon=1e-8,
                 regularization=None,
-                 name=None):
+                 name=None,
+                 lazy_mode=False):
        assert learning_rate is not None
        assert beta1 is not None
        assert beta2 is not None
@@ -676,6 +682,7 @@ class AdamOptimizer(Optimizer):
        self._beta1 = beta1
        self._beta2 = beta2
        self._epsilon = epsilon
+        self._lazy_mode = lazy_mode
    def _create_accumulators(self, block, parameters):
        assert isinstance(block, framework.Block)
@@ -729,7 +736,8 @@ class AdamOptimizer(Optimizer):
            attrs={
                "beta1": self._beta1,
                "beta2": self._beta2,
-                "epsilon": self._epsilon
+                "epsilon": self._epsilon,
+                "lazy_mode": self._lazy_mode
            })
        return adam_op

--- a/python/paddle/fluid/tests/book/notest_understand_sentiment.py
+++ b/python/paddle/fluid/tests/book/notest_understand_sentiment.py
@@ -15,7 +15,6 @@
 from __future__ import print_function
 from paddle.fluid.layers.device import get_places
-from paddle.fluid.layers.control_flow import ParallelDo
 import unittest
 import paddle.fluid as fluid
 import paddle
@@ -147,22 +146,7 @@ def train(word_dict,
        cost, acc_out, prediction = net_method(
            data, label, input_dim=dict_dim, class_dim=class_dim)
    else:
-        places = get_places()
+        raise NotImplementedError()
-        pd = ParallelDo(places)
-        with pd.do():
-            cost, acc, _ = net_method(
-                pd.read_input(data),
-                pd.read_input(label),
-                input_dim=dict_dim,
-                class_dim=class_dim)
-            pd.write_output(cost)
-            pd.write_output(acc)
-        cost, acc = pd()
-        cost = fluid.layers.mean(cost)
-        acc_out = fluid.layers.mean(acc)
-        prediction = None
-        assert save_dirname is None
    adagrad = fluid.optimizer.Adagrad(learning_rate=0.002)
    adagrad.minimize(cost)

--- a/python/paddle/fluid/tests/book/test_recognize_digits.py
+++ b/python/paddle/fluid/tests/book/test_recognize_digits.py
@@ -25,7 +25,6 @@ import numpy
 import paddle
 import paddle.fluid as fluid
 from paddle.fluid.layers.device import get_places
-from paddle.fluid.layers.control_flow import ParallelDo
 BATCH_SIZE = 64
@@ -82,19 +81,7 @@ def train(nn_type,
        net_conf = conv_net
    if parallel:
-        places = get_places()
+        raise NotImplementedError()
-        pd = ParallelDo(places)
-        with pd.do():
-            img_ = pd.read_input(img)
-            label_ = pd.read_input(label)
-            prediction, avg_loss, acc = net_conf(img_, label_)
-            for o in [avg_loss, acc]:
-                pd.write_output(o)
-        avg_loss, acc = pd()
-        # get mean loss and acc through every devices.
-        avg_loss = fluid.layers.mean(avg_loss)
-        acc = fluid.layers.mean(acc)
    else:
        prediction, avg_loss, acc = net_conf(img, label)
@@ -273,7 +260,7 @@ def inject_all_tests():
    for use_cuda in (False, True):
        if use_cuda and not core.is_compiled_with_cuda():
            continue
-        for parallel in (False, True):
+        for parallel in (False, ):
            for nn_type in ('mlp', 'conv'):
                inject_test_method(use_cuda, parallel, nn_type, True)

--- a/python/paddle/fluid/tests/book/test_word2vec.py
+++ b/python/paddle/fluid/tests/book/test_word2vec.py
@@ -17,7 +17,6 @@ from __future__ import print_function
 import paddle
 import paddle.fluid as fluid
 from paddle.fluid.layers.device import get_places
-from paddle.fluid.layers.control_flow import ParallelDo
 import unittest
 import os
 import numpy as np
@@ -84,18 +83,7 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True):
        avg_cost, predict_word = __network__(
            [first_word, second_word, third_word, forth_word, next_word])
    else:
-        places = get_places()
+        raise NotImplementedError()
-        pd = ParallelDo(places)
-        with pd.do():
-            avg_cost, predict_word = __network__(
-                list(
-                    map(pd.read_input, [
-                        first_word, second_word, third_word, forth_word,
-                        next_word
-                    ])))
-            pd.write_output(avg_cost)
-        avg_cost = fluid.layers.mean(pd())
    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
    sgd_optimizer.minimize(avg_cost)
@@ -262,7 +250,7 @@ def inject_test_method(use_cuda, is_sparse, is_parallel):
 for use_cuda in (False, True):
    for is_sparse in (False, True):
-        for is_parallel in (False, True):
+        for is_parallel in (False, ):
            inject_test_method(use_cuda, is_sparse, is_parallel)
 if __name__ == '__main__':

--- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
+++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import print_function
-import math
-import sys
-import paddle
-import paddle.fluid as fluid
-from paddle.fluid.layers.device import get_places
-from paddle.fluid.layers.control_flow import ParallelDo
-# need to fix random seed and training data to compare the loss
-# value accurately calculated by the default and the memory optimization
-# version.
-fluid.default_startup_program().random_seed = 111
-x = fluid.layers.data(name='x', shape=[13], dtype='float32')
-y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-device_type = 'CPU'
-use_nccl = False
-place = fluid.CPUPlace()
-if fluid.core.is_compiled_with_cuda():
-    device_type = 'CUDA'
-    use_nccl = False
-    place = fluid.CUDAPlace(0)
-places = get_places(device_count=0, device_type=device_type)
-pd = ParallelDo(places, use_nccl=use_nccl)
-with pd.do():
-    x_ = pd.read_input(x)
-    y_ = pd.read_input(y)
-    y_predict = fluid.layers.fc(input=x_, size=1, act=None)
-    cost = fluid.layers.square_error_cost(input=y_predict, label=y_)
-    avg_cost = fluid.layers.mean(x=cost)
-    pd.write_output(avg_cost)
-cost = pd()
-avg_cost = fluid.layers.mean(x=cost)
-sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01)
-sgd_optimizer.minimize(avg_cost)
-fluid.memory_optimize(fluid.default_main_program(), print_log=True)
-# fluid.release_memory(fluid.default_main_program())
-BATCH_SIZE = 200
-# fix the order of training data
-train_reader = paddle.batch(
-    paddle.dataset.uci_housing.train(), batch_size=BATCH_SIZE, drop_last=False)
-# train_reader = paddle.batch(
-#     paddle.reader.shuffle(
-#         paddle.dataset.uci_housing.train(), buf_size=500),
-#     batch_size=BATCH_SIZE)
-feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
-exe = fluid.Executor(place)
-exe.run(fluid.default_startup_program())
-PASS_NUM = 100
-for pass_id in range(PASS_NUM):
-    for data in train_reader():
-        avg_loss_value, = exe.run(fluid.default_main_program(),
-                                  feed=feeder.feed(data),
-                                  fetch_list=[avg_cost])
-        if avg_loss_value[0] < 10.0:
-            exit(0)  # if avg cost less than 10.0, we think our code is good.
-        print(avg_loss_value[0])
-        if math.isnan(float(avg_loss_value)):
-            sys.exit("got NaN loss, training failed.")
-exit(1)
--- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
+++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
@@ -39,6 +39,7 @@ class TestParallelExecutorBase(unittest.TestCase):
                                  seed=None,
                                  use_parallel_executor=True,
                                  use_reduce=False,
+                                  use_ir_memory_optimize=False,
                                  fuse_elewise_add_act_ops=False,
                                  optimizer=fluid.optimizer.Adam,
                                  use_fast_executor=False,
@@ -82,6 +83,7 @@ class TestParallelExecutorBase(unittest.TestCase):
            build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \
                if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce
            build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops
+            build_strategy.memory_optimize = use_ir_memory_optimize
            build_strategy.enable_sequential_execution = enable_sequential_execution
            if use_cuda and core.is_compiled_with_cuda():
                build_strategy.remove_unnecessary_lock = True

--- a/python/paddle/fluid/tests/unittests/test_adam_op.py
+++ b/python/paddle/fluid/tests/unittests/test_adam_op.py
@@ -194,7 +194,8 @@ def adam_step(inputs, attributes):
    return param_out, moment1_out, moment2_out
-def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad):
+def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad,
+                     lazy_mode):
    '''
    Simulate one step of the adam optimizer
    :param inputs: dict of inputs
@@ -218,19 +219,30 @@ def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad):
    moment2_out = np.zeros(shape=[height, row_numel])
    param_out = np.zeros(shape=[height, row_numel])
-    for idx, row_id in enumerate(rows):
+    def update_row(row_id, update_value):
        moment1_out[row_id] = beta1 * moment1[row_id] + (1 - beta1
-                                                         ) * np_grad[idx]
+                                                         ) * update_value
        moment2_out[row_id] = beta2 * moment2[row_id] + (
-            1 - beta2) * np.square(np_grad[idx])
+            1 - beta2) * np.square(update_value)
        lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)
        param_out[row_id] = param[row_id] - lr_t * (moment1_out[row_id] / (
            np.sqrt(moment2_out[row_id]) + epsilon))
+    if lazy_mode:
+        for idx, row_id in enumerate(rows):
+            update_row(row_id, np_grad[idx])
+    else:
+        for row_id in range(param_out.shape[0]):
+            update_value = np.zeros(np_grad[0].shape).astype("float32")
+            if row_id in rows:
+                update_value = np_grad[rows.index(row_id)]
+            update_row(row_id, update_value)
    return param_out, moment1_out, moment2_out
 class TestSparseAdamOp(unittest.TestCase):
-    def setup(self, scope, place):
+    def setup(self, scope, place, lazy_mode):
        beta1 = 0.78
        beta2 = 0.836
        epsilon = 1e-4
@@ -248,6 +260,7 @@ class TestSparseAdamOp(unittest.TestCase):
            'Beta2Pow': np.array([beta2**10]).astype("float32"),
            "LearningRate": np.full((1), 2.0).astype("float32")
        }
+        self.init_output = np.full((height, row_numel), 0.0).astype("float32")
        self.attrs = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2}
        grad_selected_rows = scope.var('Grad').get_selected_rows()
@@ -262,19 +275,21 @@ class TestSparseAdamOp(unittest.TestCase):
        self.sparse_inputs = ["Grad"]
-        param_out, mom1, mom2 = adam_step_sparse(
+        param_out, mom1, mom2 = adam_step_sparse(self.dense_inputs, self.attrs,
-            self.dense_inputs, self.attrs, height, rows, row_numel, np_array)
+                                                 height, rows, row_numel,
+                                                 np_array, lazy_mode)
        self.outputs = {
            "ParamOut": param_out,
            "Moment1Out": mom1,
            "Moment2Out": mom2
        }
-    def check_with_place(self, place):
+    def check_with_place(self, place, lazy_mode):
        scope = core.Scope()
-        self.setup(scope, place)
+        self.setup(scope, place, lazy_mode)
        op_args = dict()
+        op_args['lazy_mode'] = lazy_mode
        for key, np_array in self.dense_inputs.items():
            var = scope.var(key).get_tensor()
            var.set(np_array, place)
@@ -283,7 +298,7 @@ class TestSparseAdamOp(unittest.TestCase):
            op_args[s] = s
        for s in self.outputs:
            var = scope.var(s).get_tensor()
-            var.set(self.outputs[s], place)
+            var.set(self.init_output, place)
            op_args[s] = s
        for k in self.attrs:
            op_args[k] = self.attrs[k]
@@ -297,20 +312,17 @@ class TestSparseAdamOp(unittest.TestCase):
            actual = np.array(out_var)
            actual = actual.reshape([actual.size])
            np_array = np_array.reshape([np_array.size])
-            for idx, row_id in enumerate(self.rows):
-                j = 0
+            for i in range(np_array.size):
-                while j < self.row_numel:
+                self.assertLess((actual[i] - np_array[i]), 0.00001)
-                    pos = row_id * self.row_numel + j
-                    self.assertLess((actual[pos] - np_array[pos]) / actual[pos],
+    def test_sparse_adam(self):
-                                    0.00001)
-                    j += 1
-    def test_sparse_sgd(self):
        places = [core.CPUPlace()]
        if core.is_compiled_with_cuda():
            places.append(core.CUDAPlace(0))
        for place in places:
-            self.check_with_place(place)
+            for lazy_mode in (True, False):
+                self.check_with_place(place, lazy_mode)
 if __name__ == "__main__":

--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py
@@ -39,6 +39,7 @@ def train(network, use_cuda, use_parallel_executor, batch_size=32, pass_num=2):
    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
    cost = network(data, label, len(word_dict))
+    cost.persistable = True
    optimizer = fluid.optimizer.Adagrad(learning_rate=0.2)
    optimizer.minimize(cost)

--- a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from parallel_executor_test_base import TestParallelExecutorBase
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import numpy as np
+import paddle
+import paddle.dataset.mnist as mnist
+import unittest
+import os
+MNIST_RECORDIO_FILE = "./mnist_test_pe.recordio"
+def _feed_data_helper(use_feed):
+    """Create the image/label input variables for the test networks.
+
+    Args:
+        use_feed: when truthy, declare two feedable data layers named
+            'image' and 'label'; otherwise read both variables from
+            MNIST_RECORDIO_FILE through a double-buffered file reader.
+
+    Returns:
+        The tuple (img, label).
+    """
+    if use_feed:
+        img = fluid.layers.data(name='image', shape=[784], dtype='float32')
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    else:
+        # The declared shapes/dtypes mirror the feed branch above.
+        reader = fluid.layers.open_files(
+            filenames=[MNIST_RECORDIO_FILE],
+            shapes=[[-1, 784], [-1, 1]],
+            lod_levels=[0, 0],
+            dtypes=['float32', 'int64'])
+        reader = fluid.layers.io.double_buffer(reader)
+        img, label = fluid.layers.read_file(reader)
+    return img, label
+def simple_fc_net(use_feed):
+    """Build a small fully-connected classifier and return its mean loss.
+
+    The network stacks four relu fc layers of size 20, followed by a
+    softmax fc layer of size 10, and averages the cross-entropy against
+    the label.
+
+    Args:
+        use_feed: forwarded to _feed_data_helper to choose the input source.
+
+    Returns:
+        The averaged cross-entropy cost variable.
+    """
+    x, y = _feed_data_helper(use_feed)
+    hidden_layer = 4
+    for _ in range(hidden_layer):
+        x = fluid.layers.fc(input=x, size=20, act='relu')
+    y_predict = fluid.layers.fc(input=x, size=10, act='softmax')
+    cost = fluid.layers.cross_entropy(input=y_predict, label=y)
+    avg_cost = fluid.layers.mean(cost)
+    return avg_cost
+def fc_with_inplace_net(use_feed):
+    """Build an fc network with two chained reshapes and return its mean loss.
+
+    Two relu fc layers (sizes 20 and 10) feed two consecutive reshape
+    layers ([-1, 2, 5] then [-1, 5, 2]) before the softmax fc layer.
+    NOTE(review): judging by the name, the reshapes are presumably here to
+    exercise in-place variable reuse -- confirm against the pass under test.
+
+    Args:
+        use_feed: forwarded to _feed_data_helper to choose the input source.
+
+    Returns:
+        The averaged cross-entropy cost variable.
+    """
+    x, y = _feed_data_helper(use_feed)
+    fc = fluid.layers.fc(input=x, size=20, act='relu')
+    fc = fluid.layers.fc(input=fc, size=10, act='relu')
+    reshape = fluid.layers.reshape(x=fc, shape=[-1, 2, 5])
+    reshape = fluid.layers.reshape(x=reshape, shape=[-1, 5, 2])
+    y_predict = fluid.layers.fc(input=reshape, size=10, act='softmax')
+    cost = fluid.layers.cross_entropy(input=y_predict, label=y)
+    avg_cost = fluid.layers.mean(cost)
+    return avg_cost
+class TestMNIST(TestParallelExecutorBase):
+    """Check that enabling the IR memory-optimize option leaves losses unchanged.
+
+    Each model is run twice through check_network_convergence, once with
+    use_ir_memory_optimize=False and once with True, and the reported
+    losses are compared.
+    """
+
+    @classmethod
+    def setUpClass(cls):
+        # Set CPU_NUM for the whole test process.
+        # NOTE(review): presumably this fixes the number of CPU places used
+        # by the parallel executor -- confirm against the base class.
+        os.environ['CPU_NUM'] = str(4)
+        # Convert mnist to recordio file
+        # Fresh main/startup programs keep the temporary data layers out of
+        # the default programs.
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            reader = paddle.batch(mnist.train(), batch_size=4)
+            feeder = fluid.DataFeeder(
+                feed_list=[  # order is image and label
+                    fluid.layers.data(
+                        name='image', shape=[784]),
+                    fluid.layers.data(
+                        name='label', shape=[1], dtype='int64'),
+                ],
+                place=fluid.CPUPlace())
+            fluid.recordio_writer.convert_reader_to_recordio_file(
+                MNIST_RECORDIO_FILE, reader, feeder)
+    def _dummy_data(self):
+        # Deterministic batch: 32 random float32 images of width 784 (fixed
+        # seed) and an all-ones int64 label column.
+        np.random.seed(5)
+        img = np.random.random(size=[32, 784]).astype(np.float32)
+        label = np.ones(shape=[32, 1], dtype='int64')
+        return img, label
+    def _compare_ir_and_python_memory_optimize(self, model, use_cuda):
+        # Runs `model` with and without use_ir_memory_optimize on identical
+        # feed data and asserts the losses agree within 1e-6.
+        # Silently skip the CUDA variant on CPU-only builds.
+        if use_cuda and not core.is_compiled_with_cuda():
+            return
+        img, label = self._dummy_data()
+        # Baseline run: IR memory optimization disabled.
+        first_loss0, last_loss0 = self.check_network_convergence(
+            model,
+            feed_dict={"image": img,
+                       "label": label},
+            use_cuda=use_cuda,
+            memory_opt=False,
+            use_ir_memory_optimize=False)
+        # Same configuration with IR memory optimization enabled.
+        first_loss1, last_loss1 = self.check_network_convergence(
+            model,
+            feed_dict={"image": img,
+                       "label": label},
+            use_cuda=use_cuda,
+            memory_opt=False,
+            use_ir_memory_optimize=True)
+        # Element-wise comparison of the two runs' first and last losses.
+        for loss in zip(first_loss0, first_loss1):
+            self.assertAlmostEqual(loss[0], loss[1], delta=1e-6)
+        for loss in zip(last_loss0, last_loss1):
+            self.assertAlmostEqual(loss[0], loss[1], delta=1e-6)
+    def test_simple_fc_net(self):
+        # Plain fc stack, on CPU and then (if available) on CUDA.
+        self._compare_ir_and_python_memory_optimize(simple_fc_net, False)
+        self._compare_ir_and_python_memory_optimize(simple_fc_net, True)
+    def test_fc_with_reshape_net(self):
+        # fc network containing chained reshapes, on CPU and then CUDA.
+        self._compare_ir_and_python_memory_optimize(fc_with_inplace_net, False)
+        self._compare_ir_and_python_memory_optimize(fc_with_inplace_net, True)
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_parallel_op.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_op.py
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import print_function
-import unittest
-import paddle.fluid as fluid
-from paddle.fluid.layers.device import get_places
-from paddle.fluid.layers.control_flow import ParallelDo
-import paddle.fluid.profiler as profiler
-import numpy
-import six
-class BaseParallelForTest(unittest.TestCase):
-    def run_test(self, callback, feed, fetch):
-        """
-        Run the unittest for parallel.for
-        Args:
-            callback(callable): A callable function returns a generator. There
-                are two yields in the generator function. The first yield
-                returns the data layers, and the second yield returns the loss.
-                The modified data variables will be sent back during the first
-                yield.
-            feed(dict): The executor feeding dictionary.
-            fetch(list|basestr): The fetch name lists.
-        Returns:
-            None
-        Raises:
-            AssertionError when the computation of cpu, parallel.for in cpu,
-                gpu, parallel.for in gpu are different.
-        """
-        cpu = fluid.CPUPlace()
-        result_cpu = self._run_test_impl_(
-            callback=callback,
-            feed=feed,
-            fetch=fetch,
-            place=cpu,
-            use_parallel=False)
-        result_cpu_parallel = self._run_test_impl_(
-            callback=callback,
-            feed=feed,
-            fetch=fetch,
-            place=cpu,
-            use_parallel=True)
-        if fluid.core.is_compiled_with_cuda():
-            gpu = fluid.CUDAPlace(0)
-            result_gpu = self._run_test_impl_(
-                callback=callback,
-                feed=feed,
-                fetch=fetch,
-                place=gpu,
-                use_parallel=False,
-                use_gpu=True)
-            result_gpu_parallel = self._run_test_impl_(
-                callback=callback,
-                feed=feed,
-                fetch=fetch,
-                place=gpu,
-                use_parallel=True,
-                use_gpu=True)
-            result_gpu_nccl = self._run_test_impl_(
-                callback=callback,
-                feed=feed,
-                fetch=fetch,
-                place=gpu,
-                use_parallel=True,
-                use_nccl=True,
-                use_gpu=True)
-            self._assert_same_(fetch, result_cpu, result_cpu_parallel,
-                               result_gpu, result_gpu_parallel, result_gpu_nccl)
-        else:
-            self._assert_same_(fetch, result_cpu, result_cpu_parallel)
-    def _run_test_impl_(self,
-                        callback,
-                        feed,
-                        fetch,
-                        place,
-                        use_parallel=False,
-                        use_nccl=False,
-                        use_gpu=False):
-        """
-        Run a single test, returns the fetch values
-        Args:
-            place(Place): the computation place.
-            use_parallel(bool): Whether use parallel.for or not.
-        Returns:
-            Fetched numpy arrays.
-        """
-        if isinstance(fetch, six.string_types):
-            fetch = [fetch]
-        main = fluid.Program()
-        startup = fluid.Program()
-        # Fix seed
-        main.random_seed = 10
-        startup.random_seed = 10
-        with fluid.program_guard(main, startup):
-            generator = callback()
-            # Automatically insert parallel do if use_parallel = True
-            if use_parallel:
-                thread_num = fluid.core.get_cuda_device_count(
-                ) if use_gpu else 8
-                places = get_places(thread_num)
-                pd = ParallelDo(places, use_nccl=use_nccl)
-                data = next(generator)
-                if isinstance(data, fluid.framework.Variable):
-                    data = [data]
-                with pd.do():
-                    ins = list(map(pd.read_input, data))
-                    if len(ins) == 1:
-                        ins = ins[0]
-                    loss = generator.send(ins)  # patch input
-                    pd.write_output(loss)
-                loss = pd()
-            else:
-                data = next(generator)
-                loss = generator.send(data)
-            self.assertIsNotNone(loss)
-            avg_loss = fluid.layers.mean(loss)
-            fluid.backward.append_backward(loss=avg_loss)
-        exe = fluid.Executor(place)
-        exe.run(startup)
-        if use_gpu:
-            profile_type = 'GPU'
-        else:
-            profile_type = 'CPU'
-        with profiler.profiler(profile_type, 'total', '/tmp/profiler'):
-            return exe.run(main, feed=feed, fetch_list=fetch)
-    def _assert_same_(self, fetch, *args):
-        """
-        Assert the return values of `run_test` are same.
-        Args:
-            fetch: Fetch list. Used for print error message
-            *args: The fetch result lists of each situations.
-        Returns:
-            None
-        Raises:
-            AssertionError
-        """
-        def _impl_(a, b, fetch_id, item_id):
-            item_str = [
-                'CPU', 'ParallelCPU', 'GPU', 'ParallelGPU', 'ParallelGPUNCCL'
-            ]
-            flag = numpy.allclose(a, b, rtol=0.1, atol=1e-3)
-            self.assertTrue(flag,
-                            "The {0} are different in {1}, {2} vs {3}".format(
-                                fetch[fetch_id], item_str[item_id], a, b))
-        for i, items in enumerate(zip(*args)):
-            self.assertGreater(len(items), 0)
-            for j in range(1, len(items)):
-                _impl_(items[0], items[j], fetch_id=i, item_id=j)
-class ParallelOpTest(BaseParallelForTest):
-    @staticmethod
-    def __network__():
-        x = fluid.layers.data(shape=[784], dtype='float32', name='img')
-        x = yield x
-        hidden = fluid.layers.fc(input=x, size=200, param_attr='fc1.w')
-        hidden = fluid.layers.batch_norm(input=hidden)
-        loss = fluid.layers.mean(hidden)
-        yield loss
-    def test_simple_fc(self):
-        self.run_test(
-            callback=self.__network__,
-            feed={
-                'img': numpy.random.random(size=(51, 784)).astype('float32')
-            },
-            fetch=['fc1.w@GRAD'])
-    def test_fc_with_tiny_data(self):
-        self.run_test(
-            callback=self.__network__,
-            feed={'img': numpy.random.random(size=(1, 784)).astype('float32')},
-            fetch=['fc1.w@GRAD'])
-class ParallelOpTestMultipleInput(BaseParallelForTest):
-    @staticmethod
-    def __network__():
-        x = fluid.layers.data(
-            shape=[784], dtype='float32', name='img1', stop_gradient=False)
-        y = fluid.layers.data(
-            shape=[784], dtype='float32', name='img2', stop_gradient=False)
-        yield [x, y]
-        x = x + y
-        hidden1 = fluid.layers.fc(input=x, size=200, param_attr='fc1.w')
-        hidden2 = fluid.layers.fc(input=hidden1, size=200, param_attr='fc2.w')
-        hidden3 = fluid.layers.fc(input=hidden2, size=200, param_attr='fc3.w')
-        loss = fluid.layers.mean(hidden3)
-        yield loss
-    def test_simple_fc(self):
-        self.run_test(
-            callback=self.__network__,
-            feed={
-                'img1': numpy.random.random(size=(51, 784)).astype('float32'),
-                'img2': numpy.random.random(size=(51, 784)).astype('float32')
-            },
-            fetch=['fc1.w@GRAD', 'fc2.w@GRAD', 'fc3.w@GRAD'])
-if __name__ == '__main__':
-    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_transpose_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_transpose_mkldnn_op.py
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+import unittest
+from test_transpose_op import TestTransposeOp
+class TestTransposeMKLDNN(TestTransposeOp):
+    """Run the transpose2 op test with use_mkldnn and is_test both set to True.
+
+    The gradient-check test methods are overridden with no-ops, so only
+    the remaining inherited tests run for this configuration.
+    """
+
+    def init_op_type(self):
+        # Same op type as the base test, with the MKLDNN and inference-mode
+        # flags switched on.
+        self.op_type = "transpose2"
+        self.use_mkldnn = True
+        self.is_test = True
+        return
+    def test_check_grad(self):
+        # No-op: gradient checking is disabled for this configuration.
+        return
+    def test_check_grad_no_input(self):
+        # No-op. NOTE(review): confirm the base class defines this test.
+        return
+    def test_check_grad_no_filter(self):
+        # No-op. NOTE(review): confirm the base class defines this test.
+        return
+class TestCase0MKLDNN(TestTransposeMKLDNN):
+    # 1-D input of length 3 with the identity permutation (0,).
+    def initTestCase(self):
+        self.shape = (3, )
+        self.axis = (0, )
+class TestCase1a(TestTransposeMKLDNN):
+    # 3-D input (3, 4, 5); permutation (0, 2, 1) swaps the last two axes.
+    def initTestCase(self):
+        self.shape = (3, 4, 5)
+        self.axis = (0, 2, 1)
+class TestCase1b(TestTransposeMKLDNN):
+    # 3-D input (3, 4, 5); permutation (2, 1, 0) reverses the axis order.
+    def initTestCase(self):
+        self.shape = (3, 4, 5)
+        self.axis = (2, 1, 0)
+class TestCase2(TestTransposeMKLDNN):
+    # 4-D input (2, 3, 4, 5); permutation (0, 2, 3, 1) moves axis 1 to the end.
+    def initTestCase(self):
+        self.shape = (2, 3, 4, 5)
+        self.axis = (0, 2, 3, 1)
+class TestCase3(TestTransposeMKLDNN):
+    # 5-D input (2, 3, 4, 5, 6) with the permutation (4, 2, 3, 1, 0).
+    def initTestCase(self):
+        self.shape = (2, 3, 4, 5, 6)
+        self.axis = (4, 2, 3, 1, 0)
+class TestCase4(TestTransposeMKLDNN):
+    # 6-D input with a trailing unit dimension; the permutation
+    # (4, 2, 3, 1, 0, 5) leaves that last axis in place.
+    def initTestCase(self):
+        self.shape = (2, 3, 4, 5, 6, 1)
+        self.axis = (4, 2, 3, 1, 0, 5)
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_transpose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_transpose_op.py
@@ -21,15 +21,24 @@ from op_test import OpTest
 class TestTransposeOp(OpTest):
    def setUp(self):
+        self.init_op_type()
        self.initTestCase()
-        self.op_type = "transpose2"
        self.inputs = {'X': np.random.random(self.shape).astype("float32")}
-        self.attrs = {'axis': list(self.axis)}
+        self.attrs = {
+            'axis': list(self.axis),
+            'use_mkldnn': self.use_mkldnn,
+            'is_test': self.is_test,
+        }
        self.outputs = {
            'XShape': np.random.random(self.shape).astype("float32"),
            'Out': self.inputs['X'].transpose(self.axis)
        }
+    def init_op_type(self):
+        self.op_type = "transpose2"
+        self.use_mkldnn = False
+        self.is_test = False
    def test_check_output(self):
        self.check_output(no_check_set=['XShape'])

--- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
+++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
@@ -35,14 +35,14 @@ dtype_to_size = {
 }
 SUB_BLOCK_OPS = [
-    "while", "while_grad", "parallel_do", "parallel_do_grad",
+    "while", "while_grad", "conditional_block", "conditional_block_grad"
-    "conditional_block", "conditional_block_grad"
 ]
-SUB_BLOCK_PAIR = [("while", "while_grad"), ("parallel_do", "parallel_do_grad"),
+SUB_BLOCK_PAIR = [("while", "while_grad"),
                  ("conditional_block", "conditional_block_grad")]
 PRINT_LOG = False
+FLAGS_memory_optimize = ""
 class OrderedSet(MutableSet):
@@ -121,6 +121,7 @@ class ControlFlowGraph(object):
        self._defs = defaultdict(OrderedSet)
        self._live_in = defaultdict(OrderedSet)
        self._live_out = defaultdict(OrderedSet)
        self._skip_opt = skip_opt
        self.pool = []
@@ -144,7 +145,6 @@ class ControlFlowGraph(object):
        for i in range(self.op_size):
            self._uses[i].update(self._ops[i].input_arg_names())
            self._defs[i].update(self._ops[i].output_arg_names())
-            self._live_in[i] = self._uses[i]
    def _update_graph(self, old_name, new_name, begin_idx=0):
        for i in range(begin_idx, self.op_size):
@@ -177,20 +177,52 @@ class ControlFlowGraph(object):
                    worklist.append(d)
    def _fill_pool(self, i, is_forward):
+        def comparator(x, cache):
+            x_shape = x[1]
+            cache_shape = cache[1]
+            x_size = abs(reduce(lambda x, y: x * y, x_shape))
+            cache_size = abs(reduce(lambda x, y: x * y, cache_shape))
+            if (x_shape[0] == -1 and cache_shape[0] == -1) or \
+               (x_shape[0] != -1 and cache_shape[0] != -1) :
+                return x_size <= cache_size
+            else:
+                return False
+        def find_var_in_block(x):
+            known_vars = set()
+            for op in self._ops:
+                known_vars.update(op.output_arg_names())
+            return x in known_vars
        block_desc = self._ops[i].block()
        in_diff, _ = self._get_diff(self._live_in[i], self._live_out[i])
        # NOTE: must sort the in_diff set for cases that get different cache var.
        # FIXME(typhoonzero): maybe use a "sorted set" is better than this.
        can_optimize = [
-            x for x in in_diff
+            x for x in sorted(in_diff)
            if self._check_var_validity(block_desc, x, is_forward)
        ]
        if can_optimize:
            for var_name in can_optimize:
                cache = (var_name, self._find_var(block_desc, var_name,
                                                  is_forward).shape())
-                if cache not in self.pool:
+                if cache not in self.pool and find_var_in_block(var_name):
-                    self.pool.append(cache)
+                    i = 0
+                    while i < len(self.pool):
+                        mycache = self.pool[i]
+                        mysize = mycache[1][0]
+                        cache_size = cache[1][0]
+                        if (mysize == -1 and cache_size == -1) or \
+                           (mysize != -1 and cache_size != -1):
+                            if comparator(mycache, cache):
+                                i += 1
+                            else:
+                                break
+                        elif mysize == -1 and cache_size != -1:
+                            i += 1
+                        elif mysize != -1 and cache_size == -1:
+                            break
+                    self.pool.insert(i, cache)
    def _get_diff(self, a, b):
        u = a & b
@@ -229,7 +261,7 @@ class ControlFlowGraph(object):
    def _update_skip_opt_set(self):
        for i in range(self.op_size):
            op = self._ops[i]
-            if op.type() == "fill_constant" and op.attr("force_cpu") == True:
+            if op.has_attr("force_cpu") and op.attr("force_cpu") == True:
                self._skip_opt.update(op.output_arg_names())
    def release_memory(self, skip_opt_set=None):
@@ -281,6 +313,7 @@ class ControlFlowGraph(object):
        # update skip set to meet users' demand
        if skip_opt_set:
            self._skip_opt.update(skip_opt_set)
+        counter = 0
        for i in range(self.op_size):
            op = self._ops[i]
            if op.type() in SUB_BLOCK_OPS:
@@ -301,6 +334,9 @@ class ControlFlowGraph(object):
                    # If x is both in uses and defs, it can not be optimized!
                    if x in self._uses[i]:
                        continue
+                    if x == FLAGS_memory_optimize:
+                        print("start match var ", x, " of op ", op.type())
+                        print(self.pool)
                    for index, cache_pair in enumerate(self.pool):
                        cache_var = cache_pair[0]
                        cache_shape = cache_pair[1]
@@ -323,15 +359,13 @@ class ControlFlowGraph(object):
                        if not compare_shape(x_shape, cache_shape, level):
                            continue
                        # TODO(qijun): dtype_to_size[x_dtype] and dtype_to_size[cache_dtype]
-                        if x_dtype != cache_dtype:
-                            continue
                        if PRINT_LOG:
-                            print(("Hit Cache !!!! cache pool index "
+                            print(
-                                   "is %d, var name is %s, "
+                                ("!!! %d,  %s => %s, cache idx %d, pool size %d"
-                                   "cached var name is %s, "
+                                 % (counter, x + str(x_shape),
-                                   "var shape is %s ") % (index, x, cache_var,
+                                    cache_var + str(cache_shape), index,
-                                                          str(cache_shape)))
+                                    len(self.pool))))
+                            counter += 1
                        self.pool.pop(index)
                        # Rename the var to the cache var already with
                        # memory allocated in order to reuse the memory.
@@ -484,8 +518,11 @@ def memory_optimize(input_program,
    if level != 0 and level != 1:
        raise ValueError("only support opt_level 0 or 1.")
-    if skip_opt_set is not None and not isinstance(skip_opt_set, set):
+    if skip_opt_set is not None:
-        raise ValueError("only support skip_opt_set as set.")
+        if isinstance(skip_opt_set, set) or isinstance(skip_opt_set, list):
+            skip_opt_set = set(skip_opt_set)
+        else:
+            raise ValueError("only support skip_opt_set as set.")
    global PRINT_LOG
    PRINT_LOG = print_log
    if skip_grads:

--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -160,10 +160,11 @@ if '${WITH_FLUID_ONLY}'== 'OFF':
 # put all thirdparty libraries in paddle.libs
 libs_path='${PADDLE_BINARY_DIR}/python/paddle/libs'
-if os.name != 'nt':
-    package_data['paddle.libs']= []
+package_data['paddle.libs']= []
-    package_data['paddle.libs']=['libwarpctc' + ext_name]
+package_data['paddle.libs']=[('libwarpctc' if os.name != 'nt' else 'warpctc') + ext_name]
-    shutil.copy('${WARPCTC_LIBRARIES}', libs_path)
+shutil.copy('${WARPCTC_LIBRARIES}', libs_path)
 if '${WITH_MKL}' == 'ON':
    shutil.copy('${MKLML_LIB}', libs_path)
    shutil.copy('${MKLML_IOMP_LIB}', libs_path)