merge release/1.3

f79a3a83 · sneaxiy · d2d3f2b5 · f79a3a83 · f79a3a83 · f79a3a83
64 changed file
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -231,7 +231,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
    )
 ENDFUNCTION()

-SET(PROTOBUF_VERSION 3.1)
+SET(PROTOBUF_VERSION 3.1.0)

 IF(NOT PROTOBUF_FOUND)
    build_protobuf(extern_protobuf FALSE)

--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -21,7 +21,7 @@ function(CheckCompilerCXX11Flag)
            if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.3)
                message(FATAL_ERROR "Unsupported Clang version. Clang >= 3.3 required.")
            endif()
-        endif()   
+        endif()
    endif()
 endfunction()

@@ -147,6 +147,7 @@ set(GPU_COMMON_FLAGS
    -Wno-error=unused-function  # Warnings in Numpy Header.
    -Wno-error=array-bounds # Warnings in Eigen::array
 )
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64")
 endif(NOT WIN32)

 if (APPLE)

--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -261,7 +261,7 @@ paddle.fluid.layers.increment ArgSpec(args=['x', 'value', 'in_place'], varargs=N
 paddle.fluid.layers.array_write ArgSpec(args=['x', 'i', 'array'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.create_array ArgSpec(args=['dtype'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.less_than ArgSpec(args=['x', 'y', 'force_cpu', 'cond'], varargs=None, keywords='ignored', defaults=(None, None))
-paddle.fluid.layers.equal ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords='ignored', defaults=(None,))
+paddle.fluid.layers.equal ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.array_read ArgSpec(args=['array', 'i'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.array_length ArgSpec(args=['array'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.IfElse.__init__ ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,))

--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -50,12 +50,15 @@ cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_
 cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
 cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope)

-cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper)
+if(WITH_GPU)
+cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper gpu_info)
+else()
+cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper cpu_info)
+endif()
+
 cc_library(memory_optimize_pass SRCS memory_optimize_pass.cc DEPS memory_optimize_helper pass)
 cc_library(inplace_op_pass SRCS inplace_op_pass.cc DEPS memory_optimize_pass op_info)
 cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper)
-cc_library(memory_early_delete_pass SRCS memory_early_delete_pass.cc DEPS memory_optimize_pass computation_op_handle scale_loss_grad_op_handle rpc_op_handle
-        all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass)
 cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle)
 cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper)
 cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass)
@@ -67,13 +70,11 @@ cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_he
 cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
        scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle)

-set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass memory_early_delete_pass inplace_op_pass)
+set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass inplace_op_pass)
 if (WITH_GPU)
  list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass)
 endif()
-cc_test(memory_optimize_helper_test SRCS memory_optimize_helper_test.cc memory_optimize_helper.cc DEPS framework_proto graph)
-cc_test(memory_optimize_pass_test SRCS memory_optimize_pass_test.cc memory_optimize_pass.cc memory_optimize_helper.cc DEPS framework_proto graph graph_helper op_registry pass)
-
+cc_test(memory_optimize_helper_test SRCS memory_optimize_helper_test.cc memory_optimize_helper.cc DEPS framework_proto graph graph_helper op_registry)
 cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS})

 cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope

--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -206,8 +206,6 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
          new std::vector<OpDesc *>(main_program.Block(0).AllOps());
      graph->Set<const std::vector<OpDesc *>>(kAllOpDescs,
                                              all_op_descs);  // take ownership
-      graph->Set<GraphNodePool>(kGraphNodePool,
-                                new GraphNodePool);  // take ownership

      pass->Erase(kAllOpDescs);
      pass->SetNotOwned<const std::vector<OpDesc *>>(kAllOpDescs, all_op_descs);
@@ -242,7 +240,9 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
        continue;
      }
    }
+    VLOG(3) << "Start Apply Pass " << pass->Type();
    graph = pass->Apply(std::move(graph));
+    VLOG(3) << "Finish Apply Pass " << pass->Type();
  }
  return graph;
 }

--- a/paddle/fluid/framework/details/inplace_op_pass.cc
+++ b/paddle/fluid/framework/details/inplace_op_pass.cc
@@ -49,7 +49,7 @@ DEFINE_bool(
    "If this option turns on, only these op in whitelist can be inplaced."
    "If it turns off, all of the running op can be candidate of inplaced op."
    "Such as scale, elementwise_add"
-    "By default, it's turned on");
+    "By default, it's turned off");

 DECLARE_string(memory_optimize_debug);

@@ -171,16 +171,15 @@ void InplacePass::InplaceModifyDesc(const std::string& var,
  }
 }

-const SSANodePair InplacePass::TryInplaceModifyVar(const std::string& var,
-                                                   const std::string& cache_var,
-                                                   const size_t& idx,
-                                                   ir::Graph* graph) const {
+const NodeSwapQueue InplacePass::TryInplaceModifyVar(
+    const std::string& var, const std::string& cache_var, const size_t& idx,
+    ir::Graph* graph) const {
  PADDLE_ENFORCE(var_nodes_[var].size() >= 1 &&
                 var_nodes_[var].at(0)->Var() != nullptr);
  std::unique_ptr<VarDesc> var_desc(new VarDesc(*var_nodes_[var].at(0)->Var()));
  var_desc->SetName(cache_var);

-  SSANodePair swap_nodes;
+  NodeSwapQueue swap_nodes;

  for (size_t i = idx; i < view_.AllOps().size(); ++i) {
    auto* op = view_.AllOps()[i];
@@ -230,7 +229,7 @@ const SSANodePair InplacePass::TryInplaceModifyVar(const std::string& var,
  return swap_nodes;
 }

-void InplacePass::CommitModify(const SSANodePair& swap_nodes,
+void InplacePass::CommitModify(const NodeSwapQueue& swap_nodes,
                               ir::Graph* graph) const {
  for (auto& pair : swap_nodes) {
    auto *node = pair.first, *cache_node = pair.second;
@@ -245,7 +244,7 @@ void InplacePass::CommitModify(const SSANodePair& swap_nodes,
  }
 }

-void InplacePass::WithdrawModify(const SSANodePair& nodes,
+void InplacePass::WithdrawModify(const NodeSwapQueue& nodes,
                                 ir::Graph* graph) const {
  for (auto& pair : nodes) {
    auto *node = pair.first, *cache_node = pair.second;
@@ -403,18 +402,20 @@ void GraphView::Build(ir::Graph* g) {
  // 2. track the nodes which used by parameter server.
  // these node can not be inplaced, otherwise trainer
  // pserver can not find each other name.
-  for (auto& node : g->Nodes()) {
-    if (!node->IsOp()) continue;
-    if (node->Name() == "send") {
-      for (auto& in : node->inputs) {
-        dup_nodes_.emplace(in->Name());
-      }
+  auto update_skip_set = [&](ir::Node* node) {
+    for (auto& in : node->inputs) {
+      if (in->IsVar() && in->Var() != nullptr) dup_nodes_.emplace(in->Name());
    }
-    if (node->Name() == "recv") {
-      for (auto& out : node->outputs) {
+    for (auto& out : node->outputs) {
+      if (out->IsVar() && out->Var() != nullptr)
        dup_nodes_.emplace(out->Name());
-      }
    }
+  };
+  for (auto& node : g->Nodes()) {
+    if (!node->IsOp()) continue;
+    if (node->Name() == "send") update_skip_set(node);
+    if (node->Name() == "recv") update_skip_set(node);
+    if (node->Name() == "prefetch") update_skip_set(node);
  }
 }


--- a/paddle/fluid/framework/details/inplace_op_pass.h
+++ b/paddle/fluid/framework/details/inplace_op_pass.h
@@ -56,7 +56,8 @@ class GraphView {
  std::map<ir::Node*, std::unordered_set<ir::Node*>> adj_list_;
 };

-typedef std::vector<std::pair<ir::Node*, ir::Node*>> SSANodePair;
+// swap pairs in sequence
+typedef std::vector<std::pair<ir::Node*, ir::Node*>> NodeSwapQueue;
 class InplacePass : public ir::Pass {
 public:
  InplacePass();
@@ -68,14 +69,14 @@ class InplacePass : public ir::Pass {
  void InitSSAGraphNodes() const;

 private:
-  const SSANodePair TryInplaceModifyVar(const std::string& var,
-                                        const std::string& cache_var,
-                                        const size_t& idx,
-                                        ir::Graph* graph) const;
+  const NodeSwapQueue TryInplaceModifyVar(const std::string& var,
+                                          const std::string& cache_var,
+                                          const size_t& idx,
+                                          ir::Graph* graph) const;

-  void CommitModify(const SSANodePair&, ir::Graph* graph) const;
+  void CommitModify(const NodeSwapQueue&, ir::Graph* graph) const;

-  void WithdrawModify(const SSANodePair& nodes, ir::Graph* graph) const;
+  void WithdrawModify(const NodeSwapQueue& nodes, ir::Graph* graph) const;

  void InplaceModifyDesc(const std::string& in_var, const std::string& out_var,
                         const size_t& idx) const;

--- a/paddle/fluid/framework/details/memory_optimize_helper.cc
+++ b/paddle/fluid/framework/details/memory_optimize_helper.cc
@@ -13,17 +13,114 @@
 // limitations under the License.

 #include "paddle/fluid/framework/details/memory_optimize_helper.h"
+#include <algorithm>
+#include <deque>
 #include <functional>
-#include <iostream>
+#include <iterator>
 #include <numeric>
 #include <sstream>
 #include <string>
+#include "paddle/fluid/framework/var_desc.h"
+#include "paddle/fluid/platform/cpu_info.h"
+
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/gpu_info.h"
+#endif  // PADDLE_WITH_CUDA

 namespace paddle {
 namespace framework {
 namespace details {
+using paddle::framework::VarDesc;
+
+std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph) {
+  PADDLE_ENFORCE(graph.Has(kAllOpDescs),
+                 "Graph has no attribute of kAllOpDescs.");
+  // 1. get op desc order
+  auto& op_descs = graph.Get<const std::vector<OpDesc*>>(kAllOpDescs);
+
+  // 2. topology sort order
+  auto nodes = graph.Nodes();
+  std::deque<ir::Node*> ops;
+  FilterVariables(nodes, [&](ir::Node* op) {
+    if (op->IsOp() && op->Op() != nullptr) {
+      ops.emplace_back(op);
+    }
+  });
+  std::unordered_map<ir::Node*, size_t> op_deps;
+  std::list<ir::Node*> ready_ops;
+  std::unordered_map<ir::Node*, std::unordered_set<ir::Node*>> pending_ops;
+
+  for (auto* op : ops) {
+    std::unordered_set<ir::Node*> preceding_op;
+    for (auto* in : op->inputs) {
+      if (in->inputs.empty()) continue;
+      PADDLE_ENFORCE(in->inputs.size() == 1 && in->inputs[0]->IsOp());
+      preceding_op.emplace(in->inputs[0]);
+      pending_ops[in->inputs[0]].emplace(op);
+    }
+    op_deps[op] = preceding_op.size();
+    if (preceding_op.empty()) {
+      ready_ops.emplace_back(op);
+    }
+  }
+
+  // 3. generated op list based desc order and the topology order
+  std::vector<ir::Node*> ret;
+  std::list<OpDesc*> op_descs_list(op_descs.begin(), op_descs.end());
+
+  auto update_by_found_node = [&](ir::Node* found_node) {
+    for (auto* pending_op : pending_ops[found_node]) {
+      if (--op_deps[pending_op] == 0) {
+        ready_ops.emplace_back(pending_op);
+      }
+    }
+    ready_ops.remove(found_node);
+    ret.emplace_back(found_node);
+  };
+
+  while (!ready_ops.empty()) {
+    bool all_of_ready_op_unmatched = true;
+    for (auto it = op_descs_list.begin(); it != op_descs_list.end();) {
+      auto op_desc = *it;
+      ir::Node* found_node = nullptr;
+      for (auto* op : ready_ops) {
+        if (IsSameDesc(op->Op(), op_desc)) {
+          found_node = op;
+          break;
+        }
+      }
+
+      // 3.1 op desc deleted by other pass
+      if (found_node == nullptr) {
+        ++it;
+        continue;
+      } else {
+        all_of_ready_op_unmatched = false;
+        it = op_descs_list.erase(it);
+      }
+      update_by_found_node(found_node);
+    }
+
+    // 3.2 op descs are added by other pass
+    // preceding op non empty means some new op descs are
+    // created, but not contained in return node list.
+    // these new op desc may depend on each other.
+    std::list<ir::Node*> prev_ready_ops(ready_ops);
+    if (all_of_ready_op_unmatched) {
+      for (auto op : prev_ready_ops) {
+        update_by_found_node(op);
+      }
+    }
+  }
+
+  PADDLE_ENFORCE(std::all_of(
+      op_deps.begin(), op_deps.end(),
+      [&](const std::pair<ir::Node*, size_t>& p) { return p.second == 0; }));
+
+  return ret;
+}

-size_t NodeSizeInBytes(const VarDesc& node) {
+size_t NodeSize(const VarDesc& node) {
  auto shape = node.GetShape();
  int size =
      std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>());
@@ -31,9 +128,9 @@ size_t NodeSizeInBytes(const VarDesc& node) {
  return type_size * std::abs(size);
 }

-size_t NodeSizeInBytes(ir::Node* n) {
+size_t NodeSize(ir::Node* n) {
  auto* desc = FindVarDescInBlock(n);
-  return NodeSizeInBytes(*desc);
+  return NodeSize(*desc);
 }

 std::string DebugStringImpl(VarDesc* var) {
@@ -59,7 +156,6 @@ std::string DebugStringImpl(VarDesc* var) {
 std::string DebugString(ir::Node* var) {
  return DebugStringImpl(FindVarDescInBlock(var));
 }
-// return DebugString(var->Var()); }

 // NOTE(dzh): based ir node, if a large node has been reused
 // by a small size node, then next time it appear in pool, it will
@@ -76,22 +172,26 @@ struct NodeComparator {
  bool operator()(ir::Node* lhs, ir::Node* rhs) const {
    auto* lhs_desc = FindVarDescInBlock(lhs);
    auto* rhs_desc = FindVarDescInBlock(rhs);
+    // match data type
+    if (lhs_desc->GetDataType() != rhs_desc->GetDataType()) {
+      return false;
+    }
+    // match shape
    auto lhs_shape = lhs_desc->GetShape();
    auto rhs_shape = rhs_desc->GetShape();
    if ((lhs_shape[0] == -1 && rhs_shape[0] == -1) ||
        (lhs_shape[0] != -1 && rhs_shape[0] != -1)) {
-      return NodeSizeInBytes(lhs) <= NodeSizeInBytes(rhs);
+      return NodeSize(lhs) <= NodeSize(rhs);
    } else {
      return false;
    }
  }
 };

-void OrderedNodeList::Insert(ir::Node* var, ir::Node* op) {
+void OrderedSet::Insert(ir::Node* var) {
  PADDLE_ENFORCE(var->IsVar() && !var->IsCtrlVar());
-  PADDLE_ENFORCE(op->IsOp());
  if (mark_table_.count(var->Name()) != 0) {
-    mark_table_[var->Name()]->second.insert(op);
+    mark_table_[var->Name()]->emplace_back(var);
    return;
  }

@@ -99,14 +199,15 @@ void OrderedNodeList::Insert(ir::Node* var, ir::Node* op) {
  auto var_shape = var_desc->GetShape();
  int batch_size = static_cast<int>(var_shape[0]);

-  NodeComparator compare_node;
+  NodeComparator functor;
  Iter it = nodes_.begin();
  while (it != nodes_.end()) {
-    auto* cache_desc = FindVarDescInBlock(it->first);
+    auto& prev = it->front();
+    auto* cache_desc = FindVarDescInBlock(prev);
    int cache_batch_size = cache_desc->GetShape()[0];
    if ((cache_batch_size == -1 && batch_size == -1) ||
        (cache_batch_size != -1 && batch_size != -1)) {
-      if (compare_node(it->first, var)) {
+      if (functor(prev, var)) {
        ++it;
      } else {
        break;
@@ -118,62 +219,127 @@ void OrderedNodeList::Insert(ir::Node* var, ir::Node* op) {
    }
  }

-  it =
-      nodes_.insert(it, std::make_pair(var, std::unordered_set<ir::Node*>{op}));
+  it = nodes_.insert(it, {var});
  mark_table_[var->Name()] = it;
 }

-int OrderedNodeList::GetIndex(ir::Node* var) {
+int OrderedSet::GetNodeIndexInPool(ir::Node* var) {
  return std::distance(nodes_.begin(), mark_table_[var->Name()]);
 }

-ir::Node* OrderedNodeList::NodeMatch(ir::Node* var) const {
+ir::Node* OrderedSet::FindBestFitNode(ir::Node* var) const {
  ir::Node* found_node = nullptr;
-  NodeComparator compare_node;
+  NodeComparator functor;

  for (auto it = nodes_.begin(); it != nodes_.end(); ++it) {
-    if (compare_node(var, it->first)) {
-      found_node = it->first;
+    auto& candidate = it->front();
+    if (functor(var, candidate)) {
+      found_node = candidate;
      break;
    }
  }
  return found_node;
 }

-void OrderedNodeList::Erase(ir::Node* var) { Erase(var->Name()); }
+ir::Node* OrderedSet::FindNextBestFitNode(ir::Node* var, ir::Node* prev) const {
+  ir::Node* found_node = nullptr;
+  NodeComparator functor;
+  auto it =
+      std::find_if(nodes_.begin(), nodes_.end(), [&](const NodeVector& v) {
+        if (v.front() == prev)
+          return true;
+        else
+          return false;
+      });
+  PADDLE_ENFORCE(it != nodes_.end(), "Not found previous in node list!");
+  for (it = std::next(it); it != nodes_.end(); ++it) {
+    auto& candidate = it->front();
+    if (functor(var, candidate)) {
+      found_node = candidate;
+      break;
+    }
+  }
+  return found_node;
+}
+
+bool OrderedSet::Has(ir::Node* var) const {
+  if (mark_table_.count(var->Name())) {
+    auto& node_in_samename = mark_table_.at(var->Name());
+    auto iter =
+        std::find_if(node_in_samename->begin(), node_in_samename->end(),
+                     [&](ir::Node* n) { return n->Name() == var->Name(); });
+    return iter != node_in_samename->end();
+  }
+  return false;
+}

-void OrderedNodeList::Erase(const std::string& var) {
+void OrderedSet::Erase(const std::string& var) {
  PADDLE_ENFORCE(mark_table_.count(var));
  nodes_.erase(mark_table_[var]);
  mark_table_.erase(var);
 }

-std::string OrderedNodeList::ToString() const {
+void OrderedSet::Erase(ir::Node* var) {
+  PADDLE_ENFORCE(var != nullptr);
+  Erase(var->Name());
+}
+
+std::string OrderedSet::ToString() const {
  std::stringstream ss;
  for (auto it = nodes_.begin(); it != nodes_.end(); ++it) {
-    ss << DebugString(it->first) << " ";
+    for (auto& node : *it) {
+      ss << DebugString(node) << " ";
+    }
  }
  return ss.str();
 }

 bool NodeCanReused(ir::Node* node) {
+  // valid the node is a var node
  if (node == nullptr || !node->IsVar() || node->IsCtrlVar()) return false;
-  // auto* desc = node->Var();
-  bool flag = NodeCanReused(*node->Var());
+
+  bool flag = true;
+  // op output force generated in cpu, can not be reused.
  for (auto* op : node->inputs) {
    if (op->Op()->HasAttr("force_cpu")) {
-      // op output force generated in cpu, can not be reused.
      flag &= framework::AttrReader(op->Op()->GetAttrMap())
                  .Get<bool>("force_cpu") == 0;
    }
  }
+  // var desc validation.
+  flag &= NodeCanReused(*node->Var());
  return flag;
 }

+int MinChunkSize() {
+  int size{0};
+#ifdef PADDLE_WITH_CUDA
+  size = platform::GpuMinChunkSize();
+#else
+  size = platform::CpuMinChunkSize();
+#endif  // PADDLE_WITH_CUDA
+  return size;
+}
+
 bool NodeCanReused(const VarDesc& node) {
  auto type = node.GetType();
-  if (node.Persistable() || type != proto::VarType::LOD_TENSOR ||
-      node.GetShape().empty()) {
+  // only these types holds bulk of gpu memory
+  if (!(type == proto::VarType::LOD_TENSOR ||
+        type == proto::VarType::SELECTED_ROWS ||
+        type == proto::VarType::LOD_TENSOR_ARRAY)) {
+    return false;
+  }
+  // persistable variable is parameter
+  if (node.Persistable()) {
+    return false;
+  }
+  // shape < min_chunk_size is meaningless.
+  // further more, fetched loss always has size = 1
+  // which should not be reused.
+  auto shape = node.GetShape();
+  int size = std::abs(
+      std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>()));
+  if (shape.empty() || size < MinChunkSize()) {
    return false;
  }
  // vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad
@@ -193,6 +359,176 @@ bool OpHasSubBlock(OpDesc* desc) {
  return false;
 }

+ControlFlowGraph::ControlFlowGraph(const ir::Graph& graph) {
+  ops_ = SortOpLikeDescOrder(graph);
+  ConnectNodes();
+}
+
+void ControlFlowGraph::BuildCFGGraph() {
+  // FIXME(dzh): same effect with ConnectNodes, but use the control
+  // link to build dependency graph, it goes wrong in transformer.
+  for (ir::Node* op : ops_) {
+    for (auto& input_var : op->inputs) {
+      if (!input_var->inputs.empty()) {
+        PADDLE_ENFORCE(
+            input_var->inputs.size() == 1 && input_var->inputs[0]->IsOp(),
+            "Preceding Op Node of Var Node must be unique");
+        auto* pred_op = input_var->inputs[0];
+        if (pred_op->Op() != nullptr) {
+          predecessors_[op].insert(pred_op);
+          successors_[pred_op].insert(op);
+        }
+      }
+      if (input_var->IsVar() && !input_var->IsCtrlVar()) {
+        uses_[op].insert(input_var->Name());
+      }
+    }
+    for (auto& output_var : op->outputs) {
+      // output var may be used by many op
+      for (auto* succ_op : output_var->outputs) {
+        if (succ_op->Op() != nullptr) {
+          successors_[op].insert(succ_op);
+          predecessors_[succ_op].insert(op);
+        }
+      }
+      if (output_var->IsVar() && !output_var->IsCtrlVar()) {
+        defs_[op].insert(output_var->Name());
+      }
+    }
+  }
+}
+
+void ControlFlowGraph::ConnectNodes() {
+  for (size_t i = 0; i < ops_.size(); ++i) {
+    auto& op = ops_[i];
+    try {
+      auto& next_op = ops_.at(i + 1);
+      successors_[op].insert(next_op);
+      predecessors_[next_op].insert(op);
+    } catch (...) {
+      // do nothing
+    }
+
+    FilterVariables(op->inputs,
+                    [&](ir::Node* var) { uses_[op].emplace(var->Name()); });
+
+    FilterVariables(op->outputs,
+                    [&](ir::Node* var) { defs_[op].emplace(var->Name()); });
+  }
+}
+
+void ControlFlowGraph::LiveVariableAnalysis() {
+  // NOTE(dzh): variable liveless analysis (a.k.a reversed_ops algorithm)
+  // compute the liveness of for each variable though reversed_ops algorithm.
+  // It iterates the operators from end to begin, compute the live in/live out
+  // variable set for each op, then the diff between in/out will be used for
+  // the variable reuse. For detail refer to
+  // http://www.cs.cornell.edu/courses/cs4120/2013fa/lectures/lec26-fa13.pdf
+  std::list<ir::Node*> work_list(ops_.rbegin(), ops_.rend());
+  while (!work_list.empty()) {
+    ir::Node* op = work_list.front();
+    work_list.pop_front();
+    // get the live_in calculated before. Empty if first.
+    auto prev_live_in = std::move(live_in_[op]);
+    for (auto& s : successors_[op]) {
+      for (auto& var : live_in_[s]) {
+        live_out_[op].insert(var);
+      }
+    }
+    for (auto& var : uses_[op]) {
+      live_in_[op].insert(var);
+    }
+    for (auto& var : live_out_[op]) {
+      live_in_[op].insert(var);
+    }
+    for (auto& var : defs_[op]) {
+      live_in_[op].erase(var);
+    }
+
+    // If the live_in is not changed, then the liveness analysis of
+    // predecessors is completed.
+    //
+    // Otherwise, recalculate the predecessors liveness
+    if (live_in_[op] != prev_live_in) {
+      for (auto& pre : predecessors_[op]) {
+        work_list.push_back(pre);
+      }
+    }
+  }
+}
+
+void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node,
+                                           const std::string& new_node,
+                                           int begin_idx) {
+  // update graph from begin idx to the end
+  for (size_t i = begin_idx; i != ops_.size(); ++i) {
+    auto* op = ops_[i];
+    if (uses_[op].find(old_node) != uses_[op].end()) {
+      uses_[op].erase(old_node);
+      uses_[op].insert(new_node);
+    }
+    if (defs_[op].find(old_node) != defs_[op].end()) {
+      defs_[op].erase(old_node);
+      defs_[op].insert(new_node);
+    }
+    if (live_in_[op].find(old_node) != live_in_[op].end()) {
+      live_in_[op].erase(old_node);
+      live_in_[op].insert(new_node);
+    }
+    if (live_out_[op].find(old_node) != live_out_[op].end()) {
+      live_out_[op].erase(old_node);
+      live_out_[op].insert(new_node);
+    }
+  }
+}
+
+const std::set<std::string> ControlFlowGraph::LiveIn(ir::Node* op) const {
+  auto it = live_in_.find(op);
+  PADDLE_ENFORCE(
+      it != live_in_.end(),
+      string::Sprintf("Expect %s in live_in, but Not Found.", op->Name()));
+  return it->second;
+}
+
+const std::set<std::string> ControlFlowGraph::LiveOut(ir::Node* op) const {
+  auto it = live_out_.find(op);
+  PADDLE_ENFORCE(
+      it != live_out_.end(),
+      string::Sprintf("Expect %s in live_out, but Not Found.", op->Name()));
+  return it->second;
+}
+
+const std::set<std::string> ControlFlowGraph::Use(ir::Node* op) const {
+  auto it = uses_.find(op);
+  PADDLE_ENFORCE(
+      it != uses_.end(),
+      string::Sprintf("Expect %s in live_out, but Not Found.", op->Name()));
+  return it->second;
+}
+
+const std::vector<ir::Node*> ControlFlowGraph::Ops() const { return ops_; }
+
+std::vector<ir::Node*>& ControlFlowGraph::Ops() { return ops_; }
+
+ir::Node* ControlFlowGraph::GetNodeByName(const std::string& name,
+                                          ir::Node* op) const {
+  // in ssa-graph, different version nodes have same name,
+  // this function get the latest version var before target op
+  // It may return nullptr, such as data node.
+  ir::Node* found_node = nullptr;
+  for (auto* node : ops_) {
+    if (node == op) break;
+    for (auto& output : node->outputs) {
+      PADDLE_ENFORCE((output != nullptr && output->IsVar()),
+                     "Output is empty!");
+      if (output->Var() && output->Name() == name) {
+        found_node = output;
+      }
+    }
+  }
+  return found_node;
+}
+
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/details/memory_optimize_helper.h
+++ b/paddle/fluid/framework/details/memory_optimize_helper.h
@@ -17,6 +17,8 @@
 #include <iostream>
 #include <iterator>
 #include <list>
+#include <map>
+#include <set>
 #include <string>
 #include <utility>
 #include <vector>
@@ -27,41 +29,43 @@ namespace paddle {
 namespace framework {
 namespace details {

-constexpr char kFetchedVars[] = "fetched_vars";
-constexpr char kGraphNodePool[] = "graph_node_pool";
+constexpr char kAllOpDescs[] = "all_op_descs";

-// NOTE(dzh): Variable and the operators use the var.
-// for early delete pass.
-// Because analysis var pass build base on ir::Node, which maybe released
-// or modified between passes, so we use OpDesc* to mark ops.
-using GraphNodePool = std::vector<
-    std::pair<std::string /*var node*/, std::unordered_set<OpDesc*> /* ops */>>;
+std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph);

-// NOTE(dzh): by default, it sort node in ascend order(by node bytes size).
-// in fluid, -1 means the batch_size is determined in runtime.
-// the node batch_size equal -1 always ranking in the front than the node not.
+// NOTE(dzh): A ordered set for node reuse in memory optimize.
+// the orderedset sort node in ascend order(by node bytes size).
+// in fluid, -1 means the batch_size, which is determined in runtime.
+// So the reuse happens between nodes who's batch_size both are -1
+// simultaneously or not.
+//
+// sort rule:
+// rule 0 : smaller node ranking in front.
+// rule 1 : batch_size equal -1 ranking in the front than the node not.
+//
 // For example,
 // node0[-1, 1] node1[-1, 1, 1], node2[1,1], node3[1,1024], ..
-// O(1) insert, delete
-class OrderedNodeList {
- public:
-  using NodePair = std::pair<ir::Node*, std::unordered_set<ir::Node*>>;
-  using Iter = typename std::list<NodePair>::iterator;
-  using ConstIter = typename std::list<NodePair>::const_iterator;

-  void Insert(ir::Node* var, ir::Node* op);
+class OrderedSet {
+ public:
+  // nodes with same name exists in pool.
+  using NodeVector = std::vector<ir::Node*>;
+  using Iter = typename std::list<NodeVector>::iterator;
+  using ConstIter = typename std::list<NodeVector>::const_iterator;

+  void Insert(ir::Node* var);
  void Erase(ir::Node* var);
-
  void Erase(const std::string& var);
-
-  bool Has(ir::Node* var) { return mark_table_.count(var->Name()); }
-
-  bool Has(const std::string& var) { return mark_table_.count(var); }
-
-  ir::Node* NodeMatch(ir::Node* var) const;
+  bool Has(ir::Node* var) const;
+  void Clear() {
+    mark_table_.clear();
+    nodes_.clear();
+  }
+  // find the bestfit shape node block with var.
+  ir::Node* FindBestFitNode(ir::Node* var) const;
+  ir::Node* FindNextBestFitNode(ir::Node* var, ir::Node* prev) const;
  // map store non-const iterator, can not promise const
-  int GetIndex(ir::Node* var);
+  int GetNodeIndexInPool(ir::Node* var);
  // pool all node to string
  std::string ToString() const;

@@ -69,18 +73,54 @@ class OrderedNodeList {
  Iter end() { return nodes_.end(); }
  ConstIter begin() const { return nodes_.begin(); }
  ConstIter end() const { return nodes_.end(); }
-  size_t size() const { return nodes_.size(); }

-  void Clear() {
-    mark_table_.clear();
-    nodes_.clear();
-  }
+  size_t size() const { return nodes_.size(); }

 private:
  // for searching.
  std::unordered_map<std::string, Iter> mark_table_;
-  // node swap pairs. var -> ops dep var
-  std::list<NodePair> nodes_;
+  // node pool
+  std::list<NodeVector> nodes_;
+};
+
+class ControlFlowGraph {
+ public:
+  ControlFlowGraph() = default;
+  // IR Graph
+  explicit ControlFlowGraph(const ir::Graph& graph);
+
+  void LiveVariableAnalysis();
+
+  void RenameVarInCFGGraph(const std::string& old_node,
+                           const std::string& new_node, int begin_idx);
+
+  const std::set<std::string> LiveIn(ir::Node* op) const;
+  const std::set<std::string> LiveOut(ir::Node* op) const;
+  const std::set<std::string> Use(ir::Node* op) const;
+  const std::vector<ir::Node*> Ops() const;
+  std::vector<ir::Node*>& Ops();
+
+  // for ssa-graph nodes
+  ir::Node* GetNodeByName(const std::string& name, ir::Node* op) const;
+
+ private:
+  void BuildCFGGraph();
+  void ConnectNodes();
+
+  using NodeListMap = std::unordered_map<ir::Node*, std::set<ir::Node*>>;
+  using VarSetMap = std::map<ir::Node*, std::set<std::string>>;
+  // successors ops use the output variables.
+  NodeListMap successors_;
+  // predecessors ops generated input variables.
+  NodeListMap predecessors_;
+  // variables lived before run current op.
+  VarSetMap live_in_;
+  // variables lived after run current op.
+  VarSetMap live_out_;
+  VarSetMap uses_;  // op inputs
+  VarSetMap defs_;  // op outputs
+
+  std::vector<ir::Node*> ops_;  // op sequence by topology sort
 };

 // valid a tensor can be reuse or not
@@ -93,15 +133,24 @@ bool NodeCanReused(const VarDesc& node);
 bool OpHasSubBlock(OpDesc* desc);

 // node memory size in bytes
-size_t NodeSizeInBytes(ir::Node* n);
+size_t NodeSize(ir::Node* n);

 // node memory size in bytes
-size_t NodeSizeInBytes(const VarDesc&);
+size_t NodeSize(const VarDesc&);

 std::string DebugString(ir::Node* var);

+// NOTE(dzhwinter)
+// after node reuse, the replaced node shape is
+// different with its VarDesc. So need to find the
+// correct VarDesc in Block.
 VarDesc* FindVarDescInBlock(ir::Node* n);

+static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) {
+  return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() &&
+         op1->Outputs() == op2->Outputs();
+}
+
 template <typename Container, typename Callback>
 class FilterVariableImpl {
 public:

--- a/paddle/fluid/framework/details/memory_optimize_helper_test.cc
+++ b/paddle/fluid/framework/details/memory_optimize_helper_test.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/details/memory_optimize_helper.h"
 #include <algorithm>
 #include <iostream>
+#include <iterator>
 #include <memory>
 #include <sstream>
 #include <string>
@@ -22,13 +23,19 @@
 #include <vector>
 #include "glog/logging.h"
 #include "gtest/gtest.h"
+#include "paddle/fluid/framework/details/graph_test_base.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/program_desc.h"

 namespace paddle {
 namespace framework {
 namespace details {

-TEST(OrderedNodeList, Normal) {
-  OrderedNodeList pool;
+TEST(OrderedSet, Normal) {
+  OrderedSet pool;
  std::vector<std::unique_ptr<ir::Node>> nodes;

  // clang-format off
@@ -56,8 +63,15 @@ TEST(OrderedNodeList, Normal) {
    nodes.emplace_back(std::move(node));
  }

+  // Insert
  for (auto& node : nodes) {
-    pool.Insert(node.get(), op.get());
+    pool.Insert(node.get());
+  }
+
+  // Has/size
+  ASSERT_EQ(pool.size(), shapes.size());
+  for (auto& node : nodes) {
+    ASSERT_TRUE(pool.Has(node.get()));
  }

  // assert its order and interface.
@@ -66,14 +80,14 @@ TEST(OrderedNodeList, Normal) {
  std::cout << pool.ToString() << std::endl;

  ASSERT_EQ(pool.size(), static_cast<size_t>(COUNT - 1));
-  ASSERT_EQ(pool.GetIndex(nodes.back().get()), 0);
+  ASSERT_EQ(pool.GetNodeIndexInPool(nodes.back().get()), 0);

  {
    auto v1 = block_desc->Var("11");
    v1->SetShape({-1, 256, 56, 56});
    std::unique_ptr<ir::Node> node1 = ir::CreateNodeForTest(v1);
    node1->inputs.emplace_back(op.get());
-    auto* cache = pool.NodeMatch(node1.get());
+    auto* cache = pool.FindBestFitNode(node1.get());
    ASSERT_EQ(cache, nullptr);
  }
  {
@@ -81,16 +95,447 @@ TEST(OrderedNodeList, Normal) {
    v2->SetShape({-1, 2, 5});
    std::unique_ptr<ir::Node> node1 = ir::CreateNodeForTest(v2);
    node1->inputs.emplace_back(op.get());
-    auto* cache = pool.NodeMatch(node1.get());
-    ASSERT_EQ(pool.GetIndex(cache), 2);  // match 6:[-1,2,5]
+    auto* cache = pool.FindBestFitNode(node1.get());
+    ASSERT_EQ(pool.GetNodeIndexInPool(cache), 2);  // match 6:[-1,2,5]
  }
  {
    auto v3 = block_desc->Var("13");
    v3->SetShape({2, 5});
    std::unique_ptr<ir::Node> node1 = ir::CreateNodeForTest(v3);
    node1->inputs.emplace_back(op.get());
-    auto* cache = pool.NodeMatch(node1.get());
-    ASSERT_EQ(pool.GetIndex(cache), 5);  // match  4:[5,2]
+    auto* cache = pool.FindBestFitNode(node1.get());
+    ASSERT_EQ(pool.GetNodeIndexInPool(cache), 5);  // match  4:[5,2]
+  }
+}
+
+TEST(OrderedSet, FindBestFitNode) {
+  OrderedSet pool;
+  std::vector<std::unique_ptr<ir::Node>> nodes;
+  ProgramDesc prog;
+  BlockDesc* block_desc = prog.MutableBlock(0);
+  auto* op_desc = block_desc->AppendOp();
+  op_desc->SetType("dummy");
+  std::unique_ptr<ir::Node> op = ir::CreateNodeForTest(op_desc);
+
+  {
+    auto desc = block_desc->Var("a");
+    desc->SetShape({128, 128});
+    std::unique_ptr<ir::Node> node = ir::CreateNodeForTest(desc);
+    node->inputs.emplace_back(op.get());
+    nodes.emplace_back(std::move(node));
+  }
+  {
+    auto desc = block_desc->Var("b");
+    desc->SetShape({128, 129});
+    std::unique_ptr<ir::Node> node = ir::CreateNodeForTest(desc);
+    node->inputs.emplace_back(op.get());
+    nodes.emplace_back(std::move(node));
+  }
+  {
+    auto desc = block_desc->Var("c");
+    desc->SetShape({128, 128});
+    std::unique_ptr<ir::Node> node = ir::CreateNodeForTest(desc);
+    node->inputs.emplace_back(op.get());
+    nodes.emplace_back(std::move(node));
+  }
+
+  for (auto& node : nodes) {
+    pool.Insert(node.get());
+  }
+
+  // FindNextBestFitNode
+  auto* n = nodes[0].get();
+  auto* cache = pool.FindBestFitNode(n);
+  PADDLE_ENFORCE(cache->Name() == "a");
+  cache = pool.FindNextBestFitNode(n, cache);
+  PADDLE_ENFORCE(cache->Name() == "c");
+  cache = pool.FindNextBestFitNode(n, cache);
+  PADDLE_ENFORCE(cache->Name() == "b");
+}
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_OPERATOR(sum, paddle::framework::DummyOp,
+                  paddle::framework::SumOpMaker,
+                  paddle::framework::DummyVarTypeInference);
+REGISTER_OPERATOR(assign, paddle::framework::DummyOp,
+                  paddle::framework::AssignOpMaker,
+                  paddle::framework::DummyVarTypeInference);
+REGISTER_OPERATOR(dummy, paddle::framework::DummyOp,
+                  paddle::framework::SumOpMaker,
+                  paddle::framework::DummyVarTypeInference);
+/*
+  https://en.wikipedia.org/wiki/Live_variable_analysis
+  Create a customed classical dependency graph, left row is the instruction
+  number.
+  1. a = 1
+  2. b = a
+  3. c = a
+  4. d = b + c
+  5. e = d
+
+  a--------+
+  |        |
+  b        c
+  |        |
+  d--------+
+  |
+  e
+  Then analysis these variable's liveness range
+ */
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+inline static ProgramDesc FillProgramDesc() {
+  ProgramDesc prog;
+  prog.MutableBlock(0)->Var("a")->SetType(proto::VarType::LOD_TENSOR);
+  prog.MutableBlock(0)->Var("b")->SetType(proto::VarType::LOD_TENSOR);
+  prog.MutableBlock(0)->Var("c")->SetType(proto::VarType::LOD_TENSOR);
+  prog.MutableBlock(0)->Var("d")->SetType(proto::VarType::LOD_TENSOR);
+  prog.MutableBlock(0)->Var("e")->SetType(proto::VarType::LOD_TENSOR);
+  {
+    auto* op = prog.MutableBlock(0)->AppendOp();
+    op->SetType("assign");
+    op->SetInput("X", {"a"});
+    op->SetOutput("Out", {"b"});
+  }
+  {
+    auto* op = prog.MutableBlock(0)->AppendOp();
+    op->SetType("assign");
+    op->SetInput("X", {"a"});
+    op->SetOutput("Out", {"c"});
+  }
+  {
+    auto* op = prog.MutableBlock(0)->AppendOp();
+    op->SetType("sum");
+    op->SetInput("X", {"b", "c"});
+    op->SetOutput("Out", {"d"});
+  }
+  {
+    auto* op = prog.MutableBlock(0)->AppendOp();
+    op->SetType("assign");
+    op->SetInput("X", {"d"});
+    op->SetOutput("Out", {"e"});
+  }
+  return prog;
+}
+
+TEST(CFGGraph, IRGraph) {
+  // prepare ir graph
+  auto prog = FillProgramDesc();
+  ir::Graph graph(prog);
+  const std::vector<OpDesc*>* all_op_descs =
+      new std::vector<OpDesc*>(prog.Block(0).AllOps());
+  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
+
+  ControlFlowGraph cfg(graph);
+  cfg.LiveVariableAnalysis();
+
+  // test assign op
+  ASSERT_TRUE((std::set<std::string>{"a"} == cfg.LiveIn(cfg.Ops()[0])));
+  ASSERT_TRUE((std::set<std::string>{"a", "b"} == cfg.LiveOut(cfg.Ops()[0])));
+
+  // test assign op
+  ASSERT_TRUE((std::set<std::string>{"a", "b"} == cfg.LiveIn(cfg.Ops()[1])));
+  ASSERT_TRUE((std::set<std::string>{"b", "c"} == cfg.LiveOut(cfg.Ops()[1])));
+
+  // test sum op
+  ASSERT_TRUE((std::set<std::string>{"b", "c"} == cfg.LiveIn(cfg.Ops()[2])));
+  ASSERT_TRUE((std::set<std::string>{"d"} == cfg.LiveOut(cfg.Ops()[2])));
+
+  // test assign op
+  ASSERT_TRUE((std::set<std::string>{"d"} == cfg.LiveIn(cfg.Ops()[3])));
+  ASSERT_TRUE((std::set<std::string>{} == cfg.LiveOut(cfg.Ops()[3])));
+}
+
+// 1. normal test
+TEST(SortOpLikeDescOrder, NormalTest) {
+  auto prog = FillProgramDesc();
+  ir::Graph graph(prog);
+  const std::vector<OpDesc*>* all_op_descs =
+      new std::vector<OpDesc*>(prog.Block(0).AllOps());
+  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
+
+  auto nodes = SortOpLikeDescOrder(graph);
+  auto op_descs = prog.Block(0).AllOps();
+  for (size_t i = 0; i < nodes.size(); ++i) {
+    auto node = nodes[i];
+    auto op_desc = op_descs[i];
+    ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
+  }
+}
+
+// 2. remove some op_desc
+TEST(SortOpLikeDescOrder, RemoveOpDesc) {
+  auto prog = FillProgramDesc();
+  ir::Graph graph(prog);
+  const std::vector<OpDesc*>* all_op_descs =
+      new std::vector<OpDesc*>(prog.Block(0).AllOps());
+  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
+  auto nodes = graph.Nodes();
+  auto op_descs = prog.Block(0).AllOps();
+  ir::Node* found_node = nullptr;
+  for (auto node : nodes) {
+    if (node->IsOp() && node->outputs.back()->Name() == "e") {
+      found_node = node;
+      break;
+    }
+  }
+  PADDLE_ENFORCE(found_node != nullptr);
+  for (auto it = op_descs.begin(); it != op_descs.end();) {
+    if (IsSameDesc(*it, found_node->Op())) {
+      it = op_descs.erase(it);
+    } else {
+      ++it;
+    }
+  }
+
+  auto find_node_in_graph = [&](std::string s) {
+    ir::Node* ret = nullptr;
+    for (auto n : graph.Nodes()) {
+      if (n->Name() == s) {
+        ret = n;
+        break;
+      }
+    }
+    PADDLE_ENFORCE(ret != nullptr);
+    return ret;
+  };
+
+  ir::Node* e = find_node_in_graph("e");
+  ir::Node* d = find_node_in_graph("d");
+  std::remove(d->outputs.begin(), d->outputs.end(), found_node);
+  graph.RemoveNode(found_node);
+  graph.RemoveNode(e);
+
+  // other node keeps the same order
+  auto remain_nodes = SortOpLikeDescOrder(graph);
+  for (size_t i = 0; i < remain_nodes.size(); ++i) {
+    auto node = remain_nodes[i];
+    auto op_desc = op_descs[i];
+    ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
+  }
+}
+
+// 3. add some op_desc
+TEST(SortOpLikeDescOrder, AddOpDesc) {
+  auto prog = FillProgramDesc();
+  const std::vector<OpDesc*>* all_op_descs =
+      new std::vector<OpDesc*>(prog.Block(0).AllOps());
+  ir::Graph graph(prog);
+
+  auto find_node_in_graph = [&](std::string s) {
+    ir::Node* ret = nullptr;
+    for (auto n : graph.Nodes()) {
+      if (n->Name() == s) {
+        ret = n;
+        break;
+      }
+    }
+    PADDLE_ENFORCE(ret != nullptr);
+    return ret;
+  };
+
+  // cached desc different with real one
+  // mimic the intermidiete pass modify the programdesc.
+  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
+
+  auto op_descs = prog.Block(0).AllOps();
+
+  auto op = prog.MutableBlock(0)->AppendOp();
+  prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
+  op->SetType("sum");
+  op->SetInput("X", {"b", "c"});
+  op->SetOutput("Out", {"d1"});
+  ir::Node* node = graph.CreateOpNode(op);
+  ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1"));
+  ir::Node* b = find_node_in_graph("b");
+  ir::Node* c = find_node_in_graph("c");
+  node->outputs.emplace_back(d1);
+  node->inputs.emplace_back(b);
+  node->inputs.emplace_back(c);
+  d1->inputs.emplace_back(node);
+  b->outputs.emplace_back(node);
+  c->outputs.emplace_back(node);
+  op_descs.insert(op_descs.begin() + 4, op);
+
+  auto nodes = SortOpLikeDescOrder(graph);
+
+  for (size_t i = 0; i < nodes.size(); ++i) {
+    auto node = nodes[i];
+    auto op_desc = op_descs[i];
+    ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
+  }
+}
+
+// 4. add and delete some op_desc
+TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) {
+  auto prog = FillProgramDesc();
+  ir::Graph graph(prog);
+  const std::vector<OpDesc*>* all_op_descs =
+      new std::vector<OpDesc*>(prog.Block(0).AllOps());
+  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
+
+  auto find_node_in_graph = [&](std::string s) {
+    ir::Node* ret = nullptr;
+    for (auto n : graph.Nodes()) {
+      if (n->Name() == s) {
+        ret = n;
+        break;
+      }
+    }
+    PADDLE_ENFORCE(ret != nullptr);
+    return ret;
+  };
+
+  // remove sum node
+  auto op_descs = prog.Block(0).AllOps();
+  ir::Node* found_node = nullptr;
+  auto nodes = graph.Nodes();
+  for (auto node : nodes) {
+    if (node->Name() == "sum") {
+      found_node = node;
+      break;
+    }
+  }
+  PADDLE_ENFORCE(found_node != nullptr);
+  for (auto it = op_descs.begin(); it != op_descs.end();) {
+    if (IsSameDesc(*it, found_node->Op())) {
+      it = op_descs.erase(it);
+    } else {
+      ++it;
+    }
+  }
+  {
+    ir::Node* d = find_node_in_graph("d");
+    ir::Node* c = find_node_in_graph("c");
+    ir::Node* e = find_node_in_graph("e");
+    std::remove(d->outputs.begin(), d->outputs.end(), found_node);
+    std::remove(c->outputs.begin(), c->outputs.end(), found_node);
+    ir::Node* pending_op = found_node->outputs[0]->outputs[0];
+    graph.RemoveNode(e);
+    graph.RemoveNode(pending_op);
+    graph.RemoveNode(found_node);
+  }
+
+  // add node
+  auto op = prog.MutableBlock(0)->AppendOp();
+  prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
+  op->SetType("sum");
+  op->SetInput("X", {"b", "c"});
+  op->SetOutput("Out", {"d1"});
+  {
+    ir::Node* node = graph.CreateOpNode(op);
+    ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1"));
+    ir::Node* b = find_node_in_graph("b");
+    ir::Node* c = find_node_in_graph("c");
+    node->outputs.emplace_back(d1);
+    node->inputs.emplace_back(b);
+    node->inputs.emplace_back(c);
+    b->outputs.emplace_back(node);
+    c->outputs.emplace_back(node);
+  }
+  op_descs.insert(op_descs.begin() + 2, op);
+
+  // check the order
+  auto mynodes = SortOpLikeDescOrder(graph);
+  for (size_t i = 0; i < mynodes.size(); ++i) {
+    auto node = mynodes[i];
+    auto op_desc = op_descs[i];
+    ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
+  }
+}
+
+// 5. add and replace some op_desc inplace.
+TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) {
+  auto prog = FillProgramDesc();
+  ir::Graph graph(prog);
+  const std::vector<OpDesc*>* all_op_descs =
+      new std::vector<OpDesc*>(prog.Block(0).AllOps());
+  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
+
+  auto find_node_in_graph = [&](std::string s) {
+    ir::Node* ret = nullptr;
+    for (auto n : graph.Nodes()) {
+      if (n->Name() == s) {
+        ret = n;
+        break;
+      }
+    }
+    PADDLE_ENFORCE(ret != nullptr);
+    return ret;
+  };
+
+  auto op_descs = prog.Block(0).AllOps();
+  // add node
+  auto op = prog.MutableBlock(0)->AppendOp();
+  prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
+  op->SetType("sum");
+  op->SetInput("X", {"b", "c"});
+  op->SetOutput("Out", {"d1"});
+  {
+    ir::Node* node = graph.CreateOpNode(op);
+    ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1"));
+    ir::Node* b = find_node_in_graph("b");
+    ir::Node* c = find_node_in_graph("c");
+    node->outputs.emplace_back(d1);
+    node->inputs.emplace_back(b);
+    node->inputs.emplace_back(c);
+    d1->inputs.emplace_back(node);
+    b->outputs.emplace_back(node);
+    c->outputs.emplace_back(node);
+  }
+
+  op_descs.emplace_back(op);
+
+  // replace op_desc inplace
+  auto nodes = graph.Nodes();
+  ir::Node* found_node = nullptr;
+  for (auto node : nodes) {
+    if (node->IsOp() && node->Op() && node->Name() == "assign") {
+      if (node->outputs.size() == 1 && node->outputs[0]->Name() == "e") {
+        found_node = node;
+        break;
+      }
+    }
+  }
+  {
+    ir::Node* d = find_node_in_graph("d");
+    ir::Node* e = find_node_in_graph("e");
+    std::remove(d->outputs.begin(), d->outputs.end(), found_node);
+    std::remove(e->inputs.begin(), e->inputs.end(), found_node);
+    graph.RemoveNode(found_node);
+  }
+  op_descs.erase(op_descs.begin() + 3);
+
+  auto replace_op = prog.MutableBlock(0)->AppendOp();
+  replace_op->SetType("sum");
+  replace_op->SetInput("X", {"d", "d1"});
+  replace_op->SetOutput("Out", {"e"});
+  {
+    ir::Node* sum2 = graph.CreateOpNode(replace_op);
+    ir::Node* e = find_node_in_graph("e");
+    ir::Node* d = find_node_in_graph("d");
+    ir::Node* d1 = find_node_in_graph("d1");
+    sum2->inputs.emplace_back(d);
+    sum2->inputs.emplace_back(d1);
+    sum2->outputs.emplace_back(e);
+    e->inputs.emplace_back(sum2);
+    d->outputs.emplace_back(sum2);
+    d1->outputs.emplace_back(sum2);
+  }
+
+  op_descs.emplace_back(replace_op);
+  // compare op order
+  auto graph_nodes = SortOpLikeDescOrder(graph);
+  for (size_t i = 0; i < graph_nodes.size(); ++i) {
+    auto node = graph_nodes[i];
+    auto op_desc = op_descs[i];
+    ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
  }
 }


--- a/paddle/fluid/framework/details/memory_optimize_pass.cc
+++ b/paddle/fluid/framework/details/memory_optimize_pass.cc
@@ -43,16 +43,10 @@ namespace paddle {
 namespace framework {
 namespace details {

-static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) {
-  return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() &&
-         op1->Outputs() == op2->Outputs();
-}
-
 std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  auto nodes = graph->Nodes();
-  auto subblock_vars = GetSubBlockVars(nodes);
-  skip_set_.insert(subblock_vars.begin(), subblock_vars.end());
+  CollectSkipVarsSet(nodes);

  cfg_.reset(new details::ControlFlowGraph(*graph));
  cfg_->LiveVariableAnalysis();
@@ -75,82 +69,67 @@ std::unique_ptr<ir::Graph> MemoryOptimizePass::ApplyImpl(
    }

    for (auto& var : op->outputs) {
-      if (!NodeCanReused(var) || cfg_->Use(op).count(var->Name()) == 0 ||
-          skip_set_.count(var->Name()))
+      if (var->IsVar() && !var->IsCtrlVar() && skip_set_.count(var->Name())) {
+        VLOG(3) << "Skip set contains variable of " << var->Name()
+                << "disable reuse on it. skipped";
        continue;
-      ir::Node* cache = pool_.NodeMatch(var);
-
-      if (var->Name() == FLAGS_memory_optimize_debug) {
-        VLOG(3) << "start match var " << DebugString(var) << " of op "
-                << op->Name();
-        VLOG(3) << pool_.ToString();
-        VLOG(3) << "matched in pool : "
-                << ((cache == nullptr) ? "False" : "True");
      }
+      if (NodeCanReused(var) && cfg_->Use(op).count(var->Name()) == 0) {
+        ir::Node* cache = pool_.FindBestFitNode(var);
+        while (cache != nullptr && var->Name() == cache->Name()) {
+          VLOG(3) << "The same cache variable is cascade reused. "
+                  << cache->Name() << " is re-filled to the pool after "
+                  << "the reused op is finished. Current op can not "
+                  << "replace it again. Skip this candidate.";
+          cache = pool_.FindNextBestFitNode(var, cache);
+        }
+        if (var->Name() == FLAGS_memory_optimize_debug) {
+          VLOG(3) << "start match var " << DebugString(var) << " of op "
+                  << op->Name();
+          VLOG(3) << pool_.ToString();
+          VLOG(3) << "matched in pool : "
+                  << ((cache == nullptr) ? "False" : "True");
+        }

-      if (cache == nullptr) continue;
-      if (var->Name() == cache->Name()) {
-        VLOG(3) << "The same cache variable is cascade reused." << var->Name()
-                << " is re-filled to the pool after"
-                << "the reused op is finished. Current op can not "
-                << "replace it again. Skip this candidate.";
-        continue;
-
-        int node_idx_in_pool = pool_.GetIndex(cache);
-        VLOG(3) << string::Sprintf(
-            "!!! %s,  %s => %s, cache idx %d, pool size %d",
-            std::to_string(reuse_id++), DebugString(var), DebugString(cache),
-            node_idx_in_pool, static_cast<int>(pool_.size()));
-        // update CFG Graph on the fly.
-        // reused var maybe re-fill into the pool
-        cfg_->RenameVarInCFGGraph(var->Name(), cache->Name(), idx);
-        // NOTE(dzhwinter): we need to both update the ProgramDesc
-        // and IR Graph. because op_desc/var_desc is used in CreateOp,
-        // CreateVar when running happens. But IR Graph
-        // define the dependence relationship between nodes.
-        RenameVarInGraphDesc(var->Name(), cache->Name(), idx);
-        RenameVarInGraphNode(var->Name(), cache->Name(), idx, graph.get());
-
-        pool_.Erase(cache);
-      }
-      // fill the pool
-      std::unordered_set<std::string> unlived_vars;
-      for (auto var : cfg_->LiveIn(op)) {
-        if (cfg_->LiveOut(op).count(var) == 0) {
-          unlived_vars.emplace(var);
+        if (cache != nullptr) {
+          int node_idx_in_pool = pool_.GetNodeIndexInPool(cache);
+          VLOG(3) << string::Sprintf(
+              "!!! %s,  %s => %s, cache idx %d, pool size %d",
+              std::to_string(reuse_id++), DebugString(var), DebugString(cache),
+              node_idx_in_pool, static_cast<int>(pool_.size()));
+          // NOTE(dzhwinter): update the ProgramDesc/IR Graph
+          // and the CFG Graph on the fly.
+          //
+          // IR Graph define the dependence relationship between nodes.
+          //
+          // ProgramDesc defines the input/output vars. Its used in
+          // CreateOp, CreateVar when running happens.
+          //
+          // CFG Graph store the liveness information, when reuse happens
+          // we also need to update the variable liveness.
+          const std::string var_name = var->Name();
+          const std::string cache_name = cache->Name();
+
+          cfg_->RenameVarInCFGGraph(var_name, cache_name, idx);
+          RenameVarInGraphDesc(var_name, cache_name, idx);
+          RenameVarInGraphNode(var_name, cache_name, idx, graph.get());
+          pool_.Erase(cache_name);
        }
      }
-      for (auto var : unlived_vars) {
-        ir::Node* var_node = cfg_->GetNodeFromVarName(var, op);
+    }
+    // fill the pool
+    for (auto var : cfg_->LiveIn(op)) {
+      if (cfg_->LiveOut(op).count(var) == 0) {
+        ir::Node* var_node = cfg_->GetNodeByName(var, op);
+        if (var_node == nullptr || var_node->IsCtrlVar()) continue;
        if (NodeCanReused(var_node) && !pool_.Has(var_node)) {
-          pool_.Insert(var_node, op);
+          pool_.Insert(var_node);
        }
      }
    }
  }
  graph->ResolveHazard(var_nodes_);

-  // For early delete pass. use GraphNodePool load the unlived vars.
-  // 1. find all deps op for each unlived var in memory pool.
-  for (auto& op : graph->Nodes()) {
-    for (auto& var : op->inputs) {
-      if (pool_.Has(var)) {
-        pool_.Insert(var, op);
-      }
-    }
-  }
-  // 2. convert ir node based memory pool to graph node
-  // because Node* maybe released bettwen passes.
-  auto& graph_pool = graph->Get<GraphNodePool>(kGraphNodePool);
-  for (auto it = pool_.begin(); it != pool_.end(); ++it) {
-    std::unordered_set<OpDesc*> descs;
-    for (auto& op : it->second) {
-      PADDLE_ENFORCE(op->IsOp());
-      descs.insert(op->Op());
-    }
-    graph_pool.push_back(std::make_pair(it->first->Name(), descs));
-  }
-
  return graph;
 }

@@ -199,12 +178,12 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const {
    PADDLE_ENFORCE(sub_op != nullptr);
    for (auto* var : sub_op->outputs) {
      if (NodeCanReused(var)) {
-        ir::Node* cache = pool_.NodeMatch(var);
+        ir::Node* cache = pool_.FindBestFitNode(var);
        if (cache != nullptr) {
          if (var->Var()->GetDataType() != cache->Var()->GetDataType()) {
            continue;
          }
-          int node_idx_in_pool = pool_.GetIndex(cache);
+          int node_idx_in_pool = pool_.GetNodeIndexInPool(cache);
          VLOG(3) << string::Sprintf(
              "!!! %s,  %s => %s, cache idx %d, pool size %d",
              std::to_string(sub_reuse_id++), DebugString(var),
@@ -224,20 +203,27 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const {
  }
 }

-std::unordered_set<std::string> MemoryOptimizePass::GetSubBlockVars(
+void MemoryOptimizePass::CollectSkipVarsSet(
    const std::unordered_set<ir::Node*>& nodes) const {
-  std::unordered_set<std::string> vars;
+  auto update_skip_set = [&](OpDesc* op_desc) {
+    auto inputs = op_desc->InputArgumentNames();
+    auto outputs = op_desc->OutputArgumentNames();
+    skip_set_.insert(inputs.begin(), inputs.end());
+    skip_set_.insert(outputs.begin(), outputs.end());
+  };
  for (auto& op : nodes) {
    if (!op->IsOp() || op->Op() == nullptr) continue;
    auto* op_desc = op->Op();
-    if (OpHasSubBlock(op_desc)) {
-      auto inputs = op_desc->InputArgumentNames();
-      auto outputs = op_desc->OutputArgumentNames();
-      vars.insert(inputs.begin(), inputs.end());
-      vars.insert(outputs.begin(), outputs.end());
-    }
+    // NOTE(dzhwinter):
+    // current block can not reuse next level block vars.
+    if (OpHasSubBlock(op_desc)) update_skip_set(op_desc);
+    // NOTE(dzhwinter):
+    // distributed ops input/output name need to
+    // keep same bettwen trainer/pserver
+    if (op_desc->Type() == "send") update_skip_set(op_desc);
+    if (op_desc->Type() == "recv") update_skip_set(op_desc);
+    if (op_desc->Type() == "prefetch") update_skip_set(op_desc);
  }
-  return vars;
 }

 void MemoryOptimizePass::RenameVarInGraphDesc(const std::string& var,
@@ -291,8 +277,7 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
    // redirect the input to the latest version of cache_var
    for (auto* node : op->inputs) {
      if (node->Name() == var) {
-        ir::Node* cache_node = graph->CreateVarNode(var_desc.get());
-        var_nodes_[cache_var].emplace_back(cache_node);
+        ir::Node* cache_node = var_nodes_[cache_var].back();

        // swap node to cache_node
        cache_node->outputs.insert(cache_node->outputs.end(),
@@ -301,11 +286,15 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
        auto* prev_op = node->inputs[0];
        std::replace(prev_op->outputs.begin(), prev_op->outputs.end(), node,
                     cache_node);
-        cache_node->inputs.emplace_back(prev_op);
        for (auto* next_op : node->outputs) {
          std::replace(next_op->inputs.begin(), next_op->inputs.end(), node,
                       cache_node);
        }
+
+        // erase unused node
+        auto& nodes = var_nodes_.at(var);
+        nodes.erase(std::remove(nodes.begin(), nodes.end(), node), nodes.end());
+        graph->RemoveNode(node);
      }
    }

@@ -325,271 +314,14 @@ void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var,
          std::replace(next_op->inputs.begin(), next_op->inputs.end(), node,
                       cache_node);
        }
-      }
-    }
-  }
-
-  // release node of unused var in graph
-  for (auto* node : var_nodes_[var]) {
-    graph->RemoveNode(node);
-  }
-  var_nodes_.at(var).clear();
-}
-
-std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph) {
-  PADDLE_ENFORCE(graph.Has(kAllOpDescs),
-                 "Graph has no attribute of kAllOpDescs.");
-  // 1. get op desc order
-  auto& op_descs = graph.Get<const std::vector<OpDesc*>>(kAllOpDescs);
-
-  // 2. topology sort order
-  auto nodes = graph.Nodes();
-  std::deque<ir::Node*> ops;
-  FilterVariables(nodes, [&](ir::Node* op) {
-    if (op->IsOp() && op->Op() != nullptr) {
-      ops.emplace_back(op);
-    }
-  });
-  std::unordered_map<ir::Node*, size_t> op_deps;
-  std::list<ir::Node*> ready_ops;
-  std::unordered_map<ir::Node*, std::unordered_set<ir::Node*>> pending_ops;
-
-  for (auto* op : ops) {
-    std::unordered_set<ir::Node*> preceding_op;
-    for (auto* in : op->inputs) {
-      if (in->inputs.empty()) continue;
-      PADDLE_ENFORCE(in->inputs.size() == 1 && in->inputs[0]->IsOp());
-      preceding_op.emplace(in->inputs[0]);
-      pending_ops[in->inputs[0]].emplace(op);
-    }
-    op_deps[op] = preceding_op.size();
-    if (preceding_op.empty()) {
-      ready_ops.emplace_back(op);
-    }
-  }
-
-  // 3. generated op list based desc order and the topology order
-  std::vector<ir::Node*> ret;
-  std::list<OpDesc*> op_descs_list(op_descs.begin(), op_descs.end());
-
-  auto update_by_found_node = [&](ir::Node* found_node) {
-    for (auto* pending_op : pending_ops[found_node]) {
-      if (--op_deps[pending_op] == 0) {
-        ready_ops.emplace_back(pending_op);
-      }
-    }
-    ready_ops.remove(found_node);
-    ret.emplace_back(found_node);
-  };
-
-  while (!ready_ops.empty()) {
-    bool all_of_ready_op_unmatched = true;
-    for (auto it = op_descs_list.begin(); it != op_descs_list.end();) {
-      auto op_desc = *it;
-      ir::Node* found_node = nullptr;
-      for (auto* op : ready_ops) {
-        if (IsSameDesc(op->Op(), op_desc)) {
-          found_node = op;
-          break;
-        }
-      }
-
-      // 3.1 op desc deleted by other pass
-      if (found_node == nullptr) {
-        ++it;
-        continue;
-      } else {
-        all_of_ready_op_unmatched = false;
-        it = op_descs_list.erase(it);
-      }
-      update_by_found_node(found_node);
-    }
-
-    // 3.2 op descs are added by other pass
-    // preceding op non empty means some new op descs are
-    // created, but not contained in return node list.
-    // these new op desc may depend on each other.
-    std::list<ir::Node*> prev_ready_ops(ready_ops);
-    if (all_of_ready_op_unmatched) {
-      for (auto op : prev_ready_ops) {
-        update_by_found_node(op);
-      }
-    }
-  }
-
-  PADDLE_ENFORCE(std::all_of(
-      op_deps.begin(), op_deps.end(),
-      [&](const std::pair<ir::Node*, size_t>& p) { return p.second == 0; }));
-
-  return ret;
-}
-
-ControlFlowGraph::ControlFlowGraph(const ir::Graph& graph) {
-  ops_ = SortOpLikeDescOrder(graph);
-  ConnectNodes();
-}
-
-void ControlFlowGraph::BuildCFGGraph() {
-  // FIXME(dzh): same effect with ConnectNodes, but use the control
-  // link to build dependency graph, it goes wrong in transformer.
-  for (ir::Node* op : ops_) {
-    for (auto& input_var : op->inputs) {
-      if (!input_var->inputs.empty()) {
-        PADDLE_ENFORCE(
-            input_var->inputs.size() == 1 && input_var->inputs[0]->IsOp(),
-            "Preceding Op Node of Var Node must be unique");
-        auto* pred_op = input_var->inputs[0];
-        if (pred_op->Op() != nullptr) {
-          predecessors_[op].insert(pred_op);
-          successors_[pred_op].insert(op);
-        }
-      }
-      if (input_var->IsVar() && !input_var->IsCtrlVar()) {
-        uses_[op].insert(input_var->Name());
-      }
-    }
-    for (auto& output_var : op->outputs) {
-      // output var may be used by many op
-      for (auto* succ_op : output_var->outputs) {
-        if (succ_op->Op() != nullptr) {
-          successors_[op].insert(succ_op);
-          predecessors_[succ_op].insert(op);
-        }
-      }
-      if (output_var->IsVar() && !output_var->IsCtrlVar()) {
-        defs_[op].insert(output_var->Name());
-      }
-    }
-  }
-}
-
-void ControlFlowGraph::ConnectNodes() {
-  for (size_t i = 0; i < ops_.size(); ++i) {
-    auto& op = ops_[i];
-    try {
-      auto& next_op = ops_.at(i + 1);
-      successors_[op].insert(next_op);
-      predecessors_[next_op].insert(op);
-    } catch (...) {
-      // do nothing
-    }
-
-    FilterVariables(op->inputs,
-                    [&](ir::Node* var) { uses_[op].emplace(var->Name()); });
-
-    FilterVariables(op->outputs,
-                    [&](ir::Node* var) { defs_[op].emplace(var->Name()); });
-  }
-}
-
-void ControlFlowGraph::LiveVariableAnalysis() {
-  // NOTE(dzh): variable liveless analysis (a.k.a reversed_ops algorithm)
-  // compute the liveness of for each variable though reversed_ops algorithm.
-  // It iterates the operators from end to begin, compute the live in/live out
-  // variable set for each op, then the diff between in/out will be used for
-  // the variable reuse. For detail refer to
-  // http://www.cs.cornell.edu/courses/cs4120/2013fa/lectures/lec26-fa13.pdf
-  std::list<ir::Node*> work_list(ops_.rbegin(), ops_.rend());
-  while (!work_list.empty()) {
-    ir::Node* op = work_list.front();
-    work_list.pop_front();
-    // get the live_in calculated before. Empty if first.
-    auto prev_live_in = std::move(live_in_[op]);
-    for (auto& s : successors_[op]) {
-      for (auto& var : live_in_[s]) {
-        live_out_[op].insert(var);
-      }
-    }
-    for (auto& var : uses_[op]) {
-      live_in_[op].insert(var);
-    }
-    for (auto& var : live_out_[op]) {
-      live_in_[op].insert(var);
-    }
-    for (auto& var : defs_[op]) {
-      live_in_[op].erase(var);
-    }
-
-    // If the live_in is not changed, then the liveness analysis of
-    // predecessors is completed.
-    //
-    // Otherwise, recalculate the predecessors liveness
-    if (live_in_[op] != prev_live_in) {
-      for (auto& pre : predecessors_[op]) {
-        work_list.push_back(pre);
-      }
-    }
-  }
-}
-
-void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node,
-                                           const std::string& new_node,
-                                           int begin_idx) {
-  // update graph from begin idx to the end
-  for (size_t i = begin_idx; i != ops_.size(); ++i) {
-    auto* op = ops_[i];
-    if (uses_[op].find(old_node) != uses_[op].end()) {
-      uses_[op].erase(old_node);
-      uses_[op].insert(new_node);
-    }
-    if (defs_[op].find(old_node) != defs_[op].end()) {
-      defs_[op].erase(old_node);
-      defs_[op].insert(new_node);
-    }
-    if (live_in_[op].find(old_node) != live_in_[op].end()) {
-      live_in_[op].erase(old_node);
-      live_in_[op].insert(new_node);
-    }
-    if (live_out_[op].find(old_node) != live_out_[op].end()) {
-      live_out_[op].erase(old_node);
-      live_out_[op].insert(new_node);
-    }
-  }
-}
-
-const std::set<std::string> ControlFlowGraph::LiveIn(ir::Node* op) const {
-  auto it = live_in_.find(op);
-  PADDLE_ENFORCE(
-      it != live_in_.end(),
-      string::Sprintf("Expect %s in live_in, but Not Found.", op->Name()));
-  return it->second;
-}
-
-const std::set<std::string> ControlFlowGraph::LiveOut(ir::Node* op) const {
-  auto it = live_out_.find(op);
-  PADDLE_ENFORCE(
-      it != live_out_.end(),
-      string::Sprintf("Expect %s in live_out, but Not Found.", op->Name()));
-  return it->second;
-}
-
-const std::set<std::string> ControlFlowGraph::Use(ir::Node* op) const {
-  auto it = uses_.find(op);
-  PADDLE_ENFORCE(
-      it != uses_.end(),
-      string::Sprintf("Expect %s in live_out, but Not Found.", op->Name()));
-  return it->second;
-}
-
-const std::vector<ir::Node*> ControlFlowGraph::Ops() const { return ops_; }
-
-std::vector<ir::Node*>& ControlFlowGraph::Ops() { return ops_; }

-ir::Node* ControlFlowGraph::GetNodeFromVarName(const std::string& name,
-                                               ir::Node* op) const {
-  // in ssa-graph, different version nodes have same name,
-  // this function get the latest version var before target op
-  // It may return nullptr, such as data node.
-  ir::Node* found_node = nullptr;
-  for (auto* node : ops_) {
-    if (node == op) break;
-    for (auto& output : node->outputs) {
-      if (output->Name() == name) {
-        found_node = output;
+        // erase unused node
+        auto& nodes = var_nodes_.at(var);
+        nodes.erase(std::remove(nodes.begin(), nodes.end(), node), nodes.end());
+        graph->RemoveNode(node);
      }
    }
  }
-  return found_node;
 }

 }  // namespace details
@@ -598,5 +330,4 @@ ir::Node* ControlFlowGraph::GetNodeFromVarName(const std::string& name,

 REGISTER_PASS(memory_optimize_pass,
              paddle::framework::details::MemoryOptimizePass)
-    .RequireGraphAttr(paddle::framework::details::kGraphNodePool)
    .RequireGraphAttr(paddle::framework::details::kAllOpDescs);
--- a/paddle/fluid/framework/details/memory_optimize_pass.h
+++ b/paddle/fluid/framework/details/memory_optimize_pass.h
@@ -32,20 +32,15 @@
 namespace paddle {
 namespace framework {
 namespace details {
-constexpr char kAllOpDescs[] = "all_op_descs";
-
-std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph);
-
-class ControlFlowGraph;

 class MemoryOptimizePass : public ir::Pass {
 protected:
  std::unique_ptr<ir::Graph> ApplyImpl(
      std::unique_ptr<ir::Graph> graph) const override;
-
- private:
  // fill the variable map(var_nodes) by version.
  void InitSSAGraphNodes() const;
+
+ private:
  // update program descs
  void RenameVarInGraphDesc(const std::string& var,
                            const std::string& cache_var, size_t idx) const;
@@ -55,13 +50,14 @@ class MemoryOptimizePass : public ir::Pass {
                            ir::Graph* graph) const;

  void SubGraphOptimize(OpDesc* op_desc) const;
-  // scan subblock and collect the output/input variables.
-  std::unordered_set<std::string> GetSubBlockVars(
-      const std::unordered_set<ir::Node*>&) const;
+  // 1. scan op with subblock and collect the output/input vars.
+  // while, while_grad, conditional_block
+  // 2. scan distributed ops and collect the output/input vars
+  void CollectSkipVarsSet(const std::unordered_set<ir::Node*>&) const;

 private:
  // Reuse Node Pool, Owned.
-  mutable OrderedNodeList pool_;
+  mutable OrderedSet pool_;
  // controlflow Graph
  mutable std::unique_ptr<ControlFlowGraph> cfg_;
  // skip set
@@ -70,45 +66,6 @@ class MemoryOptimizePass : public ir::Pass {
  mutable std::map<std::string, std::vector<ir::Node*>> var_nodes_;
 };

-class ControlFlowGraph {
- public:
-  ControlFlowGraph() = default;
-  // For IR Graph in parallelexecutor
-  explicit ControlFlowGraph(const ir::Graph& graph);
-
-  void LiveVariableAnalysis();
-
-  void RenameVarInCFGGraph(const std::string& old_node,
-                           const std::string& new_node, int begin_idx);
-
-  const std::set<std::string> LiveIn(ir::Node* op) const;
-  const std::set<std::string> LiveOut(ir::Node* op) const;
-  const std::set<std::string> Use(ir::Node* op) const;
-  const std::vector<ir::Node*> Ops() const;
-  std::vector<ir::Node*>& Ops();
-
-  // for ssa-graph nodes
-  ir::Node* GetNodeFromVarName(const std::string& name, ir::Node* op) const;
-
- private:
-  void BuildCFGGraph();
-  void ConnectNodes();
-  using NodeListMap = std::unordered_map<ir::Node*, std::set<ir::Node*>>;
-  using VarSetMap = std::map<ir::Node*, std::set<std::string>>;
-  // successors ops use the output variables.
-  NodeListMap successors_;
-  // predecessors ops generated input variables.
-  NodeListMap predecessors_;
-  // variables lived before run current op.
-  VarSetMap live_in_;
-  // variables lived after run current op.
-  VarSetMap live_out_;
-  VarSetMap uses_;  // op inputs
-  VarSetMap defs_;  // op outputs
-
-  std::vector<ir::Node*> ops_;  // op sequence by topology sort
-};
-
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/details/memory_optimize_pass_test.cc
+++ b/paddle/fluid/framework/details/memory_optimize_pass_test.cc
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/details/memory_optimize_pass.h"
-#include <algorithm>
-#include <iostream>
-#include <iterator>
-#include "glog/logging.h"
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/details/graph_test_base.h"
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/graph_helper.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/program_desc.h"
-
-REGISTER_OPERATOR(sum, paddle::framework::DummyOp,
-                  paddle::framework::SumOpMaker,
-                  paddle::framework::DummyVarTypeInference);
-REGISTER_OPERATOR(assign, paddle::framework::DummyOp,
-                  paddle::framework::AssignOpMaker,
-                  paddle::framework::DummyVarTypeInference);
-REGISTER_OPERATOR(dummy, paddle::framework::DummyOp,
-                  paddle::framework::SumOpMaker,
-                  paddle::framework::DummyVarTypeInference);
-/*
-  https://en.wikipedia.org/wiki/Live_variable_analysis
-  Create a customed classical dependency graph, left row is the instruction
-  number.
-  1. a = 1
-  2. b = a
-  3. c = a
-  4. d = b + c
-  5. e = d
-
-  a--------+
-  |        |
-  b        c
-  |        |
-  d--------+
-  |
-  e
-  Then analysis these variable's liveness range
- */
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) {
-  return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() &&
-         op1->Outputs() == op2->Outputs();
-}
-
-inline static ProgramDesc FillProgramDesc() {
-  ProgramDesc prog;
-  prog.MutableBlock(0)->Var("a")->SetType(proto::VarType::LOD_TENSOR);
-  prog.MutableBlock(0)->Var("b")->SetType(proto::VarType::LOD_TENSOR);
-  prog.MutableBlock(0)->Var("c")->SetType(proto::VarType::LOD_TENSOR);
-  prog.MutableBlock(0)->Var("d")->SetType(proto::VarType::LOD_TENSOR);
-  prog.MutableBlock(0)->Var("e")->SetType(proto::VarType::LOD_TENSOR);
-  {
-    auto* op = prog.MutableBlock(0)->AppendOp();
-    op->SetType("assign");
-    op->SetInput("X", {"a"});
-    op->SetOutput("Out", {"b"});
-  }
-  {
-    auto* op = prog.MutableBlock(0)->AppendOp();
-    op->SetType("assign");
-    op->SetInput("X", {"a"});
-    op->SetOutput("Out", {"c"});
-  }
-  {
-    auto* op = prog.MutableBlock(0)->AppendOp();
-    op->SetType("sum");
-    op->SetInput("X", {"b", "c"});
-    op->SetOutput("Out", {"d"});
-  }
-  {
-    auto* op = prog.MutableBlock(0)->AppendOp();
-    op->SetType("assign");
-    op->SetInput("X", {"d"});
-    op->SetOutput("Out", {"e"});
-  }
-  return prog;
-}
-
-TEST(CFGGraph, IRGraph) {
-  // prepare ir graph
-  auto prog = FillProgramDesc();
-  ir::Graph graph(prog);
-  const std::vector<OpDesc*>* all_op_descs =
-      new std::vector<OpDesc*>(prog.Block(0).AllOps());
-  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
-
-  ControlFlowGraph cfg(graph);
-  cfg.LiveVariableAnalysis();
-
-  // test assign op
-  ASSERT_TRUE((std::set<std::string>{"a"} == cfg.LiveIn(cfg.Ops()[0])));
-  ASSERT_TRUE((std::set<std::string>{"a", "b"} == cfg.LiveOut(cfg.Ops()[0])));
-
-  // test assign op
-  ASSERT_TRUE((std::set<std::string>{"a", "b"} == cfg.LiveIn(cfg.Ops()[1])));
-  ASSERT_TRUE((std::set<std::string>{"b", "c"} == cfg.LiveOut(cfg.Ops()[1])));
-
-  // test sum op
-  ASSERT_TRUE((std::set<std::string>{"b", "c"} == cfg.LiveIn(cfg.Ops()[2])));
-  ASSERT_TRUE((std::set<std::string>{"d"} == cfg.LiveOut(cfg.Ops()[2])));
-
-  // test assign op
-  ASSERT_TRUE((std::set<std::string>{"d"} == cfg.LiveIn(cfg.Ops()[3])));
-  ASSERT_TRUE((std::set<std::string>{} == cfg.LiveOut(cfg.Ops()[3])));
-}
-
-// 1. normal test
-TEST(SortOpLikeDescOrder, NormalTest) {
-  auto prog = FillProgramDesc();
-  ir::Graph graph(prog);
-  const std::vector<OpDesc*>* all_op_descs =
-      new std::vector<OpDesc*>(prog.Block(0).AllOps());
-  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
-
-  auto nodes = SortOpLikeDescOrder(graph);
-  auto op_descs = prog.Block(0).AllOps();
-  for (size_t i = 0; i < nodes.size(); ++i) {
-    auto node = nodes[i];
-    auto op_desc = op_descs[i];
-    ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
-  }
-}
-
-// 2. remove some op_desc
-TEST(SortOpLikeDescOrder, RemoveOpDesc) {
-  auto prog = FillProgramDesc();
-  ir::Graph graph(prog);
-  const std::vector<OpDesc*>* all_op_descs =
-      new std::vector<OpDesc*>(prog.Block(0).AllOps());
-  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
-  auto nodes = graph.Nodes();
-  auto op_descs = prog.Block(0).AllOps();
-  ir::Node* found_node = nullptr;
-  for (auto node : nodes) {
-    if (node->IsOp() && node->outputs.back()->Name() == "e") {
-      found_node = node;
-      break;
-    }
-  }
-  PADDLE_ENFORCE(found_node != nullptr);
-  for (auto it = op_descs.begin(); it != op_descs.end();) {
-    if (IsSameDesc(*it, found_node->Op())) {
-      it = op_descs.erase(it);
-    } else {
-      ++it;
-    }
-  }
-
-  auto find_node_in_graph = [&](std::string s) {
-    ir::Node* ret = nullptr;
-    for (auto n : graph.Nodes()) {
-      if (n->Name() == s) {
-        ret = n;
-        break;
-      }
-    }
-    PADDLE_ENFORCE(ret != nullptr);
-    return ret;
-  };
-
-  ir::Node* e = find_node_in_graph("e");
-  ir::Node* d = find_node_in_graph("d");
-  std::remove(d->outputs.begin(), d->outputs.end(), found_node);
-  graph.RemoveNode(found_node);
-  graph.RemoveNode(e);
-
-  // other node keeps the same order
-  auto remain_nodes = SortOpLikeDescOrder(graph);
-  for (size_t i = 0; i < remain_nodes.size(); ++i) {
-    auto node = remain_nodes[i];
-    auto op_desc = op_descs[i];
-    ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
-  }
-}
-
-// 3. add some op_desc
-TEST(SortOpLikeDescOrder, AddOpDesc) {
-  auto prog = FillProgramDesc();
-  const std::vector<OpDesc*>* all_op_descs =
-      new std::vector<OpDesc*>(prog.Block(0).AllOps());
-  ir::Graph graph(prog);
-
-  auto find_node_in_graph = [&](std::string s) {
-    ir::Node* ret = nullptr;
-    for (auto n : graph.Nodes()) {
-      if (n->Name() == s) {
-        ret = n;
-        break;
-      }
-    }
-    PADDLE_ENFORCE(ret != nullptr);
-    return ret;
-  };
-
-  // cached desc different with real one
-  // mimic the intermidiete pass modify the programdesc.
-  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
-
-  auto op_descs = prog.Block(0).AllOps();
-
-  auto op = prog.MutableBlock(0)->AppendOp();
-  prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
-  op->SetType("sum");
-  op->SetInput("X", {"b", "c"});
-  op->SetOutput("Out", {"d1"});
-  ir::Node* node = graph.CreateOpNode(op);
-  ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1"));
-  ir::Node* b = find_node_in_graph("b");
-  ir::Node* c = find_node_in_graph("c");
-  node->outputs.emplace_back(d1);
-  node->inputs.emplace_back(b);
-  node->inputs.emplace_back(c);
-  d1->inputs.emplace_back(node);
-  b->outputs.emplace_back(node);
-  c->outputs.emplace_back(node);
-  op_descs.insert(op_descs.begin() + 4, op);
-
-  auto nodes = SortOpLikeDescOrder(graph);
-
-  for (size_t i = 0; i < nodes.size(); ++i) {
-    auto node = nodes[i];
-    auto op_desc = op_descs[i];
-    ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
-  }
-}
-
-// 4. add and delete some op_desc
-TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) {
-  auto prog = FillProgramDesc();
-  ir::Graph graph(prog);
-  const std::vector<OpDesc*>* all_op_descs =
-      new std::vector<OpDesc*>(prog.Block(0).AllOps());
-  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
-
-  auto find_node_in_graph = [&](std::string s) {
-    ir::Node* ret = nullptr;
-    for (auto n : graph.Nodes()) {
-      if (n->Name() == s) {
-        ret = n;
-        break;
-      }
-    }
-    PADDLE_ENFORCE(ret != nullptr);
-    return ret;
-  };
-
-  // remove sum node
-  auto op_descs = prog.Block(0).AllOps();
-  ir::Node* found_node = nullptr;
-  auto nodes = graph.Nodes();
-  for (auto node : nodes) {
-    if (node->Name() == "sum") {
-      found_node = node;
-      break;
-    }
-  }
-  PADDLE_ENFORCE(found_node != nullptr);
-  for (auto it = op_descs.begin(); it != op_descs.end();) {
-    if (IsSameDesc(*it, found_node->Op())) {
-      it = op_descs.erase(it);
-    } else {
-      ++it;
-    }
-  }
-  {
-    ir::Node* d = find_node_in_graph("d");
-    ir::Node* c = find_node_in_graph("c");
-    ir::Node* e = find_node_in_graph("e");
-    std::remove(d->outputs.begin(), d->outputs.end(), found_node);
-    std::remove(c->outputs.begin(), c->outputs.end(), found_node);
-    ir::Node* pending_op = found_node->outputs[0]->outputs[0];
-    graph.RemoveNode(e);
-    graph.RemoveNode(pending_op);
-    graph.RemoveNode(found_node);
-  }
-
-  // add node
-  auto op = prog.MutableBlock(0)->AppendOp();
-  prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
-  op->SetType("sum");
-  op->SetInput("X", {"b", "c"});
-  op->SetOutput("Out", {"d1"});
-  {
-    ir::Node* node = graph.CreateOpNode(op);
-    ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1"));
-    ir::Node* b = find_node_in_graph("b");
-    ir::Node* c = find_node_in_graph("c");
-    node->outputs.emplace_back(d1);
-    node->inputs.emplace_back(b);
-    node->inputs.emplace_back(c);
-    b->outputs.emplace_back(node);
-    c->outputs.emplace_back(node);
-  }
-  op_descs.insert(op_descs.begin() + 2, op);
-
-  // check the order
-  auto mynodes = SortOpLikeDescOrder(graph);
-  for (size_t i = 0; i < mynodes.size(); ++i) {
-    auto node = mynodes[i];
-    auto op_desc = op_descs[i];
-    ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
-  }
-}
-
-// 5. add and replace some op_desc inplace.
-TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) {
-  auto prog = FillProgramDesc();
-  ir::Graph graph(prog);
-  const std::vector<OpDesc*>* all_op_descs =
-      new std::vector<OpDesc*>(prog.Block(0).AllOps());
-  graph.Set(details::kAllOpDescs, all_op_descs);  // take ownership
-
-  auto find_node_in_graph = [&](std::string s) {
-    ir::Node* ret = nullptr;
-    for (auto n : graph.Nodes()) {
-      if (n->Name() == s) {
-        ret = n;
-        break;
-      }
-    }
-    PADDLE_ENFORCE(ret != nullptr);
-    return ret;
-  };
-
-  auto op_descs = prog.Block(0).AllOps();
-  // add node
-  auto op = prog.MutableBlock(0)->AppendOp();
-  prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR);
-  op->SetType("sum");
-  op->SetInput("X", {"b", "c"});
-  op->SetOutput("Out", {"d1"});
-  {
-    ir::Node* node = graph.CreateOpNode(op);
-    ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1"));
-    ir::Node* b = find_node_in_graph("b");
-    ir::Node* c = find_node_in_graph("c");
-    node->outputs.emplace_back(d1);
-    node->inputs.emplace_back(b);
-    node->inputs.emplace_back(c);
-    d1->inputs.emplace_back(node);
-    b->outputs.emplace_back(node);
-    c->outputs.emplace_back(node);
-  }
-
-  op_descs.emplace_back(op);
-
-  // replace op_desc inplace
-  auto nodes = graph.Nodes();
-  ir::Node* found_node = nullptr;
-  for (auto node : nodes) {
-    if (node->IsOp() && node->Op() && node->Name() == "assign") {
-      if (node->outputs.size() == 1 && node->outputs[0]->Name() == "e") {
-        found_node = node;
-        break;
-      }
-    }
-  }
-  {
-    ir::Node* d = find_node_in_graph("d");
-    ir::Node* e = find_node_in_graph("e");
-    std::remove(d->outputs.begin(), d->outputs.end(), found_node);
-    std::remove(e->inputs.begin(), e->inputs.end(), found_node);
-    graph.RemoveNode(found_node);
-  }
-  op_descs.erase(op_descs.begin() + 3);
-
-  auto replace_op = prog.MutableBlock(0)->AppendOp();
-  replace_op->SetType("sum");
-  replace_op->SetInput("X", {"d", "d1"});
-  replace_op->SetOutput("Out", {"e"});
-  {
-    ir::Node* sum2 = graph.CreateOpNode(replace_op);
-    ir::Node* e = find_node_in_graph("e");
-    ir::Node* d = find_node_in_graph("d");
-    ir::Node* d1 = find_node_in_graph("d1");
-    sum2->inputs.emplace_back(d);
-    sum2->inputs.emplace_back(d1);
-    sum2->outputs.emplace_back(e);
-    e->inputs.emplace_back(sum2);
-    d->outputs.emplace_back(sum2);
-    d1->outputs.emplace_back(sum2);
-  }
-
-  op_descs.emplace_back(replace_op);
-  // compare op order
-  auto graph_nodes = SortOpLikeDescOrder(graph);
-  for (size_t i = 0; i < graph_nodes.size(); ++i) {
-    auto node = graph_nodes[i];
-    auto op_desc = op_descs[i];
-    ASSERT_TRUE(IsSameDesc(node->Op(), op_desc));
-  }
-}
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
--- a/paddle/fluid/framework/details/sequential_execution_pass.cc
+++ b/paddle/fluid/framework/details/sequential_execution_pass.cc
@@ -17,6 +17,7 @@
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
+#include "paddle/fluid/framework/details/memory_optimize_helper.h"
 #include "paddle/fluid/framework/op_proto_maker.h"

 namespace paddle {

--- a/paddle/fluid/framework/details/sequential_execution_pass.h
+++ b/paddle/fluid/framework/details/sequential_execution_pass.h
@@ -21,8 +21,6 @@ namespace paddle {
 namespace framework {
 namespace details {

-constexpr char kAllOpDescs[] = "all_op_descs";
-
 class SequentialExecutionPass : public ir::Pass {
 protected:
  std::unique_ptr<ir::Graph> ApplyImpl(

--- a/paddle/fluid/framework/inplace_op_inference.h
+++ b/paddle/fluid/framework/inplace_op_inference.h
@@ -69,7 +69,7 @@ class InplaceInToOut : public InplaceOpInference {
  bool TryInplaceInputOutput(const VarDesc& in, const VarDesc& out) const {
    return in.Name() != out.Name() && details::NodeCanReused(in) &&
           details::NodeCanReused(out) &&
-           details::NodeSizeInBytes(out) <= details::NodeSizeInBytes(in);
+           details::NodeSize(out) <= details::NodeSize(in);
  }
 };


--- a/paddle/fluid/framework/inplace_op_inference_test.cc
+++ b/paddle/fluid/framework/inplace_op_inference_test.cc
@@ -179,11 +179,11 @@ TEST(InferInplace, SingleOpInplaceInToOut) {
  op->SetOutput("Out", {"test2_out"});

  prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR);
-  prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 64});
+  prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 64, 128, 128});
  prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR);
  prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR);
  prog.MutableBlock(0)->Var("test2_out");
-  prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16});
+  prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16, 128, 128});

  auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
  auto in_to_outs = infer_inplace(*op, op->Block());
@@ -201,11 +201,11 @@ TEST(InferInplace, SingleGradOpInplaceInToOut) {
  op->SetOutput(GradVarName("X"), {"test2_a", "test2_b", "test2_c"});

  prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR);
-  prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 16});
+  prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 16, 1024, 1024});
  prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR);
  prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR);
  prog.MutableBlock(0)->Var("test2_out");
-  prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16});
+  prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16, 1024, 1024});

  auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
  auto in_to_outs = infer_inplace(*op, op->Block());
@@ -233,12 +233,12 @@ TEST(InferInplace, MultiOutInplaceInToOut) {
  prog.MutableBlock(0)->Var("o0");
  prog.MutableBlock(0)->Var("y0");
  prog.MutableBlock(0)->Var("z0");
-  prog.MutableBlock(0)->Var("a0")->SetShape({32, 16});
-  prog.MutableBlock(0)->Var("b0")->SetShape({32, 16});
-  prog.MutableBlock(0)->Var("c0")->SetShape({32, 16});
-  prog.MutableBlock(0)->Var("o0")->SetShape({32, 16});
-  prog.MutableBlock(0)->Var("y0")->SetShape({32, 16});
-  prog.MutableBlock(0)->Var("z0")->SetShape({32, 16});
+  prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024});
+  prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024});
+  prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024});
+  prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024});
+  prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024});
+  prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024});

  auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
  auto in_to_outs = infer_inplace(*op, op->Block());
@@ -267,15 +267,16 @@ TEST(InferInplace, MultiGradInplaceInToOut) {
  prog.MutableBlock(0)->Var("o0");
  prog.MutableBlock(0)->Var("y0");
  prog.MutableBlock(0)->Var("z0");
-  prog.MutableBlock(0)->Var("a0")->SetShape({32, 16});
-  prog.MutableBlock(0)->Var("b0")->SetShape({32, 16});
-  prog.MutableBlock(0)->Var("c0")->SetShape({32, 16});
-  prog.MutableBlock(0)->Var("o0")->SetShape({32, 16});
-  prog.MutableBlock(0)->Var("y0")->SetShape({32, 16});
-  prog.MutableBlock(0)->Var("z0")->SetShape({32, 16});
+  prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024});
+  prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024});
+  prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024});
+  prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024});
+  prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024});
+  prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024});

  auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_;
  auto in_to_outs = infer_inplace(*op, op->Block());
+
  EXPECT_EQ(in_to_outs.size(), 3ul);
  std::unordered_map<std::string, std::string> expects = {
      {"o0", "a0"}, {"y0", "b0"}, {"z0", "c0"},

--- a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc
+++ b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc
@@ -38,9 +38,13 @@ std::unique_ptr<ir::Graph> IdentityScaleOpCleanPass::ApplyImpl(
                      ->assert_is_op("scale")
                      ->assert_op_attr<float>("scale", 1.)
                      ->assert_op_attr<float>("bias", 0.);
-  auto scale_out = detector.mutable_pattern()
-                       ->NewNode("scale_out")
-                       ->assert_is_op_output("scale");
+  auto scale_out =
+      detector.mutable_pattern()
+          ->NewNode("scale_out")
+          ->assert_is_op_output("scale")
+          // scale's output var should has only one consumer, or it can't be
+          // removed.
+          ->assert_more([](Node* x) { return x->outputs.size() == 1UL; });

  pre_op->LinksTo({scale_in});
  scale_op->LinksFrom({scale_in}).LinksTo({scale_out});

--- a/paddle/fluid/framework/ir/infer_clean_graph_pass.cc
+++ b/paddle/fluid/framework/ir/infer_clean_graph_pass.cc
@@ -37,6 +37,7 @@ class InferCleanGraphPass : public FusePassBase {
    std::unordered_set<const Node*> invalid_nodes;
    int valid_op = 0;
    for (auto* node : graph->Nodes()) {
+      PADDLE_ENFORCE_NOT_NULL(node);
      if (is_valid_node(node)) {
        invalid_nodes.insert(node);
      } else if (node->IsOp()) {

--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -171,14 +171,6 @@ std::unique_ptr<ir::Graph> ParallelExecutorPrivate::PrepareGCAndRefCnts(
    eager_deletion_pass->SetNotOwned(details::kAllPlaces, &places_);
    graph = eager_deletion_pass->Apply(std::move(graph));
    VLOG(10) << "EagerDeletionPass Applied";
-
-    if (build_strategy_.memory_early_delete_) {
-      auto early_delete_pass =
-          ir::PassRegistry::Instance().Get("memory_early_delete_pass");
-      early_delete_pass->SetNotOwned(details::kGarbageCollector, &gcs_);
-      graph = early_delete_pass->Apply(std::move(graph));
-    }
-    VLOG(10) << "MemoryEarlyDeletePass Applied.";
  }

  return graph;
@@ -288,6 +280,8 @@ ParallelExecutor::ParallelExecutor(
  graphs.push_back(std::move(graph));
 #endif
  auto max_memory_size = GetEagerDeletionThreshold();
+  VLOG(10) << "Eager Deletion Threshold "
+           << static_cast<float>(max_memory_size) / (1 << 30);
  if (max_memory_size >= 0) {
    for (size_t i = 0; i < graphs.size(); ++i) {
      graphs[i] = member_->PrepareGCAndRefCnts(
@@ -506,6 +500,5 @@ ParallelExecutor::~ParallelExecutor() {
 }  // namespace framework
 }  // namespace paddle

-USE_PASS(memory_early_delete_pass);
 USE_PASS(reference_count_pass);
 USE_PASS(eager_deletion_pass);
--- a/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc
@@ -460,77 +460,6 @@ inline bool CheckNodeIndegreeEquals(const Node &node, size_t n) {
  return node.inputs.size() == n;
 }

-NodesTSIterator::NodesTSIterator(const std::vector<Node *> &source) {
-  PADDLE_ENFORCE(!source.empty(),
-                 "Start points of topological sorting should not be empty!");
-  // CHECK all the inputs' in-degree is 0
-  for (auto *node : source) {
-    PADDLE_ENFORCE(CheckNodeIndegreeEquals(*node, 0));
-  }
-
-  std::unordered_set<Node *> visited;
-  std::unordered_set<Node *> to_visit{source.begin(), source.end()};
-
-  std::vector<Node *> inlink_visited;
-  while (!to_visit.empty()) {
-    std::vector<Node *> queue(to_visit.begin(), to_visit.end());
-    for (auto *p : queue) {
-      if (Agent(p).deleted()) {
-        visited.insert(p);
-        to_visit.erase(p);
-      }
-
-      inlink_visited.clear();
-
-      std::copy_if(p->inputs.begin(), p->inputs.end(),
-                   std::back_inserter(inlink_visited),
-                   [&](Node *x) -> bool { return visited.count(x) != 0; });
-
-      if (inlink_visited.size() == p->inputs.size()) {
-        sorted_.push_back(p);
-        for (auto *_ : p->outputs) {
-          if (!visited.count(_)) {
-            to_visit.insert(_);
-          }
-        }
-
-        to_visit.erase(p);
-        visited.insert(p);
-      }
-    }
-  }
-}
-
-NodesTSIterator::NodesTSIterator(const NodesTSIterator &other)
-    : sorted_(other.sorted_), cursor_(other.cursor_) {}
-
-Node &NodesTSIterator::operator*() {
-  PADDLE_ENFORCE_LT(cursor_, sorted_.size());
-  return *sorted_[cursor_];
-}
-
-NodesTSIterator &NodesTSIterator::operator++() {
-  if (++cursor_ >= sorted_.size()) {
-    sorted_.clear();
-    cursor_ = 0;
-  }
-  return *this;
-}
-NodesTSIterator &NodesTSIterator::operator=(const NodesTSIterator &other) {
-  cursor_ = other.cursor_;
-  sorted_ = other.sorted_;
-  return *this;
-}
-
-bool NodesTSIterator::operator==(const NodesTSIterator &other) {
-  return sorted_ == other.sorted_ && cursor_ == other.cursor_;
-}
-
-Node *NodesTSIterator::operator->() {
-  PADDLE_ENFORCE_LT(cursor_, sorted_.size());
-  return sorted_[cursor_];
-}
-
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h
+++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h
@@ -30,6 +30,7 @@ namespace inference {
 namespace analysis {

 using framework::ir::Graph;
+using framework::ir::NodesTSIterator;

 const char kIsFunctionNode[] = "__is_function_node__";
 const char kFunctionNodeSubGraph[] = "__function_node_sub_graph__";
@@ -132,32 +133,6 @@ struct Agent {
  framework::ir::Node *x_;
 };

-// Topological sorting iterator on nodes.
-struct NodesTSIterator
-    : public std::iterator<std::forward_iterator_tag, framework::ir::Node *> {
-  NodesTSIterator() = default;
-  explicit NodesTSIterator(const std::vector<framework::ir::Node *> &source);
-  NodesTSIterator(NodesTSIterator &&other)
-      : sorted_(std::move(other.sorted_)), cursor_(other.cursor_) {
-    other.cursor_ = 0;
-  }
-  NodesTSIterator(const NodesTSIterator &other);
-
-  framework::ir::Node &operator*();
-  NodesTSIterator &operator++();
-  // TODO(Superjomn) current implementation just compare the first
-  // element, need to compare the graph and all the elements in the queue and
-  // set.
-  NodesTSIterator &operator=(const NodesTSIterator &other);
-  bool operator==(const NodesTSIterator &other);
-  bool operator!=(const NodesTSIterator &other) { return !(*this == other); }
-  framework::ir::Node *operator->();
-
- private:
-  std::vector<framework::ir::Node *> sorted_;
-  size_t cursor_{0};
-};
-
 // The nodes those have no input will be treated as start points.
 static std::vector<framework::ir::Node *> ExtractStartPoints(const Graph &g) {
  std::vector<framework::ir::Node *> result;

--- a/paddle/fluid/inference/api/paddle_api.h
+++ b/paddle/fluid/inference/api/paddle_api.h
@@ -16,6 +16,12 @@
 /*! \file paddle_api.h
 */

+/*! \mainpage Paddle Inference APIs
+ * \section intro_sec Introduction
+ * The Paddle inference library aims to offer an high performance inference SDK
+ * for Paddle users.
+ */
+
 #include <cassert>
 #include <memory>
 #include <string>
@@ -34,26 +40,49 @@ enum PaddleDType {
 };

 /**
- *\brief Memory menager for PaddleTensor.
+ * \brief Memory manager for `PaddleTensor`.
 *
- *The PaddleBuf holds a buffer for data input or output. The memory can be
- *allocated by user or by PaddleBuf itself, but in any case, the PaddleBuf
- *should be reused for better performance.
+ * The PaddleBuf holds a buffer for data input or output. The memory can be
+ * allocated by user or by PaddleBuf itself, but in any case, the PaddleBuf
+ * should be reused for better performance.
 *
- *For user allocated memory, the following API can be used:
- *- PaddleBuf(void* data, size_t length) to set an external memory by
- *specifying
- *  the memory address and length.
- *- Reset(void* data, size_t length) to reset the PaddleBuf with an external
+ * For user allocated memory, the following API can be used:
+ * - PaddleBuf(void* data, size_t length) to set an external memory by
+ * specifying the memory address and length.
+ * - Reset(void* data, size_t length) to reset the PaddleBuf with an external
 *memory.
- *ATTENTION, for user allocated memory, deallocation should be done by users
+ * ATTENTION, for user allocated memory, deallocation should be done by users
 *externally after the program finished. The PaddleBuf won't do any allocation
 *or deallocation.
 *
- *To have the PaddleBuf allocate and manage the memory:
- *- PaddleBuf(size_t length) will allocate a memory of size `length`.
- *- Resize(size_t length) resize the memory to no less than `length`, ATTENTION
+ * To have the PaddleBuf allocate and manage the memory:
+ * - PaddleBuf(size_t length) will allocate a memory of size `length`.
+ * - Resize(size_t length) resize the memory to no less than `length`, ATTENTION
 *  if the allocated memory is larger than `length`, nothing will done.
+ *
+ * Usage:
+ *
+ * Let PaddleBuf manage the memory internally.
+ * \code{cpp}
+ * const int num_elements = 128;
+ * PaddleBuf buf(num_elements * sizeof(float));
+ * \endcode
+ *
+ * Or
+ * \code{cpp}
+ * PaddleBuf buf;
+ * buf.Resize(num_elements * sizeof(float));
+ * \endcode
+ * Works the exactly the same.
+ *
+ * One can also make the `PaddleBuf` use the external memory.
+ * \code{cpp}
+ * PaddleBuf buf;
+ * void* external_memory = new float[num_elements];
+ * buf.Reset(external_memory, num_elements*sizeof(float));
+ * ...
+ * delete[] external_memory; // manage the memory lifetime outside.
+ * \endcode
 */
 class PaddleBuf {
 public:
@@ -78,7 +107,7 @@ class PaddleBuf {
  /** Tell whether the buffer is empty.
   */
  bool empty() const { return length_ == 0; }
-  /** Get the memory address.
+  /** Get the data's memory address.
   */
  void* data() const { return data_; }
  /** Get the memory length.
@@ -110,7 +139,8 @@ struct PaddleTensor {
 };

 enum class PaddlePlace { kUNK = -1, kCPU, kGPU };
-/** Tensor without copy, currently only supports AnalysisPredictor.
+
+/** Tensor without copy, currently only supports `AnalysisPredictor`.
 */
 class ZeroCopyTensor {
 public:
@@ -269,9 +299,11 @@ struct NativeConfig : public PaddlePredictor::Config {
 *
 * Usage:
 *
+ * \code{.cpp}
 * NativeConfig config;
 * ... // change the configs.
 * auto native_predictor = CreatePaddlePredictor(config);
+ * \endcode
 *
 * FOR EXTENSION DEVELOPER:
 * Different predictors are designated by config type. Similar configs can be

--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -66,8 +66,54 @@ void GpuPassStrategy::EnableMKLDNN() {
  LOG(ERROR) << "GPU not support MKLDNN yet";
 }

+GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
+  passes_.assign({
+    "infer_clean_graph_pass",                        //
+        "identity_scale_op_clean_pass",              //
+        "conv_affine_channel_fuse_pass",             //
+        "conv_eltwiseadd_affine_channel_fuse_pass",  //
+        "conv_bn_fuse_pass",                         //
+#if CUDNN_VERSION >= 7100  // To run conv_fusion, the version of cudnn must be
+                           // guaranteed at least v7
+        "conv_elementwise_add_act_fuse_pass",   //
+        "conv_elementwise_add2_act_fuse_pass",  //
+        "conv_elementwise_add_fuse_pass",       //
+#endif
+  });
+
+  for (int i = 6; i >= 3; i--) {
+    passes_.push_back("transpose_flatten" + std::to_string(i) +
+                      "_concat_fuse_pass");
+  }
+  use_gpu_ = true;
+}
+
 void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) {
  analysis_passes_.push_back(pass);
 }

+CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
+  // NOTE the large fusions should be located in the front, so that they will
+  // not be damaged by smaller ones.
+  passes_.assign({
+      "infer_clean_graph_pass",         //
+      "attention_lstm_fuse_pass",       //
+      "seqpool_concat_fuse_pass",       //
+      "seqconv_eltadd_relu_fuse_pass",  //
+      // "embedding_fc_lstm_fuse_pass", //
+      "fc_lstm_fuse_pass",             //
+      "mul_lstm_fuse_pass",            //
+      "fc_gru_fuse_pass",              //
+      "mul_gru_fuse_pass",             //
+      "seq_concat_fc_fuse_pass",       //
+      "fc_fuse_pass",                  //
+      "repeated_fc_relu_fuse_pass",    //
+      "squared_mat_sub_fuse_pass",     //
+      "conv_bn_fuse_pass",             //
+      "conv_eltwiseadd_bn_fuse_pass",  //
+      "is_test_pass",                  //
+      "identity_scale_op_clean_pass",  //
+  });
+  use_gpu_ = false;
+}
 }  // namespace paddle
--- a/paddle/fluid/inference/api/paddle_pass_builder.h
+++ b/paddle/fluid/inference/api/paddle_pass_builder.h
@@ -97,30 +97,7 @@ class PassStrategy : public PaddlePassBuilder {
 */
 class CpuPassStrategy : public PassStrategy {
 public:
-  CpuPassStrategy() : PassStrategy({}) {
-    // NOTE the large fusions should be located in the front, so that they will
-    // not be damaged by smaller ones.
-    passes_.assign({
-        "infer_clean_graph_pass",         //
-        "attention_lstm_fuse_pass",       //
-        "seqpool_concat_fuse_pass",       //
-        "seqconv_eltadd_relu_fuse_pass",  //
-        // "embedding_fc_lstm_fuse_pass", //
-        "fc_lstm_fuse_pass",             //
-        "mul_lstm_fuse_pass",            //
-        "fc_gru_fuse_pass",              //
-        "mul_gru_fuse_pass",             //
-        "seq_concat_fc_fuse_pass",       //
-        "fc_fuse_pass",                  //
-        "repeated_fc_relu_fuse_pass",    //
-        "squared_mat_sub_fuse_pass",     //
-        "conv_bn_fuse_pass",             //
-        "conv_eltwiseadd_bn_fuse_pass",  //
-        "is_test_pass",                  //
-        "identity_scale_op_clean_pass",  //
-    });
-    use_gpu_ = false;
-  }
+  CpuPassStrategy();

  explicit CpuPassStrategy(const CpuPassStrategy &other)
      : PassStrategy(other.AllPasses()) {}
@@ -153,27 +130,7 @@ class CpuPassStrategy : public PassStrategy {
 */
 class GpuPassStrategy : public PassStrategy {
 public:
-  GpuPassStrategy() : PassStrategy({}) {
-    passes_.assign({
-      "infer_clean_graph_pass",                        //
-          "identity_scale_op_clean_pass",              //
-          "conv_affine_channel_fuse_pass",             //
-          "conv_eltwiseadd_affine_channel_fuse_pass",  //
-          "conv_bn_fuse_pass",                         //
-#if CUDNN_VERSION >= 7100  // To run conv_fusion, the version of cudnn must be
-                           // guaranteed at least v7
-          "conv_elementwise_add_act_fuse_pass",   //
-          "conv_elementwise_add2_act_fuse_pass",  //
-          "conv_elementwise_add_fuse_pass",       //
-#endif
-    });
-
-    for (int i = 6; i >= 3; i--) {
-      passes_.push_back("transpose_flatten" + std::to_string(i) +
-                        "_concat_fuse_pass");
-    }
-    use_gpu_ = true;
-  }
+  GpuPassStrategy();

  explicit GpuPassStrategy(const GpuPassStrategy &other)
      : PassStrategy(other.AllPasses()) {

--- a/paddle/fluid/operators/controlflow/compare_op.cc
+++ b/paddle/fluid/operators/controlflow/compare_op.cc
@@ -51,6 +51,11 @@ class CompareOpProtoMaker : public framework::OpProtoAndCheckerMaker {
                                  comment.type));
    AddInput("Y", string::Sprintf("the right hand operand of %s operator",
                                  comment.type));
+    AddAttr<int>(
+        "axis",
+        "The start dimension index for broadcasting Y onto X. [default -1]")
+        .SetDefault(-1)
+        .EqualGreaterThan(-1);
    AddAttr<bool>("force_cpu",
                  "Force fill output variable to cpu "
                  "memory. Otherwise, fill output variable to the running "
@@ -64,11 +69,6 @@ N-dim tensor. X and Y could be any type.  The each element of the Out tensor is
 calculated by $%s$
 )DOC",
                               comment.equation));
-    AddAttr<int>(
-        "axis",
-        "The start dimension index for broadcasting Y onto X. [default -1]")
-        .SetDefault(-1)
-        .EqualGreaterThan(-1);
  }
 };


--- a/paddle/fluid/operators/detection/density_prior_box_op.h
+++ b/paddle/fluid/operators/detection/density_prior_box_op.h
@@ -72,7 +72,7 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
 #ifdef PADDLE_WITH_MKLML
 #pragma omp parallel for
 #endif
-    for (int i = 0; i < fixed_ratios.size(); i++) {
+    for (size_t i = 0; i < fixed_ratios.size(); i++) {
      sqrt_fixed_ratios.push_back(sqrt(fixed_ratios[i]));
    }

@@ -115,11 +115,10 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
      }
    }
    if (clip) {
-      platform::Transform<platform::CPUDeviceContext> trans;
-      ClipFunctor<T> clip_func;
-      trans(ctx.template device_context<platform::CPUDeviceContext>(),
-            boxes->data<T>(), boxes->data<T>() + boxes->numel(),
-            boxes->data<T>(), clip_func);
+      T* dt = boxes->data<T>();
+      std::transform(dt, dt + boxes->numel(), dt, [](T v) -> T {
+        return std::min<T>(std::max<T>(v, 0.), 1.);
+      });
    }
    framework::Tensor var_t;
    var_t.mutable_data<T>(
@@ -141,7 +140,7 @@ class DensityPriorBoxOpKernel : public framework::OpKernel<T> {
 #pragma omp parallel for collapse(2)
 #endif
    for (int i = 0; i < box_num; ++i) {
-      for (int j = 0; j < variances.size(); ++j) {
+      for (size_t j = 0; j < variances.size(); ++j) {
        e_vars(i, j) = variances[j];
      }
    }

--- a/paddle/fluid/operators/detection/prior_box_op.h
+++ b/paddle/fluid/operators/detection/prior_box_op.h
@@ -46,13 +46,6 @@ inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior,
  }
 }

-template <typename T>
-struct ClipFunctor {
-  HOSTDEVICE inline T operator()(T in) const {
-    return std::min<T>(std::max<T>(in, 0.), 1.);
-  }
-};
-
 template <typename T>
 class PriorBoxOpKernel : public framework::OpKernel<T> {
 public:
@@ -101,31 +94,30 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
    boxes->mutable_data<T>(ctx.GetPlace());
    vars->mutable_data<T>(ctx.GetPlace());

-    auto e_boxes = framework::EigenTensor<T, 4>::From(*boxes);
+    T* b_t = boxes->data<T>();
    for (int h = 0; h < feature_height; ++h) {
      for (int w = 0; w < feature_width; ++w) {
        T center_x = (w + offset) * step_width;
        T center_y = (h + offset) * step_height;
        T box_width, box_height;
-        int idx = 0;
        for (size_t s = 0; s < min_sizes.size(); ++s) {
          auto min_size = min_sizes[s];
          if (min_max_aspect_ratios_order) {
            box_width = box_height = min_size / 2.;
-            e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
-            e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
-            e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
-            e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
-            idx++;
+            b_t[0] = (center_x - box_width) / img_width;
+            b_t[1] = (center_y - box_height) / img_height;
+            b_t[2] = (center_x + box_width) / img_width;
+            b_t[3] = (center_y + box_height) / img_height;
+            b_t += 4;
            if (max_sizes.size() > 0) {
              auto max_size = max_sizes[s];
              // square prior with size sqrt(minSize * maxSize)
              box_width = box_height = sqrt(min_size * max_size) / 2.;
-              e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
-              e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
-              e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
-              e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
-              idx++;
+              b_t[0] = (center_x - box_width) / img_width;
+              b_t[1] = (center_y - box_height) / img_height;
+              b_t[2] = (center_x + box_width) / img_width;
+              b_t[3] = (center_y + box_height) / img_height;
+              b_t += 4;
            }
            // priors with different aspect ratios
            for (size_t r = 0; r < aspect_ratios.size(); ++r) {
@@ -135,11 +127,11 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
              }
              box_width = min_size * sqrt(ar) / 2.;
              box_height = min_size / sqrt(ar) / 2.;
-              e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
-              e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
-              e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
-              e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
-              idx++;
+              b_t[0] = (center_x - box_width) / img_width;
+              b_t[1] = (center_y - box_height) / img_height;
+              b_t[2] = (center_x + box_width) / img_width;
+              b_t[3] = (center_y + box_height) / img_height;
+              b_t += 4;
            }
          } else {
            // priors with different aspect ratios
@@ -147,21 +139,21 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
              float ar = aspect_ratios[r];
              box_width = min_size * sqrt(ar) / 2.;
              box_height = min_size / sqrt(ar) / 2.;
-              e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
-              e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
-              e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
-              e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
-              idx++;
+              b_t[0] = (center_x - box_width) / img_width;
+              b_t[1] = (center_y - box_height) / img_height;
+              b_t[2] = (center_x + box_width) / img_width;
+              b_t[3] = (center_y + box_height) / img_height;
+              b_t += 4;
            }
            if (max_sizes.size() > 0) {
              auto max_size = max_sizes[s];
              // square prior with size sqrt(minSize * maxSize)
              box_width = box_height = sqrt(min_size * max_size) / 2.;
-              e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
-              e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
-              e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
-              e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
-              idx++;
+              b_t[0] = (center_x - box_width) / img_width;
+              b_t[1] = (center_y - box_height) / img_height;
+              b_t[2] = (center_x + box_width) / img_width;
+              b_t[3] = (center_y + box_height) / img_height;
+              b_t += 4;
            }
          }
        }
@@ -169,11 +161,10 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
    }

    if (clip) {
-      platform::Transform<platform::CPUDeviceContext> trans;
-      ClipFunctor<T> clip_func;
-      trans(ctx.template device_context<platform::CPUDeviceContext>(),
-            boxes->data<T>(), boxes->data<T>() + boxes->numel(),
-            boxes->data<T>(), clip_func);
+      T* dt = boxes->data<T>();
+      std::transform(dt, dt + boxes->numel(), dt, [](T v) -> T {
+        return std::min<T>(std::max<T>(v, 0.), 1.);
+      });
    }

    framework::Tensor var_t;

--- a/paddle/fluid/operators/detection/yolov3_loss_op.cc
+++ b/paddle/fluid/operators/detection/yolov3_loss_op.cc
@@ -144,34 +144,40 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
                   "The ignore threshold to ignore confidence loss.")
        .SetDefault(0.7);
    AddComment(R"DOC(
-         This operator generate yolov3 loss by given predict result and ground
+         This operator generates yolov3 loss based on given predict result and ground
         truth boxes.
         
         The output of previous network is in shape [N, C, H, W], while H and W
-         should be the same, specify the grid size, each grid point predict given
-         number boxes, this given number is specified by anchors, it should be 
-         half anchors length, which following will be represented as S. In the 
-         second dimention(the channel dimention), C should be S * (class_num + 5),
-         class_num is the box categoriy number of source dataset(such as coco), 
-         so in the second dimention, stores 4 box location coordinates x, y, w, h 
-         and confidence score of the box and class one-hot key of each anchor box.
+         should be the same, H and W specify the grid size, each grid point predict 
+         given number boxes, this given number, which following will be represented as S,
+         is specified by the number of anchors, In the second dimension(the channel
+         dimension), C should be equal to S * (class_num + 5), class_num is the object 
+         category number of source dataset(such as 80 in coco dataset), so in the 
+         second(channel) dimension, apart from 4 box location coordinates x, y, w, h, 
+         also includes confidence score of the box and class one-hot key of each anchor box.

-         While the 4 location coordinates if $$tx, ty, tw, th$$, the box predictions
-         correspnd to:
+         Assume the 4 location coordinates are :math:`t_x, t_y, t_w, t_h`, the box predictions
+         should be as follows:

         $$
-         b_x = \sigma(t_x) + c_x
-         b_y = \sigma(t_y) + c_y
+         b_x = \\sigma(t_x) + c_x
+         $$
+         $$
+         b_y = \\sigma(t_y) + c_y
+         $$
+         $$
         b_w = p_w e^{t_w}
+         $$
+         $$
         b_h = p_h e^{t_h}
         $$

-         While $$c_x, c_y$$ is the left top corner of current grid and $$p_w, p_h$$
-         is specified by anchors.
+         In the equation above, :math:`c_x, c_y` is the left top corner of current grid
+         and :math:`p_w, p_h` is specified by anchors.

         As for confidence score, it is the logistic regression value of IoU between
         anchor boxes and ground truth boxes, the score of the anchor box which has 
-         the max IoU should be 1, and if the anchor box has IoU bigger then ignore 
+         the max IoU should be 1, and if the anchor box has IoU bigger than ignore 
         thresh, the confidence score loss of this anchor box will be ignored.

         Therefore, the yolov3 loss consist of three major parts, box location loss,
@@ -186,13 +192,13 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {

         In order to trade off box coordinate losses between big boxes and small 
         boxes, box coordinate losses will be mutiplied by scale weight, which is
-         calculated as follow.
+         calculated as follows.

         $$
         weight_{box} = 2.0 - t_w * t_h
         $$

-         Final loss will be represented as follow.
+         Final loss will be represented as follows.

         $$
         loss = (loss_{xy} + loss_{wh}) * weight_{box}

--- a/paddle/fluid/operators/elementwise/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op.h
@@ -264,6 +264,23 @@ class ElementwiseOpInplace : public framework::InplaceInToOut {
  }
 };

+class ElementwiseGradOpInplace : public framework::InplaceInToOut {
+ public:
+  using framework::InplaceInToOut::InplaceInToOut;
+
+ protected:
+  std::unordered_map<std::string, std::string> Apply(
+      const framework::OpDesc &op_desc,
+      framework::BlockDesc *block) const override {
+    std::unordered_map<std::string, std::string> ret;
+    if (block->HasVar(framework::GradVarName("X")) &&
+        block->HasVar(framework::GradVarName("Out"))) {
+      ret[framework::GradVarName("Out")] = framework::GradVarName("X");
+    }
+    return ret;
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle

@@ -316,4 +333,5 @@ class ElementwiseOpInplace : public framework::InplaceInToOut {
                    op_type##GradMaker,                                \
                    ::paddle::operators::ElementwiseOpInplace);        \
  REGISTER_OPERATOR(op_type##_grad,                                    \
-                    ::paddle::operators::ElementwiseOpExplicitGrad)
+                    ::paddle::operators::ElementwiseOpExplicitGrad,    \
+                    ::paddle::operators::ElementwiseGradOpInplace)
--- a/paddle/fluid/operators/lstm_op.h
+++ b/paddle/fluid/operators/lstm_op.h
@@ -311,6 +311,10 @@ class LSTMGradKernel : public framework::OpKernel<T> {
        lstm_grad.prev_state_grad = c0_g ? ordered_c0_g.data<T>() : nullptr;
      }

+      // lstm_value.output_value not used in bp, set to nullptr
+      // lstm_grad.state_active_grad not used in bp, set to nullptr
+      lstm_value.output_value = nullptr;
+      lstm_grad.state_active_grad = nullptr;
      int cur_batch_size = bend - bstart;
      math::LstmUnitGradFunctor<DeviceContext, T>::compute(
          device_ctx, lstm_value, lstm_grad, frame_size, cur_batch_size,

--- a/paddle/fluid/operators/lstmp_op.h
+++ b/paddle/fluid/operators/lstmp_op.h
@@ -405,6 +405,11 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
      }

      int cur_batch_size = bend - bstart;
+      // lstmp_value.output_value not used in bp, set to null
+      // lstmp_grad.state_active_grad not used in bp, set to null
+      lstmp_value.output_value = nullptr;
+      lstmp_grad.state_active_grad = nullptr;
+
      math::LstmUnitGradFunctor<DeviceContext, T>::compute(
          device_ctx, lstmp_value, lstmp_grad, frame_size, cur_batch_size,
          gate_act, cell_act, cand_act);

--- a/paddle/fluid/operators/pool_op.cc
+++ b/paddle/fluid/operators/pool_op.cc
@@ -168,9 +168,10 @@ void Pool2dOpMaker::Make() {
                            "be ignored.");  // TODO(Chengduo): Add checker.
                                             // (Currently,
  // TypedAttrChecker don't support vector type.)
-  AddAttr<bool>("global_pooling",
-                "(bool, default false) Whether to use the global pooling. "
-                "If global_pooling = true, ksize and paddings will be ignored.")
+  AddAttr<bool>(
+      "global_pooling",
+      "(bool, default false) Whether to use the global pooling. "
+      "If global_pooling = true, kernel size and paddings will be ignored.")
      .SetDefault(false);
  AddAttr<std::vector<int>>("strides",
                            "(vector<int>, default {1, 1}), strides(height, "
@@ -182,7 +183,7 @@ void Pool2dOpMaker::Make() {
      "paddings",
      "(vector<int>, default {0,0}), paddings(height, width) of pooling "
      "operator."
-      "If global_pooling = true, paddings and ksize will be ignored.")
+      "If global_pooling = true, paddings and kernel size will be ignored.")
      .SetDefault({0, 0});
  AddAttr<bool>(
      "exclusive",
@@ -204,7 +205,7 @@ void Pool2dOpMaker::Make() {
      .SetDefault(false);
  AddAttr<bool>(
      "ceil_mode",
-      "(bool, default false) Wether to use the ceil function to calculate "
+      "(bool, default false) Whether to use the ceil function to calculate "
      "output height and width. False is the default. If it is set to False, "
      "the floor function will be used.")
      .SetDefault(false);
@@ -259,31 +260,40 @@ Example:
       W_{out} = \\frac{(W_{in} - ksize[1] + 2 * paddings[1] + strides[1] - 1)}{strides[1]} + 1
       $$

-  For exclusive = true:
+  For exclusive = false:
       $$
       hstart = i * strides[0] - paddings[0]
+       $$
+       $$
       hend = hstart + ksize[0]
+       $$
+       $$
       wstart = j * strides[1] - paddings[1]
+       $$
+       $$
       wend = wstart + ksize[1]
+       $$
+       $$
       Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{ksize[0] * ksize[1]}
       $$
-  For exclusive = false:
+
+  For exclusive = true:
       $$
       hstart = max(0, i * strides[0] - paddings[0])
+       $$
+       $$
       hend = min(H, hstart + ksize[0])
+       $$
+       $$
       wstart = max(0, j * strides[1] - paddings[1])
+       $$
+       $$
       wend = min(W, wstart + ksize[1])
+       $$
+       $$
       Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
       $$

-  For adaptive = true:
-      $$
-      hstart = floor(i * H_{in} / H_{out})
-      hend = ceil((i + 1) * H_{in} / H_{out})
-      wstart = floor(j * W_{in} / W_{out})
-      wend = ceil((j + 1) * W_{in} / W_{out})
-      Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
-      $$
 )DOC");
 }

@@ -324,7 +334,7 @@ void Pool3dOpMaker::Make() {
  AddAttr<bool>(
      "global_pooling",
      "(bool, default false) Whether to use the global pooling. "
-      "If global_pooling = true, ksize and paddings wille be ignored.")
+      "If global_pooling = true, kernel size and paddings will be ignored.")
      .SetDefault(false);
  AddAttr<std::vector<int>>(
      "strides",
@@ -359,7 +369,7 @@ void Pool3dOpMaker::Make() {
      .SetDefault(false);
  AddAttr<bool>(
      "ceil_mode",
-      "(bool, default false) Wether to use the ceil function to calculate "
+      "(bool, default false) Whether to use the ceil function to calculate "
      "output height and width. False is the default. If it is set to False, "
      "the floor function will be used.")
      .SetDefault(false);
@@ -392,48 +402,68 @@ Example:
  Output:
       Out shape: $(N, C, D_{out}, H_{out}, W_{out})$
  For ceil_mode = false:
-  $$
-       D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\
-       H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 \\
-       W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1
-  $$
+       $$
+       D_{out} = \\frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1
+       $$
+       $$
+       H_{out} = \\frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[2]} + 1
+       $$
+       $$
+       W_{out} = \\frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1
+       $$
  For ceil_mode = true:
-  $$
-       D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0] + strides[0] -1)}{strides[0]} + 1 \\
-       H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1] + strides[1] -1)}{strides[1]} + 1 \\
-       W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2] + strides[2] -1)}{strides[2]} + 1
-  $$
-  For exclusive = true:
-  $$
-  dstart = i * strides[0] - paddings[0]
-  dend = dstart + ksize[0]
-  hstart = j * strides[1] - paddings[1]
-  hend = hstart + ksize[1]
-  wstart = k * strides[2] - paddings[2]
-  wend = wstart + ksize[2]
-  Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{ksize[0] * ksize[1] * ksize[2]}
-  $$
+       $$
+       D_{out} = \\frac{(D_{in} - ksize[0] + 2 * paddings[0] + strides[0] -1)}{strides[0]} + 1
+       $$
+       $$
+       H_{out} = \\frac{(H_{in} - ksize[1] + 2 * paddings[1] + strides[1] -1)}{strides[1]} + 1
+       $$
+       $$
+       W_{out} = \\frac{(W_{in} - ksize[2] + 2 * paddings[2] + strides[2] -1)}{strides[2]} + 1
+       $$
+
  For exclusive = false:
-  $$
-  dstart = max(0, i * strides[0] - paddings[0])
-  dend = min(D, dstart + ksize[0])
-  hstart = max(0, j * strides[1] - paddings[1])
-  hend = min(H, hstart + ksize[1])
-  wstart = max(0, k * strides[2] - paddings[2])
-  wend = min(W, wstart + ksize[2])
-  Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
-  $$
-
-  For adaptive = true:
-  $$
-  dstart = floor(i * D_{in} / D_{out})
-  dend = ceil((i + 1) * D_{in} / D_{out})
-  hstart = floor(j * H_{in} / H_{out})
-  hend = ceil((j + 1) * H_{in} / H_{out})
-  wstart = floor(k * W_{in} / W_{out})
-  wend = ceil((k + 1) * W_{in} / W_{out})
-  Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
-  $$
+       $$
+       dstart = i * strides[0] - paddings[0]
+       $$
+       $$
+       dend = dstart + ksize[0]
+       $$
+       $$
+       hstart = j * strides[1] - paddings[1]
+       $$
+       $$
+       hend = hstart + ksize[1]
+       $$
+       $$
+       wstart = k * strides[2] - paddings[2]
+       $$
+       $$
+       wend = wstart + ksize[2]
+       $$
+       $$
+       Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{ksize[0] * ksize[1] * ksize[2]}
+       $$
+
+  For exclusive = true:
+       $$
+       dstart = max(0, i * strides[0] - paddings[0])
+       $$
+       $$
+       dend = min(D, dstart + ksize[0])
+       $$
+       $$
+       hend = min(H, hstart + ksize[1])
+       $$
+       $$
+       wstart = max(0, k * strides[2] - paddings[2])
+       $$
+       $$
+       wend = min(W, wstart + ksize[2])
+       $$
+       $$
+       Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
+       $$

 )DOC");
 }

--- a/paddle/fluid/operators/random_crop_op.h
+++ b/paddle/fluid/operators/random_crop_op.h
@@ -121,7 +121,7 @@ struct RandomCropFunctor {
  HOSTDEVICE void operator()(size_t ins_idx) {
    typename Random<DeviceContext>::Engine engine(seed_);
    engine.discard(ins_idx * (rank_ - num_batchsize_dims_));
-    size_t offsets[9];
+    size_t offsets[9] = {};
    for (int i = num_batchsize_dims_; i < rank_; ++i) {
      typename Random<DeviceContext>::template UniformIntDist<size_t> dist(
          0, x_dims_[i] - out_dims_[i]);

--- a/paddle/fluid/operators/slice_op.cu
+++ b/paddle/fluid/operators/slice_op.cu
@@ -12,18 +12,138 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include <thrust/device_vector.h>
+#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/slice_op.h"
+#include "paddle/fluid/platform/cuda_device_function.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+#include "paddle/fluid/platform/float16.h"
+
+namespace paddle {
+namespace operators {
+
+using platform::PADDLE_CUDA_NUM_THREADS;
+
+template <size_t D>
+__global__ void Padding(const paddle::platform::float16* d_out,
+                        const int* out_dims, const int* in_dims,
+                        const int* offsets, int64_t n,
+                        paddle::platform::float16* d_in) {
+  int64_t out_idx = threadIdx.x + blockDim.x * blockIdx.x;
+  if (out_idx < n) {
+    int64_t out_idx_tmp = out_idx;
+    int coords[D] = {0};
+    for (int i = D - 1; i >= 0; --i) {
+      coords[i] = out_idx_tmp % out_dims[i];
+      out_idx_tmp /= out_dims[i];
+      coords[i] += offsets[i];
+    }
+
+    int64_t in_idx = 0;
+    for (int i = 0; i < D; ++i) {
+      in_idx = in_idx * in_dims[i] + coords[i];
+    }
+
+    d_in[in_idx] = d_out[out_idx];
+  }
+}
+
+template <>
+class SliceGradKernel<paddle::platform::CUDADeviceContext,
+                      paddle::platform::float16>
+    : public framework::OpKernel<paddle::platform::float16> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* d_in = ctx.Output<framework::Tensor>(framework::GradVarName("Input"));
+    d_in->mutable_data<paddle::platform::float16>(ctx.GetPlace());
+
+    auto out_dims = d_out->dims();
+    auto in_dims = d_in->dims();
+    int rank = out_dims.size();
+    std::vector<int> offsets(rank, 0);
+    auto axes = ctx.Attr<std::vector<int>>("axes");
+    auto starts = ctx.Attr<std::vector<int>>("starts");
+
+    for (size_t i = 0; i < starts.size(); ++i) {
+      if (starts[i] < 0) {
+        starts[i] += in_dims[axes[i]];
+      }
+      offsets[axes[i]] = std::max(starts[i], 0);
+    }
+
+    math::SetConstant<paddle::platform::CUDADeviceContext,
+                      paddle::platform::float16>
+        set_zero;
+    auto& dev_ctx =
+        ctx.template device_context<paddle::platform::CUDADeviceContext>();
+    set_zero(dev_ctx, d_in, static_cast<paddle::platform::float16>(0));
+
+    int64_t numel = d_out->numel();
+    dim3 blocks((numel - 1) / PADDLE_CUDA_NUM_THREADS + 1);
+    dim3 threads(PADDLE_CUDA_NUM_THREADS);
+    auto stream = ctx.cuda_device_context().stream();
+
+    auto out_shape = framework::vectorize2int(out_dims);
+    thrust::device_vector<int> out_dims_vec(out_shape.begin(), out_shape.end());
+    auto in_shape = framework::vectorize2int(in_dims);
+    thrust::device_vector<int> in_dims_vec(in_shape.begin(), in_shape.end());
+    thrust::device_vector<int> offsets_vec(offsets.begin(), offsets.end());
+    const int* out_dims_ptr = thrust::raw_pointer_cast(out_dims_vec.data());
+    const int* in_dims_ptr = thrust::raw_pointer_cast(in_dims_vec.data());
+    const int* offsets_ptr = thrust::raw_pointer_cast(offsets_vec.data());
+
+    switch (rank) {
+      case 1:
+        Padding<1><<<blocks, threads, 0, stream>>>(
+            d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
+            offsets_ptr, numel, d_in->data<paddle::platform::float16>());
+        break;
+      case 2:
+        Padding<2><<<blocks, threads, 0, stream>>>(
+            d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
+            offsets_ptr, numel, d_in->data<paddle::platform::float16>());
+        break;
+      case 3:
+        Padding<3><<<blocks, threads, 0, stream>>>(
+            d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
+            offsets_ptr, numel, d_in->data<paddle::platform::float16>());
+        break;
+      case 4:
+        Padding<4><<<blocks, threads, 0, stream>>>(
+            d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
+            offsets_ptr, numel, d_in->data<paddle::platform::float16>());
+        break;
+      case 5:
+        Padding<5><<<blocks, threads, 0, stream>>>(
+            d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
+            offsets_ptr, numel, d_in->data<paddle::platform::float16>());
+        break;
+      case 6:
+        Padding<6><<<blocks, threads, 0, stream>>>(
+            d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
+            offsets_ptr, numel, d_in->data<paddle::platform::float16>());
+        break;
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle

 namespace ops = paddle::operators;
+namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(
    slice, ops::SliceKernel<paddle::platform::CUDADeviceContext, float>,
    ops::SliceKernel<paddle::platform::CUDADeviceContext, double>,
    ops::SliceKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::SliceKernel<paddle::platform::CUDADeviceContext, int64_t>);
+    ops::SliceKernel<paddle::platform::CUDADeviceContext, int64_t>,
+    ops::SliceKernel<paddle::platform::CUDADeviceContext, plat::float16>);

 REGISTER_OP_CUDA_KERNEL(
    slice_grad,
    ops::SliceGradKernel<paddle::platform::CUDADeviceContext, float>,
    ops::SliceGradKernel<paddle::platform::CUDADeviceContext, double>,
    ops::SliceGradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::SliceGradKernel<paddle::platform::CUDADeviceContext, int64_t>);
+    ops::SliceGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
+    ops::SliceGradKernel<paddle::platform::CUDADeviceContext, plat::float16>);
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
@@ -439,7 +439,8 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
        context.Input<Tensor>(framework::GradVarName("Loss"))->data<T>();
    Tensor* logit_grad =
        context.Output<Tensor>(framework::GradVarName("Logits"));
-    logit_grad->ShareDataWith(*context.Input<Tensor>("Softmax"));
+    framework::TensorCopy(*context.Input<Tensor>("Softmax"), context.GetPlace(),
+                          context.device_context(), logit_grad);
    T* logit_grad_data = logit_grad->data<T>();

    const int batch_size = logit_grad->dims()[0];

--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -398,9 +398,9 @@ PYBIND11_MODULE(core, m) {
           py::arg("recursive_sequence_lengths"), R"DOC(
           Set LoD of the LoDTensor according to recursive sequence length.

-           For example, if recursive_sequence_lengths=[2, 3], meaning that
+           For example, if recursive_sequence_lengths=[[2, 3]], meaning that
           there are two sequences with length 2 and 3 respectively, the 
-           corresponding lod would be [0, 2, 2+3], i.e, [0, 2, 5].  
+           corresponding lod would be [[0, 2, 2+3]], i.e, [[0, 2, 5]].  

           Args:
                recursive_sequence_lengths (List[List[int]]): sequence lengths. 
@@ -803,9 +803,11 @@ All parameter, weight, gradient are variables in Paddle.
      .def(py::init<const platform::Place &>())
      .def("close", &Executor::Close)
      .def("run", [](Executor &self, const ProgramDesc &prog, Scope *scope,
-                     int block_id, bool create_local_scope, bool create_vars) {
+                     int block_id, bool create_local_scope, bool create_vars,
+                     const std::vector<std::string> &fetch_vars) {
        pybind11::gil_scoped_release release;
-        self.Run(prog, scope, block_id, create_local_scope, create_vars);
+        self.Run(prog, scope, block_id, create_local_scope, create_vars,
+                 fetch_vars);
      });

  m.def("init_gflags", framework::InitGflags);

--- a/paddle/scripts/fast_install.sh
+++ b/paddle/scripts/fast_install.sh
--- a/python/paddle/__init__.py
+++ b/python/paddle/__init__.py
@@ -25,4 +25,5 @@ import paddle.reader
 import paddle.dataset
 import paddle.batch
 import paddle.compat
+import paddle.distributed
 batch = batch.batch
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -161,7 +161,6 @@ def __bootstrap__():
            'times_excess_than_required_tmp_allocation',
            'enable_inplace_whitelist'
        ]
-
    core.init_gflags([sys.argv[0]] +
                     ["--tryfromenv=" + ",".join(read_env_flags)])
    core.init_glog(sys.argv[0])

--- a/python/paddle/fluid/compiler.py
+++ b/python/paddle/fluid/compiler.py
@@ -177,7 +177,10 @@ class CompiledProgram(object):

        # FIXME(dzhwinter): enable_inplace should be after memory_optimize
        # if turn on python memory optimize, turn off the inplace_pass.
-        self._build_strategy.enable_inplace = False if self._program._is_mem_optimized else True
+        if self._build_strategy.memory_optimize is True:
+            self._build_strategy.memory_optimize = False if self._program._is_mem_optimized else True
+        if self._build_strategy.enable_inplace is True:
+            self._build_strategy.enable_inplace = False if self._program._is_mem_optimized else True

        if self._build_strategy.num_trainers > 1 and trainers_endpoints:
            assert self._build_strategy.num_trainers == len(

--- a/python/paddle/fluid/contrib/int8_inference/README.md
+++ b/python/paddle/fluid/contrib/int8_inference/README.md
@@ -63,10 +63,10 @@ Notes:
 ## 4. How to reproduce the results
 * Small dataset
 ```bash
-python python/paddle/fluid/contrib/tests/test_calibration.py
+FLAGS_use_mkldnn=true python python/paddle/fluid/contrib/tests/test_calibration.py
 ```

 * Full dataset
 ```bash
-DATASET=full python python/paddle/fluid/contrib/tests/test_calibration.py
+FLAGS_use_mkldnn=true DATASET=full python python/paddle/fluid/contrib/tests/test_calibration.py
 ```
--- a/python/paddle/fluid/contrib/tests/CMakeLists.txt
+++ b/python/paddle/fluid/contrib/tests/CMakeLists.txt
@@ -6,5 +6,9 @@ if(APPLE OR WIN32 OR NOT WITH_MKL)
 endif()

 foreach(src ${TEST_OPS})
-    py_test(${src} SRCS ${src}.py)
+    if(src MATCHES "test_calibration")
+        py_test(${src} SRCS ${src}.py ENVS FLAGS_use_mkldnn=true)
+    else()
+        py_test(${src} SRCS ${src}.py)
+    endif()
 endforeach()
--- a/python/paddle/fluid/contrib/tests/test_calibration.py
+++ b/python/paddle/fluid/contrib/tests/test_calibration.py
@@ -199,7 +199,6 @@ class TestCalibrationForResnet50(unittest.TestCase):

    def run_program(self, model_path, generate_int8=False, algo='direct'):
        image_shape = [3, 224, 224]
-        os.environ['FLAGS_use_mkldnn'] = 'True'

        fluid.memory_optimize(fluid.default_main_program())

@@ -241,9 +240,6 @@ class TestCalibrationForResnet50(unittest.TestCase):
            label = label.reshape([-1, 1])
            running_program = calibrator.sampling_program.clone(
            ) if generate_int8 else infer_program.clone()
-            for op in running_program.current_block().ops:
-                if op.has_attr("use_mkldnn"):
-                    op._set_attr("use_mkldnn", True)

            t1 = time.time()
            _, acc1, _ = exe.run(

--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -261,45 +261,42 @@ def _as_lodtensor(data, place):

 class Executor(object):
    """
-    An Executor in Python, only support the single-GPU running. For multi-cards, please refer to
-    ParallelExecutor.
-    Python executor takes a program, add feed operators and fetch operators to this program according
+    An Executor in Python, supports single/multiple-GPU running, and single/multiple-CPU running.
+    Python executor takes a program, adds feed operators and fetch operators to this program according
    to feed map and fetch_list. Feed map provides input data for the program. fetch_list provides
-    the variables(or names) that user want to get after program run. Note: the executor will run all
+    the variables(or names) that user wants to get after program runs. Note: the executor will run all
    operators in the program but not only the operators dependent by the fetch_list.
-    It store the global variables into the global scope, and create a local scope for the temporary
-    variables. The local scope contents will be discarded after every minibatch forward/backward finished.
-    But the global scope variables will be persistent through different runs.
-    All of ops in program will be running in sequence.
+    It stores the global variables into the global scope, and creates a local scope for the temporary
+    variables. The contents in local scope may be discarded after every minibatch forward/backward
+    finished. But the global scope variables will be persistent through different runs.


    Example:
-    .. code-block:: python
-        # First create the Executor.
-        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-        exe = fluid.Executor(place)
-
-        # Run the startup program once and only once.
-        # Not need to optimize/compile the startup program.
-        exe.run(fluid.default_startup_program())
-
-        # Run the main program directly without compile.
-        loss, = exe.run(fluid.default_main_program(),
-                        feed=feed_dict,
-                        fetch_list=[loss.name])
-        # Or, compiled the program and run. See `CompiledProgram` for more detail.
-        compiled_prog = compiler.CompiledProgram(
-            fluid.default_main_program()).with_data_parallel(
-            loss_name=loss.name)
-        loss, = exe.run(compiled_prog,
-                        feed=feed_dict,
-                        fetch_list=[loss.name])
+
+        .. code-block:: python
+
+            # First create the Executor.
+            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+            exe = fluid.Executor(place)
+
+            # Run the startup program once and only once.
+            # Not need to optimize/compile the startup program.
+            exe.run(fluid.default_startup_program())
+
+            # Run the main program directly without compile.
+            loss, = exe.run(fluid.default_main_program(),
+                            feed=feed_dict,
+                            fetch_list=[loss.name])
+            # Or, compiled the program and run. See `CompiledProgram` for more detail.
+            compiled_prog = compiler.CompiledProgram(
+                fluid.default_main_program()).with_data_parallel(
+                loss_name=loss.name)
+            loss, = exe.run(compiled_prog,
+                            feed=feed_dict,
+                            fetch_list=[loss.name])

    Args:
        place(core.CPUPlace|core.CUDAPlace(n)): indicate the executor run on which device
-
-    Note: For debugging complicated network in parallel-GPUs, you can test it on the executor.
-    They has the exactly same arguments, and expected the same results.
    """

    def __init__(self, place):
@@ -382,6 +379,12 @@ class Executor(object):
        ]
        return outs

+    '''
+    TODO(typhoonzero): Define "no longer use" meaning? Can user create
+    a new Executor for the same program and run?
+    TODO(panyx0718): Why ParallelExecutor doesn't have close?
+    '''
+
    def close(self):
        """
        Close this executor.
@@ -389,9 +392,6 @@ class Executor(object):
        You can no longer use this executor after calling this method.
        For the distributed training, this method would free the resource on PServers related to
        the current Trainer.
-        TODO(typhoonzero): Define "no longer use" meaning? Can user create
-        a new Executor for the same program and run?
-        TODO(panyx0718): Why ParallelExecutor doesn't have close?

        Example:
            >>> cpu = core.CPUPlace()

--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -555,7 +555,8 @@ class OpProtoHolder(object):
        return {
            core.op_proto_and_checker_maker.kOpRoleAttrName(),
            core.op_proto_and_checker_maker.kOpRoleVarAttrName(),
-            core.op_proto_and_checker_maker.kOpNameScopeAttrName()
+            core.op_proto_and_checker_maker.kOpNameScopeAttrName(),
+            core.op_proto_and_checker_maker.kOpCreationCallstackAttrName()
        }



--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -766,7 +766,10 @@ def _load_distributed_persistables(executor, dirname, main_program=None):
                    dtype=slice_var.dtype,
                    persistable=True)

-                dim1_flatten = reduce(lambda x, y: x * y, slice.shape[1:])
+                dim1_flatten = 1
+                if len(slice.shape) >= 2:
+                    dim1_flatten = reduce(lambda x, y: x * y, slice.shape[1:])
+
                start = int(offset / dim1_flatten)
                end = int(offset / dim1_flatten + slice.shape[0])

@@ -892,7 +895,7 @@ def save_inference_model(dirname,
                                     True is supported.

    Returns:
-        None
+        target_var_name_list(list): The fetch variables' name list

    Raises:
        ValueError: If `feed_var_names` is not a list of basestring.
@@ -945,11 +948,13 @@ def save_inference_model(dirname,
    # TODO(Superjomn) add an IR pass to remove 1-scale op.
    with program_guard(main_program):
        uniq_target_vars = []
-        for var in target_vars:
+        for i, var in enumerate(target_vars):
            if isinstance(var, Variable):
-                var1 = layers.scale(var, 1.)
-            uniq_target_vars.append(var1)
+                var = layers.scale(
+                    var, 1., name="save_infer_model/scale_{}".format(i))
+            uniq_target_vars.append(var)
        target_vars = uniq_target_vars
+    target_var_name_list = [var.name for var in target_vars]

    # when a pserver and a trainer running on the same machine, mkdir may conflict
    try:
@@ -1006,6 +1011,7 @@ def save_inference_model(dirname,
        params_filename = os.path.basename(params_filename)

    save_persistables(executor, dirname, main_program, params_filename)
+    return target_var_name_list


 def load_inference_model(dirname,

--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -506,9 +506,9 @@ class While(object):
    while loop control flow.

    Args:
-        cond (Variable): condition used to compare.
+        cond(Variable): condition used to compare.
        is_test(bool): A flag indicating whether execution is in test phase.
-        name (str): The name of this layer.
+        name(str): The name of this layer.

    Examples:
          .. code-block:: python
@@ -589,7 +589,8 @@ class While(object):


 def lod_rank_table(x, level=0):
-    """LoD Rank Table Operator. Given an input variable **x** and a level number
+    """
+    LoD Rank Table Operator. Given an input variable **x** and a level number
    of LoD, this layer creates a LodRankTable object. A LoDRankTable object
    contains a list of bi-element tuples. Each tuple consists of an index and
    a length, both of which are int type. Refering to specified level of LoD,
@@ -883,10 +884,8 @@ def less_than(x, y, force_cpu=None, cond=None, **ignored):
    return cond


-def equal(x, y, cond=None, **ignored):
+def equal(x, y, cond=None):
    """
-    **equal**
-
    This layer returns the truth value of :math:`x == y` elementwise.

    Args:
@@ -1458,7 +1457,6 @@ class DynamicRNN(object):

        Returns:
            The current timestep in the input sequence.
-
        """
        self._assert_in_rnn_block_("step_input")
        if not isinstance(x, Variable):
@@ -1535,8 +1533,7 @@ class DynamicRNN(object):
    @signature_safe_contextmanager
    def block(self):
        """
-        The block for user to define operators in RNN. See the class docstring
-        for more details.
+        The block for user to define operators in RNN.
        """
        if self.status != DynamicRNN.BEFORE_RNN:
            raise ValueError("rnn.block() can only be invoke once")
@@ -1640,8 +1637,7 @@ class DynamicRNN(object):
            dtype(str|numpy.dtype): The data type of the initialized memory.

        Returns:
-            the memory variable.
-
+            The memory variable.
        """
        self._assert_in_rnn_block_('memory')
        self._init_zero_idx_()
@@ -1740,7 +1736,7 @@ class DynamicRNN(object):

    def output(self, *outputs):
        """
-        mark the RNN output variables.
+        Mark the RNN output variables.

        Args:
            outputs: The output variables.

--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -545,15 +545,16 @@ def yolov3_loss(x,
        TypeError: Attr ignore_thresh of yolov3_loss must be a float number

    Examples:
-    .. code-block:: python
-
-        x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32')
-        gtbox = fluid.layers.data(name='gtbox', shape=[6, 5], dtype='float32')
-        gtlabel = fluid.layers.data(name='gtlabel', shape=[6, 1], dtype='int32')
-        anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326]
-        anchors = [0, 1, 2]
-        loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, class_num=80, anchors=anchors, 
-                                        ignore_thresh=0.5, downsample_ratio=32)
+      .. code-block:: python
+
+          x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32')
+          gtbox = fluid.layers.data(name='gtbox', shape=[6, 5], dtype='float32')
+          gtlabel = fluid.layers.data(name='gtlabel', shape=[6, 1], dtype='int32')
+          anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326]
+          anchor_mask = [0, 1, 2]
+          loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, gtlabel=gtlabel, anchors=anchors, 
+                                          anchor_mask=anchor_mask, class_num=80,
+                                          ignore_thresh=0.7, downsample_ratio=32)
    """
    helper = LayerHelper('yolov3_loss', **locals())


--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -56,7 +56,10 @@ def data(name,

    Args:
       name(str): The name/alias of the function
-       shape(list): Tuple declaring the shape.
+       shape(list): Tuple declaring the shape. If :code:`append_batch_size` is 
+                    True and there is no -1 inside :code:`shape`, it should be 
+                    considered as the shape of the each sample. Otherwise, it
+                    should be considered as the shape of the batched data.  
       append_batch_size(bool):
          1. If true, it prepends -1 to the shape.
            For example if shape=[1], the resulting shape is [-1, 1].

--- a/python/paddle/fluid/layers/layer_function_generator.py
+++ b/python/paddle/fluid/layers/layer_function_generator.py
@@ -24,7 +24,7 @@ from ..framework import OpProtoHolder, Variable, core, convert_np_dtype_to_dtype
 from ..layer_helper import LayerHelper

 __all__ = [
-    'deprecated', 'generate_layer_fn', 'generate_layer_fn_noattr', 'autodoc',
+    'deprecated', 'generate_layer_fn', 'generate_activation_fn', 'autodoc',
    'templatedoc'
 ]

@@ -89,6 +89,9 @@ def _generate_doc_string_(op_proto, additional_args_lines=None):
        buf.write('\n')

    skip_attrs = OpProtoHolder.generated_op_attr_names()
+    # attr use_mkldnn and is_test also should not be visible to users.
+    skip_attrs.add("use_mkldnn")
+    skip_attrs.add("is_test")

    for each_attr in op_proto.attrs:
        if each_attr.name in skip_attrs:
@@ -226,7 +229,7 @@ def generate_layer_fn(op_type):
    return func


-def generate_layer_fn_noattr(op_type):
+def generate_activation_fn(op_type):
    """Register the Python layer for an Operator without Attribute.

    Args:
@@ -246,6 +249,7 @@ def generate_layer_fn_noattr(op_type):

    func.__name__ = op_type
    func.__doc__ = _generate_doc_string_(op_proto)
+
    return func



--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -2441,7 +2441,7 @@ def pool2d(input,

          data = fluid.layers.data(
              name='data', shape=[3, 32, 32], dtype='float32')
-          conv2d = fluid.layers.pool2d(
+          pool2d = fluid.layers.pool2d(
                            input=data,
                            pool_size=2,
                            pool_type='max',
@@ -2490,6 +2490,7 @@ def pool2d(input,
    return pool_out


+@templatedoc()
 def pool3d(input,
           pool_size=-1,
           pool_type="max",
@@ -2501,13 +2502,19 @@ def pool3d(input,
           name=None,
           exclusive=True):
    """
-    This function adds the operator for pooling in 3-dimensions, using the
-    pooling configurations mentioned in input parameters.
+    ${comment}

    Args:
-        input (Variable): ${input_comment}
-        pool_size (int): ${ksize_comment}
-        pool_type (str): ${pooling_type_comment}
+        input (Variable): The input tensor of pooling operator. The format of
+                          input tensor is NCDHW, where N is batch size, C is
+                          the number of channels, D is the depth of the feature,
+                          H is the height of the feature, and W is the width
+                          of the feature.
+        pool_size (int|list|tuple): The pool kernel size. If pool kernel size 
+            is a tuple or list, it must contain three integers, 
+            (pool_size_Depth, pool_size_Height, pool_size_Width).
+            Otherwise, the pool kernel size will be the cube of an int.
+        pool_type (string): ${pooling_type_comment}
        pool_stride (int): stride of the pooling layer.
        pool_padding (int): padding size.
        global_pooling (bool): ${global_pooling_comment}
@@ -2520,6 +2527,19 @@ def pool3d(input,

    Returns:
        Variable: output of pool3d layer.
+
+    Examples:
+
+        .. code-block:: python
+
+          data = fluid.layers.data(
+              name='data', shape=[3, 32, 32, 32], dtype='float32')
+          pool3d = fluid.layers.pool3d(
+                            input=data,
+                            pool_size=2,
+                            pool_type='max',
+                            pool_stride=1,
+                            global_pooling=False)
    """
    if pool_type not in ["max", "avg"]:
        raise ValueError(
@@ -2569,7 +2589,27 @@ def adaptive_pool2d(input,
                    require_index=False,
                    name=None):
    """
-    ${comment}
+    **Adaptive Pool2d Operator**
+    The adaptive_pool2d operation calculates the output based on the input, pool_size,
+    pool_type parameters. Input(X) and output(Out) are in NCHW format, where N is batch
+    size, C is the number of channels, H is the height of the feature, and W is
+    the width of the feature. Parameters(pool_size) should contain two elements which
+    represent height and width, respectively. Also the H and W dimensions of output(Out)
+    is same as Parameter(pool_size).
+
+    For average adaptive pool2d:
+
+    ..  math::
+
+       hstart &= floor(i * H_{in} / H_{out})
+
+       hend &= ceil((i + 1) * H_{in} / H_{out})
+
+       wstart &= floor(j * W_{in} / W_{out})
+
+       wend &= ceil((j + 1) * W_{in} / W_{out})
+
+       Output(i ,j) &= \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}

    Args:
        input (Variable): The input tensor of pooling operator. The format of
@@ -2579,8 +2619,8 @@ def adaptive_pool2d(input,
        pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
            it must contain two integers, (pool_size_Height, pool_size_Width).
        pool_type: ${pooling_type_comment}
-        require_index (bool): If true, the index of max pooling point along with outputs.
-            it cannot be set in average pooling type.
+        require_index (bool): If true, the index of max pooling point will be returned along
+            with outputs. It cannot be set in average pooling type.
        name (str|None): A name for this layer(optional). If set None, the
                        layer will be named automatically.

@@ -2661,18 +2701,42 @@ def adaptive_pool3d(input,
                    require_index=False,
                    name=None):
    """
-    ${comment}
+    **Adaptive Pool3d Operator**
+    The adaptive_pool3d operation calculates the output based on the input, pool_size,
+    pool_type parameters. Input(X) and output(Out) are in NCDHW format, where N is batch
+    size, C is the number of channels, D is the depth of the feature, H is the height of
+    the feature, and W is the width of the feature. Parameters(pool_size) should contain
+    three elements which represent height and width, respectively. Also the D, H and W
+    dimensions of output(Out) is same as Parameter(pool_size).
+
+    For average adaptive pool3d:
+
+    ..  math::
+
+      dstart &= floor(i * D_{in} / D_{out})
+
+      dend &= ceil((i + 1) * D_{in} / D_{out})
+
+      hstart &= floor(j * H_{in} / H_{out})
+
+      hend &= ceil((j + 1) * H_{in} / H_{out})
+
+      wstart &= floor(k * W_{in} / W_{out})
+
+      wend &= ceil((k + 1) * W_{in} / W_{out})
+
+      Output(i ,j, k) &= \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}

    Args:
        input (Variable): The input tensor of pooling operator. The format of
-                          input tensor is NCHW, where N is batch size, C is
-                          the number of channels, H is the height of the
-                          feature, and W is the width of the feature.
+                          input tensor is NCDHW, where N is batch size, C is
+                          the number of channels, D is the depth of the feature,
+                          H is the height of the feature, and W is the width of the feature.
        pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
-            it must contain two integers, (Depth, Height, Width).
+            it must contain three integers, (Depth, Height, Width).
        pool_type: ${pooling_type_comment}
-        require_index (bool): If true, the index of max pooling point along with outputs.
-            it cannot be set in average pooling type.
+        require_index (bool): If true, the index of max pooling point will be returned along
+            with outputs. It cannot be set in average pooling type.
        name (str|None): A name for this layer(optional). If set None, the
                        layer will be named automatically.

@@ -2709,7 +2773,7 @@ def adaptive_pool3d(input,
              name='data', shape=[3, 32, 32], dtype='float32')
          pool_out, mask = fluid.layers.adaptive_pool3d(
                            input=data,
-                            pool_size=[3, 3],
+                            pool_size=[3, 3, 3],
                            pool_type='avg')
    """
    if pool_type not in ["max", "avg"]:
@@ -2930,6 +2994,7 @@ def batch_norm(input,
            "momentum": momentum,
            "epsilon": epsilon,
            "is_test": is_test,
+            "data_layout": data_layout,
            "use_mkldnn": False,
            "fuse_with_relu": fuse_with_relu,
            "use_global_stats": use_global_stats
@@ -3235,7 +3300,7 @@ def group_norm(input,
    # create output
    mean_out = helper.create_variable(dtype=dtype, stop_gradient=True)
    variance_out = helper.create_variable(dtype=dtype, stop_gradient=True)
-    group_norm_out = helper.create_variable(dtype)
+    group_norm_out = helper.create_variable(dtype=dtype)

    helper.append_op(
        type="group_norm",

--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
@@ -14,7 +14,7 @@

 from __future__ import print_function
 import os
-from .layer_function_generator import generate_layer_fn, generate_layer_fn_noattr
+from .layer_function_generator import generate_layer_fn, generate_activation_fn
 from .. import core
 from ..framework import convert_np_dtype_to_dtype_

@@ -53,7 +53,7 @@ globals()['_elementwise_div'] = generate_layer_fn('elementwise_div')
 __all__ += __activations_noattr__

 for _OP in set(__activations_noattr__):
-    globals()[_OP] = generate_layer_fn_noattr(_OP)
+    globals()[_OP] = generate_activation_fn(_OP)

 __all__ += ["uniform_random"]


--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -1368,9 +1368,9 @@ class FtrlOptimizer(Optimizer):

    Args:
        learning_rate (float|Variable): global learning rate.
-        l1 (float):
-        l2 (float):
-        lr_power (float):
+        l1 (float): L1 regularization strength.
+        l2 (float): L2 regularization strength.
+        lr_power (float): Learning Rate Power.
        regularization: A Regularizer, such as
                        fluid.regularizer.L2DecayRegularizer.
        name: A optional name prefix.

--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -148,7 +148,10 @@ class ParallelExecutor(object):
            else framework.default_main_program()
        # FIXME(dzhwinter): enable_inplace should be after memory_optimize
        # if turn on python memory optimize, turn off the inplace_pass.
-        build_strategy.enable_inplace = False if main._is_mem_optimized else True
+        if build_strategy.memory_optimize is True:
+            build_strategy.memory_optimize = False if main._is_mem_optimized else True
+        if build_strategy.enable_inplace is True:
+            build_strategy.enable_inplace = False if main._is_mem_optimized else True
        scope = scope if scope is not None else executor.global_scope()

        if share_vars_from and not isinstance(share_vars_from,

--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -77,6 +77,7 @@ list(REMOVE_ITEM TEST_OPS test_bilinear_interp_op)
 list(REMOVE_ITEM TEST_OPS test_nearest_interp_op)
 list(REMOVE_ITEM TEST_OPS test_imperative_resnet)
 list(REMOVE_ITEM TEST_OPS test_imperative_optimizer)
+list(REMOVE_ITEM TEST_OPS test_ir_memory_optimize_transformer)
 foreach(TEST_OP ${TEST_OPS})
    py_test_modules(${TEST_OP} MODULES ${TEST_OP})
 endforeach(TEST_OP)
@@ -107,10 +108,18 @@ py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SE
 py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
 set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 450)
 py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL)
+if(NOT WIN32)
+py_test_modules(test_ir_memory_optimize_transformer MODULES test_ir_memory_optimize_transformer SERIAL)
+endif()
 if(NOT APPLE)
    py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL)
+    if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+        # change the timeout from 600 to 1200, because in debug mode, this test need more time.
+        set_tests_properties(test_image_classification_resnet PROPERTIES TIMEOUT 1200)
+    endif()
 endif()

+
 if (WITH_NGRAPH)
    add_subdirectory(ngraph)
 endif()

--- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
+++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
@@ -79,7 +79,7 @@ class TestParallelExecutorBase(unittest.TestCase):
            if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce
        build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops
        build_strategy.fuse_relu_depthwise_conv = fuse_relu_depthwise_conv
-        build_strategy.memory_optimize = use_ir_memory_optimize
+        build_strategy.memory_optimize = False if memory_opt else use_ir_memory_optimize
        # python memory optimization is conflict with inplace pass.
        # Use ir graph memory optimization after inplace pass is the correct way.
        build_strategy.enable_inplace = False if memory_opt else enable_inplace

--- a/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py
@@ -121,6 +121,8 @@ class TestMNIST(TestParallelExecutorBase):
                regularization=fluid.regularizer.L2Decay(1e-6))
            return optimizer

+        # NOTE(dzh):
+        # need to make it compatible with elewise fuse act
        not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence(
            model,
            feed_dict={"image": img,
@@ -128,6 +130,7 @@ class TestMNIST(TestParallelExecutorBase):
            use_cuda=use_cuda,
            fuse_elewise_add_act_ops=False,
            memory_opt=False,
+            use_ir_memory_optimize=False,
            optimizer=_optimizer)
        fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence(
            model,
@@ -136,6 +139,7 @@ class TestMNIST(TestParallelExecutorBase):
            use_cuda=use_cuda,
            fuse_elewise_add_act_ops=True,
            memory_opt=False,
+            use_ir_memory_optimize=False,
            optimizer=_optimizer)

        for loss in zip(not_fuse_op_first_loss, fuse_op_first_loss):

--- a/python/paddle/fluid/tests/unittests/test_slice_op.py
+++ b/python/paddle/fluid/tests/unittests/test_slice_op.py
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -1020,7 +1020,11 @@ class DistributeTranspiler(object):
        skip_dim0 = 0
        slice_vars = self.param_var_mapping[orig_var_name]

-        orig_dim1_flatten = reduce(lambda x, y: x * y, slice_vars[0].shape[1:])
+        orig_dim1_flatten = 1
+
+        if len(slice_vars[0].shape) >= 2:
+            orig_dim1_flatten = reduce(lambda x, y: x * y,
+                                       slice_vars[0].shape[1:])

        for slice_var in slice_vars[:block_idx]:
            skip_dim0 += slice_var.shape[0]

--- a/python/requirements.txt
+++ b/python/requirements.txt
 requests==2.9.2
 numpy>=1.12
-protobuf==3.1
+protobuf>=3.1.0
 recordio>=0.1.0
 matplotlib==2.2.3 # TODO: let python3 paddlepaddle package use latest matplotlib
 rarfile

--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -100,6 +100,7 @@ packages=['paddle',
          'paddle.utils',
          'paddle.dataset',
          'paddle.reader',
+          'paddle.distributed',
          'paddle.fluid',
          'paddle.fluid.imperative',
          'paddle.fluid.proto',

--- a/tools/manylinux1/build_scripts/build.sh
+++ b/tools/manylinux1/build_scripts/build.sh
--- a/tools/run_mp.py
+++ b/tools/run_mp.py