From 1202d3fc740b065590f93a6029ab2126ca9dafff Mon Sep 17 00:00:00 2001
From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com>
Date: Sun, 21 Apr 2019 10:47:15 -0500
Subject: [PATCH] Refine model gpu memory (#16993)

* speedup gc and inplace softmax_with_cross_entropy_grad
test=develop

* refine models gpu mem
Merge skip vars and warning messages of mem opt
remove relu mem opt
test=develop

* follow comments
test=develop
---
 paddle/fluid/framework/details/CMakeLists.txt |  4 +-
 .../fluid/framework/details/build_strategy.cc |  4 ++
 .../details/eager_deletion_op_handle.cc       |  2 +-
 .../details/eager_deletion_op_handle.h        | 12 +++-
 .../fast_threaded_ssa_graph_executor.cc       | 48 ++++++++++----
 .../fast_threaded_ssa_graph_executor.h        |  4 ++
 .../framework/details/inplace_op_pass.cc      | 59 +++++++++++------
 .../fluid/framework/details/inplace_op_pass.h |  2 +-
 .../details/memory_optimize_helper.h          |  6 ++
 .../framework/details/memory_optimize_pass.cc | 13 ++--
 .../framework/details/memory_optimize_pass.h  |  3 +-
 .../fluid/framework/details/op_handle_base.h  | 11 ++++
 .../record_skip_memory_opt_vars_pass.cc       | 64 +++++++++++++++++++
 paddle/fluid/framework/garbage_collector.h    |  9 +++
 .../framework/inplace_op_inference_test.cc    |  5 ++
 paddle/fluid/operators/affine_channel_op.cc   | 43 ++++++++++++-
 paddle/fluid/operators/affine_channel_op.cu   |  7 +-
 .../softmax_with_cross_entropy_op.cc          | 15 ++++-
 .../softmax_with_cross_entropy_op.cu          |  7 +-
 .../operators/softmax_with_cross_entropy_op.h |  7 +-
 paddle/fluid/platform/device_context.h        |  2 +
 paddle/fluid/pybind/const_value.cc            |  2 +
 paddle/fluid/pybind/ir.cc                     |  6 ++
 python/paddle/fluid/compiler.py               | 43 ++++++++++---
 python/paddle/fluid/executor.py               | 36 +++++++++++
 .../unittests/parallel_executor_test_base.py  |  3 +
 26 files changed, 358 insertions(+), 59 deletions(-)
 create mode 100644 paddle/fluid/framework/details/record_skip_memory_opt_vars_pass.cc

diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index 51231b981bf..ae89f03186d 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -15,6 +15,8 @@ cc_library(alloc_continuous_space_for_grad_pass SRCS alloc_continuous_space_for_
 cc_library(fuse_adam_op_pass SRCS fuse_adam_op_pass.cc fuse_optimizer_op_pass.cc DEPS graph graph_helper)
 cc_library(fuse_sgd_op_pass SRCS fuse_sgd_op_pass.cc fuse_optimizer_op_pass.cc DEPS graph graph_helper)

+cc_library(record_skip_memory_opt_vars_pass SRCS record_skip_memory_opt_vars_pass.cc DEPS graph graph_helper)
+
 cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows)

 if(WITH_DISTRIBUTE)
@@ -124,4 +126,4 @@ cc_library(build_strategy SRCS build_strategy.cc
       DEPS fuse_relu_depthwise_conv_pass
            memory_optimize_pass lock_free_optimize_pass
            alloc_continuous_space_for_grad_pass fuse_all_reduce_op_pass
-           fuse_adam_op_pass fuse_sgd_op_pass)
+           fuse_adam_op_pass fuse_sgd_op_pass record_skip_memory_opt_vars_pass)

diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index 196603bbff1..e5dc89ee69d 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -53,6 +53,9 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
       viz_pass->Set("graph_viz_path", new std::string(graph_path));
     }

+    // Note(zcd): record_skip_memory_opt_vars_pass should be the first pass.
+    AppendPass("record_skip_memory_opt_vars_pass");
+
     if (strategy_.enable_sequential_execution_) {
       VLOG(10) << "Add sequential_execution_pass";
       AppendPass("sequential_execution_pass");
@@ -341,3 +344,4 @@ USE_PASS(fuse_sgd_op_pass);
 USE_PASS(fuse_all_reduce_op_pass);
 USE_PASS(runtime_context_cache_pass);
 USE_PASS(expected_kernel_cache_pass);
+USE_PASS(record_skip_memory_opt_vars_pass);

diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc
index dbc90737f22..52e6d599ebb 100644
--- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc
+++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc
@@ -34,7 +34,7 @@ EagerDeletionOpHandle::EagerDeletionOpHandle(
     AtomicReferenceCountMap *ref_cnts)
     : OpHandleBase(node),
       scope_(scope),
-      var_names_(var_names),
+      var_names_(var_names.begin(), var_names.end()),
       gc_(gc),
       ref_cnts_(ref_cnts) {
 #ifdef PADDLE_WITH_CUDA

diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.h b/paddle/fluid/framework/details/eager_deletion_op_handle.h
index 64867afad5b..6300b9173b5 100644
--- a/paddle/fluid/framework/details/eager_deletion_op_handle.h
+++ b/paddle/fluid/framework/details/eager_deletion_op_handle.h
@@ -15,7 +15,10 @@
 #pragma once
 #include
+#include
 #include
+#include
+#include
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/details/reference_count_pass_helper.h"
@@ -37,6 +40,13 @@ class EagerDeletionOpHandle : public OpHandleBase {
   std::string Name() const override;

+  /**
+   * Currently, EagerDeletionOpHandle has the highest priority.
+   * This priority setting speeds up GC by 15% for the Transformer
+   * model on 8 V100 GPUs.
+   */
+  Priority GetPriority() const override { return kHighest; }
+
 protected:
   void RunImpl() override;

@@ -44,7 +54,7 @@ class EagerDeletionOpHandle : public OpHandleBase {
   void ClearGarbages(std::deque<std::shared_ptr<memory::Allocation>> *garbages);

   const Scope *scope_;
-  std::unordered_set<std::string> var_names_;
+  std::vector<std::string> var_names_;
   GarbageCollector *gc_;               // not own
   AtomicReferenceCountMap *ref_cnts_;  // not own
 #ifdef PADDLE_WITH_CUDA

diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
index 3e805bd5b48..c69f148297a 100644
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
 #include
+#include
 #include
 #include
 #include
@@ -131,32 +132,53 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
   return fetches;
 }

+bool FastThreadedSSAGraphExecutor::RunOp(
+    OpHandleBase *op, const std::shared_ptr<BlockingQueue<size_t>> &complete_q,
+    size_t *complete) {
+  try {
+    if (LIKELY(!strategy_.dry_run_)) {
+      op->Run(strategy_.use_cuda_);
+    }
+    ++(*complete);
+    return true;
+  } catch (...) {
+    exception_.Catch(std::current_exception());
+    --remaining_;
+    complete_q->Push(-1UL);
+    return false;
+  }
+}
+
 void FastThreadedSSAGraphExecutor::RunOpAsync(
     std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps,
     OpHandleBase *op,
     const std::shared_ptr<BlockingQueue<size_t>> &complete_q) {
   ++remaining_;
   this->pool_.enqueue([=] {
-    OpHandleBase *op_to_run = op;
+    std::queue<OpHandleBase *> op_queue;
+    op_queue.push(op);
+
     size_t complete = 0;
-    while (op_to_run != nullptr) {
-      try {
-        if (LIKELY(!strategy_.dry_run_)) {
-          op_to_run->Run(strategy_.use_cuda_);
-        }
-        ++complete;
-      } catch (...) {
-        exception_.Catch(std::current_exception());
-        --remaining_;
-        complete_q->Push(-1UL);
+    while (!op_queue.empty()) {
+      OpHandleBase *op_to_run = op_queue.front();
+      op_queue.pop();
+
+      if (!RunOp(op_to_run, complete_q, &complete)) {
         return;
       }
+
       auto &outputs = op_to_run->Outputs();
       op_to_run = nullptr;
       for (auto &output : outputs) {
         for (auto &pending_op : output->PendingOps()) {
           std::atomic<int> &deps = op_deps->at(pending_op);
-          if (deps.fetch_sub(1) == 1) {  // pending_op ready
+          if (deps.fetch_sub(1) != 1) continue;
+
+          // NOTE(zjl): the op with the highest priority should run
+          // first, without switching to another thread.
+          if (pending_op->GetPriority() == OpHandleBase::Priority::kHighest) {
+            op_queue.push(pending_op);
+          } else {
             if (op_to_run == nullptr) {
               op_to_run = pending_op;
             } else {
@@ -165,6 +187,8 @@ void FastThreadedSSAGraphExecutor::RunOpAsync(
           }
         }
       }
+
+      if (op_to_run != nullptr) op_queue.push(op_to_run);
     }
     --remaining_;
     complete_q->Push(complete);

diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
index f6d5160e75c..234da5b9254 100644
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
@@ -60,6 +60,10 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor {
   ::ThreadPool pool_;
   ::ThreadPool prepare_pool_;

+  bool RunOp(OpHandleBase *op,
+             const std::shared_ptr<BlockingQueue<size_t>> &complete_q,
+             size_t *complete);
+
   void RunOpAsync(std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps,
                   OpHandleBase *op,
                   const std::shared_ptr<BlockingQueue<size_t>> &complete_q);

diff --git a/paddle/fluid/framework/details/inplace_op_pass.cc b/paddle/fluid/framework/details/inplace_op_pass.cc
index 84c9e4a379a..03b21ae0ae8 100644
--- a/paddle/fluid/framework/details/inplace_op_pass.cc
+++ b/paddle/fluid/framework/details/inplace_op_pass.cc
@@ -78,6 +78,13 @@ const std::string kInplacedOpWhiteList[] = {  // NOLINT
     "elementwise_add",
     "elementwise_add_grad",
 };
+
+// FIXME(zjl): The in/out shapes of some ops are exactly the same at runtime,
+// but the static sizes computed at compile time differ.
+// Use a flag to mark such ops. Please fix this when a better way is found.
+static const std::unordered_set<std::string> kSameShapeOpWhiteSet{  // NOLINT
+    "reshape2"
+};
 // clang-format on

 namespace paddle {
 namespace framework {
 namespace details {
@@ -303,7 +310,16 @@ void InplacePass::TryInplaceOpInputOutput(ir::Node* op,
     auto* in_node = view_.GetNodeByName(in_var_name, op->inputs);
     auto* out_node = view_.GetNodeByName(out_var_name, op->outputs);

-    VLOG(4) << "Try to inplace " << in_var_name << " with " << out_var_name;
+    VLOG(4) << "Try to replace: " << in_var_name << " => " << out_var_name;
+    if (view_.InSkipSet(in_var_name)) {
+      VLOG(4) << string::Sprintf("SKIP: %s is in skip set", in_var_name);
+      continue;
+    }
+
+    if (view_.InSkipSet(out_var_name)) {
+      VLOG(4) << string::Sprintf("SKIP: %s is in skip set", out_var_name);
+      continue;
+    }

     if (var_nodes_[in_var_name].back() != in_node) {
       VLOG(4) << "SKIP since " << in_var_name
@@ -318,21 +334,26 @@ void InplacePass::TryInplaceOpInputOutput(ir::Node* op,
               << out_var_name << " are the same";
     } else if (!NodeCanReused(in_node)) {
       can_replace = false;
-      VLOG(4) << "SKIP: Input varialbe " << in_var_name << "cannot be reused";
+      VLOG(4) << "SKIP: Input variable " << in_var_name << " cannot be reused";
     } else if (!NodeCanReused(out_node)) {
       can_replace = false;
       VLOG(4) << "SKIP: Output variable " << out_var_name
               << " cannot be reused";
+    } else if (in_node->Var()->GetType() != out_node->Var()->GetType()) {
+      can_replace = false;
+      VLOG(4) << "SKIP: Input type : " << in_node->Var()->GetType()
+              << " does not match Output type : " << out_node->Var()->GetType();
     } else if (details::NodeSize(*in_node->Var()) !=
-               details::NodeSize(*out_node->Var())) {
+                   details::NodeSize(*out_node->Var()) &&
+               kSameShapeOpWhiteSet.count(op_desc->Type()) == 0) {
       can_replace = false;
       VLOG(4) << "SKIP: Input and Output variable sizes do not match";
     }

     if (!can_replace) continue;

-    // 2. there is no external pending op on the input node
-    // if (view_.PendingOpsOnVar(in_node).size() > 1) {
+    // 2. If the variable is the input of multiple ops, we need to make sure
+    // that the current op has a dependency on the other ops that use the
+    // same variable
     if (in_node->outputs.size() > 1 && !view_.CheckDeps(in_node, op)) {
       VLOG(4) << string::Sprintf(
           "Skipped pair %s => %s. %s input has external dependency. "
           "Inplacing such a pair would create a cycle",
           out_var_name, in_var_name, op->Name());
       continue;
     }

-    // 3. if output has been memory optimize by python(fluid.memory_optmize()).
-    // this candidate can not be inplaced. Will be deprecated in the future.
-    if (view_.InSkipSet(out_node->Name())) {
-      VLOG(4) << string::Sprintf(
-          "Skiped %s => %s reused previous memory block in python memory "
-          "optmize,"
-          "it inplace may generate a circle",
-          out_var_name, in_var_name, op->Name());
-      continue;
-    }
-
     // Debug Interface. Which would be skipped by the pass.
     if (out_node->Name() == FLAGS_memory_optimize_debug) {
       VLOG(3) << "Skipped var by force. FLAGS_memory_optimize_debug="
@@ -424,6 +434,9 @@ void GraphView::TopoSort(ir::Graph* graph) {
   for (auto& node : nodes) {
     if (node->IsOp() && node->Op() != nullptr && deps_map[node] > 0) {
       all_ops_checked = false;
+      LOG(WARNING)
+          << "Node " << node->Name() << " has not been checked. "
+          << "Some passes may not have handled node dependencies correctly.";
       break;
     }
   }
@@ -519,16 +532,22 @@ void GraphView::Build(ir::Graph* g) {
   // resolve data hazards depending on the var nodes in the right order.
   TopoSort(g);

+  // fill the skip_set_
+  PADDLE_ENFORCE(g->Has(details::kMemOptSkipVars));
+  auto& mem_opt_whitelist = g->Get<MemOptSkipVars>(kMemOptSkipVars);
+  for (const auto& var : mem_opt_whitelist) skip_set_.emplace(var);
+
   // 2. track the nodes which are used by the parameter server.
  // these nodes can not be inplaced; otherwise the trainer and
  // pserver can not find each other by name.
  auto update_skip_set = [&](ir::Node* node) {
    for (auto& in : node->inputs) {
-      if (in->IsVar() && in->Var() != nullptr) dup_nodes_.emplace(in->Name());
+      if (in->IsVar() && in->Var() != nullptr) {
+        skip_set_.emplace(in->Name());
+      }
    }
    for (auto& out : node->outputs) {
-      if (out->IsVar() && out->Var() != nullptr)
-        dup_nodes_.emplace(out->Name());
+      if (out->IsVar() && out->Var() != nullptr) skip_set_.emplace(out->Name());
    }
  };
  for (auto& node : g->Nodes()) {
@@ -545,7 +564,7 @@ void GraphView::Build(ir::Graph* g) {
 const std::vector<ir::Node*>& GraphView::AllOps() { return ops_; }

 bool GraphView::InSkipSet(const std::string& var) const {
-  return dup_nodes_.count(var);
+  return skip_set_.count(var);
 }

 }  // namespace details

diff --git a/paddle/fluid/framework/details/inplace_op_pass.h b/paddle/fluid/framework/details/inplace_op_pass.h
index fbec973ddaa..2cd6cbd1b03 100644
--- a/paddle/fluid/framework/details/inplace_op_pass.h
+++ b/paddle/fluid/framework/details/inplace_op_pass.h
@@ -57,7 +57,7 @@ class GraphView {
 private:
  std::vector<ir::Node*> ops_;
-  std::unordered_set<std::string> dup_nodes_;  // mem opt affect nodes
+  std::unordered_set<std::string> skip_set_;  // mem opt affect nodes
  std::map<ir::Node*, std::unordered_set<ir::Node*>> adj_list_;
  std::unordered_map<ir::Node*, uint32_t> op_level_;
 };

diff --git a/paddle/fluid/framework/details/memory_optimize_helper.h b/paddle/fluid/framework/details/memory_optimize_helper.h
index 65c7017d2d4..0a65ec051df 100644
--- a/paddle/fluid/framework/details/memory_optimize_helper.h
+++ b/paddle/fluid/framework/details/memory_optimize_helper.h
@@ -21,6 +21,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include "paddle/fluid/framework/data_type.h"
@@ -30,6 +31,11 @@ namespace paddle {
 namespace framework {
 namespace details {

+/// This attribute is used to prevent some core variables from being
+/// removed/reused in memory-optimize-related passes.
+constexpr char kMemOptSkipVars[] = "@MEM_OPT_SKIP_VARS@";
+typedef std::unordered_set<std::string> MemOptSkipVars;
+
 std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph);

 // NOTE(dzh): An ordered set for node reuse in memory optimize.
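The kMemOptSkipVars attribute defined above is the contract between record_skip_memory_opt_vars_pass (the producer, added later in this patch) and the memory-optimize-related passes (the consumers, e.g. GraphView::Build above and MemoryOptimizePass::CollectSkipVarsSet below). The following is a minimal sketch of the consumer-side pattern; the pass name MySkipAwarePass is hypothetical and not part of this change:

#include "paddle/fluid/framework/details/memory_optimize_helper.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"

namespace paddle {
namespace framework {
namespace details {

// Hypothetical consumer pass, shown only to illustrate the access pattern.
class MySkipAwarePass : public ir::Pass {
 protected:
  void ApplyImpl(ir::Graph* graph) const override {
    // record_skip_memory_opt_vars_pass runs first, so the attribute exists.
    PADDLE_ENFORCE(graph->Has(kMemOptSkipVars));
    const auto& skip_vars = graph->Get<MemOptSkipVars>(kMemOptSkipVars);

    for (ir::Node* node : graph->Nodes()) {
      if (node->IsVar() && node->Var() != nullptr &&
          skip_vars.count(node->Name()) > 0) {
        // This variable must not be renamed, reused or eagerly deleted.
        continue;
      }
      // ... apply the memory optimization to the remaining variables ...
    }
  }
};

}  // namespace details
}  // namespace framework
}  // namespace paddle

Storing the whitelist once as a graph attribute, rather than recomputing it inside each pass, is what allows the skip vars and warning messages of the different passes to be merged, as the commit message notes.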
diff --git a/paddle/fluid/framework/details/memory_optimize_pass.cc b/paddle/fluid/framework/details/memory_optimize_pass.cc
index ddaef206028..ef36f1038e2 100644
--- a/paddle/fluid/framework/details/memory_optimize_pass.cc
+++ b/paddle/fluid/framework/details/memory_optimize_pass.cc
@@ -45,8 +45,7 @@ namespace framework {
 namespace details {

 void MemoryOptimizePass::ApplyImpl(ir::Graph* graph) const {
-  auto nodes = graph->Nodes();
-  CollectSkipVarsSet(nodes);
+  CollectSkipVarsSet(graph);

   cfg_.reset(new details::ControlFlowGraph(*graph));
   cfg_->LiveVariableAnalysis();
@@ -204,14 +203,20 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const {
   }
 }

-void MemoryOptimizePass::CollectSkipVarsSet(
-    const std::unordered_set<ir::Node*>& nodes) const {
+void MemoryOptimizePass::CollectSkipVarsSet(ir::Graph* graph) const {
+  // fill skip_set_
+  PADDLE_ENFORCE(graph->Has(details::kMemOptSkipVars));
+  auto& mem_opt_whitelist = graph->Get<MemOptSkipVars>(kMemOptSkipVars);
+  for (const auto& var : mem_opt_whitelist) skip_set_.emplace(var);
+
   auto update_skip_set = [&](OpDesc* op_desc) {
     auto inputs = op_desc->InputArgumentNames();
     auto outputs = op_desc->OutputArgumentNames();
     skip_set_.insert(inputs.begin(), inputs.end());
     skip_set_.insert(outputs.begin(), outputs.end());
   };
+
+  auto nodes = graph->Nodes();
   for (auto& op : nodes) {
     if (!op->IsOp() || op->Op() == nullptr) continue;
     auto* op_desc = op->Op();

diff --git a/paddle/fluid/framework/details/memory_optimize_pass.h b/paddle/fluid/framework/details/memory_optimize_pass.h
index ce94890b385..fa5b9b322da 100644
--- a/paddle/fluid/framework/details/memory_optimize_pass.h
+++ b/paddle/fluid/framework/details/memory_optimize_pass.h
@@ -53,7 +53,8 @@ class MemoryOptimizePass : public ir::Pass {
   // 1. scan op with subblock and collect the output/input vars.
   // while, while_grad, conditional_block
   // 2. scan distributed ops and collect the output/input vars
-  void CollectSkipVarsSet(const std::unordered_set<ir::Node*>&) const;
+  // 3. op_role_vars
+  void CollectSkipVarsSet(ir::Graph* graph) const;

 private:
  // Reuse Node Pool, Owned.

diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h
index e0aa352e95b..647b238634a 100644
--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -15,6 +15,8 @@
 #pragma once
 #include
 #include
+#include
+#include
 #include
 #include "paddle/fluid/framework/details/var_handle.h"
 #include "paddle/fluid/framework/ir/node.h"
@@ -31,6 +33,13 @@ constexpr char kLocalExecScopeName[] = "@LOCAL_SCOPE@";
 // It's responsible for populating necessary fields of ir::Node.
 class OpHandleBase {
 public:
+  /**
+   * NOTE(zjl): Some ops should have higher priority than others.
+   * A higher-priority op runs first, without switching
+   * threads in the executor.
+   */
+  enum Priority { kHighest = 0, kNormal = 1 };
+
  // Owned by `node`. No need to be deleted explicitly.
  explicit OpHandleBase(ir::Node *node) : node_(node) {
    node_->WrappedBy(this);
@@ -40,6 +49,8 @@ class OpHandleBase {

  std::string DebugString() const;

+  virtual Priority GetPriority() const { return kNormal; }
+
  virtual std::string Name() const = 0;

  void Run(bool use_cuda);

diff --git a/paddle/fluid/framework/details/record_skip_memory_opt_vars_pass.cc b/paddle/fluid/framework/details/record_skip_memory_opt_vars_pass.cc
new file mode 100644
index 00000000000..7cb2544ebbf
--- /dev/null
+++ b/paddle/fluid/framework/details/record_skip_memory_opt_vars_pass.cc
@@ -0,0 +1,64 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include
+#include "paddle/fluid/framework/details/memory_optimize_helper.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/framework/op_proto_maker.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+class RecordSkipMemoryOptVarsPass : public ir::Pass {
+ protected:
+  void ApplyImpl(ir::Graph* graph) const override {
+    PADDLE_ENFORCE(!graph->Has(kMemOptSkipVars));
+    graph->Set(kMemOptSkipVars, new MemOptSkipVars);
+    auto& skip_vars = graph->Get<MemOptSkipVars>(kMemOptSkipVars);
+
+    // NOTE(zcd): Insert OpRoleVars into SkipVarSet to prevent these vars
+    // from being renamed in the memory optimize pass.
+    InsertOpRoleVarsToSkipVarSet(graph, &skip_vars);
+  }
+
+  void InsertOpRoleVarsToSkipVarSet(const ir::Graph* graph,
+                                    MemOptSkipVars* skip_vars) const {
+    for (auto& node : graph->Nodes()) {
+      PADDLE_ENFORCE_NOT_NULL(node, "The node should not be nullptr.");
+      if (node->IsOp() && node->Op()) {
+        try {
+          auto op_role_vars =
+              boost::get<std::vector<std::string>>(node->Op()->GetNullableAttr(
+                  OpProtoAndCheckerMaker::OpRoleVarAttrName()));
+          PADDLE_ENFORCE_EQ(op_role_vars.size() % 2, 0);
+          for (size_t i = 0; i < op_role_vars.size(); i += 2) {
+            auto& g_name = op_role_vars[i + 1];
+            skip_vars->insert(g_name);
+          }
+        } catch (boost::bad_get e) {
+        }
+      }
+    }
+  }
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(record_skip_memory_opt_vars_pass,
+              paddle::framework::details::RecordSkipMemoryOptVarsPass);

diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h
index f0b504627ae..6ce797bd962 100644
--- a/paddle/fluid/framework/garbage_collector.h
+++ b/paddle/fluid/framework/garbage_collector.h
@@ -107,6 +107,15 @@ void GarbageCollector::Add(Container &&objs) {
 template <typename Container, typename Callback>
 void GarbageCollector::Add(Container &&objs, Callback &&callback) {
+  // Special case when FLAGS_eager_delete_tensor_gb=0.0:
+  // it speeds up GC by about 2~3%.
+  if (max_memory_size_ <= 1) {
+    callback();
+    auto *container = new Container(std::move(objs));
+    ClearCallback([container] { delete container; });
+    return;
+  }
+
   GarbageQueue *garbage_queue = nullptr;
   {
     std::lock_guard<std::mutex> guard(mutex_);

diff --git a/paddle/fluid/framework/inplace_op_inference_test.cc b/paddle/fluid/framework/inplace_op_inference_test.cc
index a9b3b889229..a2c213945d7 100644
--- a/paddle/fluid/framework/inplace_op_inference_test.cc
+++ b/paddle/fluid/framework/inplace_op_inference_test.cc
@@ -19,6 +19,7 @@
 #include
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/details/inplace_op_pass.h"
+#include "paddle/fluid/framework/details/memory_optimize_helper.h"
 #include "paddle/fluid/framework/ir/pass_builder.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -217,6 +218,7 @@ TEST(InferInplace, SingleOpInplaceInToOut) {
   FakeSuccData(&prog);
   std::unique_ptr<ir::Graph> g(new ir::Graph(prog));
+  g->Set(details::kMemOptSkipVars, new std::unordered_set<std::string>());
   g = test_SingleOpInplaceInToOut(std::move(g));
   auto op_node = GetNodeFromGraph(g.get(), "single_op");

@@ -232,6 +234,7 @@ TEST(InferInplace, SingleOpInplaceInToOutNoInplace) {
   FakeNoInplaceData(&prog);
   std::unique_ptr<ir::Graph> g(new ir::Graph(prog));
+  g->Set(details::kMemOptSkipVars, new std::unordered_set<std::string>());
   g = test_SingleOpInplaceInToOut(std::move(g));
   auto op_node = GetNodeFromGraph(g.get(), "single_op");

@@ -264,6 +267,7 @@ TEST(InferInplace, MultiOutInplaceInToOut) {
   prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024});

   std::unique_ptr<ir::Graph> g(new ir::Graph(prog));
+  g->Set(details::kMemOptSkipVars, new std::unordered_set<std::string>());
   std::unique_ptr<details::InplacePass> pass(new details::InplacePass());
   pass->Apply(g.get());
   auto op_node = GetNodeFromGraph(g.get(), "multi_out_op");
@@ -299,6 +303,7 @@ TEST(InferInplace, MultiGradInplaceInToOut) {
   prog.MutableBlock(0)->Var("z0")->SetShape({32, 15, 1024, 1024});

   std::unique_ptr<ir::Graph> g(new ir::Graph(prog));
+  g->Set(details::kMemOptSkipVars, new std::unordered_set<std::string>());
   std::unique_ptr<details::InplacePass> pass(new details::InplacePass());
   pass->Apply(g.get());
   auto op_node = GetNodeFromGraph(g.get(), "multi_out_grad");

diff --git a/paddle/fluid/operators/affine_channel_op.cc b/paddle/fluid/operators/affine_channel_op.cc
index 27370a3c29a..da063541438 100644
--- a/paddle/fluid/operators/affine_channel_op.cc
+++ b/paddle/fluid/operators/affine_channel_op.cc
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include
+#include
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -115,6 +117,14 @@ class AffineChannelOpGrad : public framework::OperatorWithKernel {
                       ctx->GetInputDim("Scale"));
     }
   }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        ctx.Input<framework::Tensor>(framework::GradVarName("Out"))->type(),
+        ctx.GetPlace());
+  }
 };

 class AffineChannelGradMaker : public framework::SingleGradOpDescMaker {
@@ -217,7 +227,6 @@ class AffineChannelGradKernel : public framework::OpKernel<T> {
                 : dims[dims.size() - 1];
     int HxW = x->numel() / N / C;

-    auto* x_d = x->data<T>();
     auto* dy_d = dy->data<T>();
     auto* scale_d = scale->data<T>();
     ConstEigenVectorArrayMap<T> scale_e(scale_d, C);
@@ -242,6 +251,7 @@ class AffineChannelGradKernel : public framework::OpKernel<T> {
       }
       // compute dscale and dbias
       if (dscale && dbias) {
+        auto* x_d = x->data<T>();
         dy_d = dy->data<T>();
         for (int i = 0; i < N; i++) {
           ConstEigenArrayMap<T> x_e(x_d, HxW, C);
@@ -270,6 +280,7 @@ class AffineChannelGradKernel : public framework::OpKernel<T> {
       }
       // compute dscale and dbias
       if (dscale && dbias) {
+        auto* x_d = x->data<T>();
         ConstEigenArrayMap<T> x_e(x_d, C, num);
         dscale_e = (x_e * dy_e).rowwise().sum();
         dbias_e = dy_e.rowwise().sum();
@@ -278,6 +289,33 @@
   }
 };

+class AffineChannelNoNeedBufferVarsInference
+    : public framework::NoNeedBufferVarsInference {
+ public:
+  using framework::NoNeedBufferVarsInference::NoNeedBufferVarsInference;
+
+ private:
+  inline bool HasInput(const std::string& name) const {
+    auto& inputs = Inputs();
+    auto iter = inputs.find(name);
+    if (iter == inputs.end() || iter->second.empty()) {
+      return false;
+    } else {
+      return iter->second[0] != framework::kEmptyVarName;
+    }
+  }
+
+ public:
+  std::unordered_set<std::string> operator()() const {
+    if (!HasInput(framework::GradVarName("Scale")) &&
+        !HasInput(framework::GradVarName("Bias"))) {
+      return {"X"};
+    } else {
+      return {};
+    }
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle

@@ -286,7 +324,8 @@ using CPU = paddle::platform::CPUDeviceContext;

 REGISTER_OPERATOR(affine_channel, ops::AffineChannelOp,
                   ops::AffineChannelOpMaker, ops::AffineChannelGradMaker);
-REGISTER_OPERATOR(affine_channel_grad, ops::AffineChannelOpGrad);
+REGISTER_OPERATOR(affine_channel_grad, ops::AffineChannelOpGrad,
+                  ops::AffineChannelNoNeedBufferVarsInference);

 REGISTER_OP_CPU_KERNEL(affine_channel, ops::AffineChannelKernel<CPU, float>,
                        ops::AffineChannelKernel<CPU, double>);

diff --git a/paddle/fluid/operators/affine_channel_op.cu b/paddle/fluid/operators/affine_channel_op.cu
index c054fdb1ba6..e1435c29d84 100644
--- a/paddle/fluid/operators/affine_channel_op.cu
+++ b/paddle/fluid/operators/affine_channel_op.cu
@@ -128,14 +128,13 @@ class AffineChannelGradCUDAKernel : public framework::OpKernel<T> {
         framework::StringToDataLayout(ctx.Attr<std::string>("data_layout"));
     auto& dev_ctx = ctx.template device_context<DeviceContext>();

-    auto dims = x->dims();
-    const int num = x->numel();
+    auto dims = dy->dims();
+    const int num = dy->numel();
     int N = dims[0];
     int C = layout == framework::DataLayout::kNCHW ?
          dims[1] : dims[dims.size() - 1];
    int HxW = num / N / C;

-    const T* x_d = x->data<T>();
     const T* dy_d = dy->data<T>();
     const T* s_d = scale->data<T>();
@@ -155,6 +154,7 @@ class AffineChannelGradCUDAKernel : public framework::OpKernel<T> {
           dy_d, s_d, nullptr, C, HxW, num, dx_d);
     }
     if (dscale && dbias) {
+      const T* x_d = x->data<T>();
       AffineChannelScaleBiasGradientCUDAKernel<
           T, block, framework::DataLayout::kNCHW><<>>(
@@ -167,6 +167,7 @@ class AffineChannelGradCUDAKernel : public framework::OpKernel<T> {
           dy_d, s_d, nullptr, C, HxW, num, dx_d);
     }
     if (dscale && dbias) {
+      const T* x_d = x->data<T>();
       AffineChannelScaleBiasGradientCUDAKernel<
           T, block, framework::DataLayout::kNHWC><<>>(

diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
index 7c024e50dd2..d7718bda5cc 100644
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
@@ -14,6 +14,9 @@ limitations under the License. */

 #include "paddle/fluid/operators/softmax_with_cross_entropy_op.h"
 #include
+#include
+#include
+#include

 namespace paddle {
 namespace operators {
@@ -225,6 +228,15 @@ class SoftmaxGradMaker : public framework::SingleGradOpDescMaker {
   }
 };

+class SoftmaxWithCrossEntropyGradInplaceInference
+    : public framework::InplaceOpInference {
+ public:
+  std::unordered_map<std::string, std::string> operator()(
+      const framework::OpDesc& op_desc) const {
+    return {{"Softmax", framework::GradVarName("Logits")}};
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle

@@ -233,7 +245,8 @@ namespace ops = paddle::operators;
 REGISTER_OPERATOR(softmax_with_cross_entropy, ops::SoftmaxWithCrossEntropyOp,
                   ops::SoftmaxWithCrossEntropyOpMaker, ops::SoftmaxGradMaker);
 REGISTER_OPERATOR(softmax_with_cross_entropy_grad,
-                  ops::SoftmaxWithCrossEntropyOpGrad);
+                  ops::SoftmaxWithCrossEntropyOpGrad,
+                  ops::SoftmaxWithCrossEntropyGradInplaceInference);
 REGISTER_OP_CPU_KERNEL(softmax_with_cross_entropy,
                        ops::SoftmaxWithCrossEntropyKernel<float>,
                        ops::SoftmaxWithCrossEntropyKernel<double>);

diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
index d3b8538124f..ed61fb38b5a 100644
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
@@ -454,8 +454,11 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
         context.Input<Tensor>(framework::GradVarName("Loss"))->data<T>();
     Tensor* logit_grad =
         context.Output<Tensor>(framework::GradVarName("Logits"));
-    framework::TensorCopy(*context.Input<Tensor>("Softmax"), context.GetPlace(),
-                          context.device_context(), logit_grad);
+    const Tensor* softmax = context.Input<Tensor>("Softmax");
+    if (logit_grad != softmax) {
+      framework::TensorCopy(*softmax, context.GetPlace(),
+                            context.device_context(), logit_grad);
+    }
     T* logit_grad_data = logit_grad->data<T>();

     int rank = logit_grad->dims().size();

diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.h b/paddle/fluid/operators/softmax_with_cross_entropy_op.h
index 8cba960c763..7ef7c4f7424 100644
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.h
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.h
@@ -68,7 +68,12 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel<T> {
     const Tensor* labels = context.Input<Tensor>("Label");
     Tensor* logit_grad =
         context.Output<Tensor>(framework::GradVarName("Logits"));
-    logit_grad->ShareDataWith(*context.Input<Tensor>("Softmax"));
+
+    const Tensor* softmax = context.Input<Tensor>("Softmax");
+    if (logit_grad != softmax) {
+      framework::TensorCopy(*softmax, context.GetPlace(),
+                            context.device_context(), logit_grad);
+    }

     int rank = logit_grad->dims().size();
     const int class_num = logit_grad->dims()[rank - 1];

diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index 778f6613bd4..a86fef33b4c 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -158,6 +158,8 @@ class CudnnHolder {
     if (required_workspace_len > WorkspaceSize()) {
       ReallocateWorkspace(required_workspace_len);
     }
+    VLOG(2) << "Cudnn workspace size: "
+            << static_cast<double>(WorkspaceSize()) / (1 << 20) << " MB";
     cudnn_func(WorkspacePtr());
   }

diff --git a/paddle/fluid/pybind/const_value.cc b/paddle/fluid/pybind/const_value.cc
index 71eeaf3b53a..4f9885b5839 100644
--- a/paddle/fluid/pybind/const_value.cc
+++ b/paddle/fluid/pybind/const_value.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/pybind/const_value.h"
+#include "paddle/fluid/framework/details/memory_optimize_pass.h"
 #include "paddle/fluid/framework/ir/node.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
 #include "paddle/fluid/framework/operator.h"
@@ -33,6 +34,7 @@ void BindConstValue(pybind11::module* m) {
   m->def("kControlDepVarName",
          [] { return framework::ir::Node::kControlDepVarName; });
   m->def("kNewGradSuffix", [] { return framework::kNewGradSuffix; });
+  m->def("kMemOptSkipVars", [] { return framework::details::kMemOptSkipVars; });

   auto op_proto_and_checker_maker =
       m->def_submodule("op_proto_and_checker_maker");

diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc
index c69ccd50721..798e488f5b0 100644
--- a/paddle/fluid/pybind/ir.cc
+++ b/paddle/fluid/pybind/ir.cc
@@ -84,6 +84,12 @@ void BindGraph(py::module *m) {
             return self.Set(attr_name,
                             new std::unordered_set<const Node *>(attr));
           })
+      .def("set",
+           [](Graph &self, const std::string &attr_name,
+              const std::unordered_set<std::string> &attr) {
+             return self.Set(attr_name,
+                             new std::unordered_set<std::string>(attr));
+           })
       .def("erase", &Graph::Erase)
       .def("nodes", &Graph::Nodes, return_value_policy::reference)
       .def("create_var_node",

diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py
index ac2a40a7c25..624c9934d53 100644
--- a/python/paddle/fluid/compiler.py
+++ b/python/paddle/fluid/compiler.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import logging
 import multiprocessing
 import os
 import six
@@ -152,6 +153,39 @@ class CompiledProgram(object):
         else:
             self._places = None
         self._build_strategy.is_distribution = _is_pserver_mode(self._program)
+
+        # FIXME(dzhwinter): enable_inplace should be applied after memory_optimize;
+        # if python memory optimize is turned on, turn off the inplace pass.
+        # memory_optimize and enable_inplace default to True, but we can
+        # disable them on purpose
+        if self._program:
+            if self._program._is_mem_optimized:
+                self._build_strategy.memory_optimize = False
+                self._build_strategy.enable_inplace = False
+            elif not self._build_strategy.memory_optimize or not self._build_strategy.enable_inplace:
+                # remind the user to try our memory optimize strategy
+                logging.warn("""
+     You can try our memory optimize feature to save your memory usage:
+         # create a build_strategy variable to set memory optimize option
+         build_strategy = compiler.BuildStrategy()
+         build_strategy.enable_inplace = True
+         build_strategy.memory_optimize = True
+
+         # pass the build_strategy to with_data_parallel API
+         compiled_prog = compiler.CompiledProgram(main).with_data_parallel(
+             loss_name=loss.name, build_strategy=build_strategy)
+
+     !!! Memory optimize is our experimental feature !!!
+         some variables may be removed/reused internally to save memory usage;
+         in order to fetch the right value of the fetch_list, please set the
+         persistable property to True for each variable in fetch_list
+
+         # Sample
+         conv1 = fluid.layers.conv2d(data, 4, 5, 1, act=None)
+         # if you need to fetch conv1, then:
+         conv1.persistable = True
+
+                """)
+
+        return self

     def with_inference_optimize(self, config):
@@ -211,15 +245,6 @@ class CompiledProgram(object):
         else:
             self._exec_strategy.num_threads = len(self._places) * 2

-        # FIXME(dzhwinter): enable_inplace should be after memory_optimize
-        # if turn on python memory optimize, turn off the inplace_pass.
-        # memory_optimize and enable_inplace default are True, but we can disable them on purpose
-        if self._program and self._program._is_mem_optimized:
-            self._build_strategy.memory_optimize = False
-
-        if self._program and self._program._is_mem_optimized:
-            self._build_strategy.enable_inplace = False
-
         # TODO(wuyi): trainer endpoints should be passed in through
         # build_strategy, not program.xxx.
         if self._program and self._build_strategy.num_trainers > 1 and \

diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py
index fa8b49a0212..0b9a23e6769 100644
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -14,6 +14,7 @@

 from __future__ import print_function

+import logging
 import os
 import multiprocessing
 import numpy as np
@@ -449,6 +450,36 @@ class Executor(object):
             return as_numpy(arr)
         return [arr[i] for i in range(len(arr))]

+    def _check_fetch_vars_persistable(self, program, fetch_list):
+        for var in fetch_list:
+            if isinstance(var, Variable):
+                persistable = var.persistable
+            else:
+                block_num = program.desc.num_blocks()
+                persistable = None
+                var_name = cpt.to_bytes(var)
+                for i in six.moves.range(block_num):
+                    var_desc = program.desc.block(i).find_var(var_name)
+                    if var_desc:
+                        persistable = var_desc.persistable()
+                        break
+                assert persistable is not None, "Variable {} is not found".format(
+                    var)
+
+            if not persistable:
+                logging.warn("""
+     Detected that memory optimize or inplace is enabled, but some variables in the fetch
+     list are not persistable. You may get a wrong fetched value, or an exception may be
+     thrown because a variable in the fetch list can not be found.
+     To fix this:
+         # Sample
+         conv1 = fluid.layers.conv2d(data, 4, 5, 1, act=None)
+         # if you need to fetch conv1, then:
+         conv1.persistable = True
+
+                """)
+
     def run(self,
             program=None,
             feed=None,
@@ -532,6 +563,11 @@ class Executor(object):
                 scope=scope,
                 return_numpy=return_numpy,
                 use_program_cache=use_program_cache)
+        else:
+            if fetch_list and program._is_data_parallel and program._program and (
+                    program._build_strategy.memory_optimize or
+                    program._build_strategy.enable_inplace):
+                self._check_fetch_vars_persistable(program._program, fetch_list)

         program._compile(scope, self.place)
         if program._is_data_parallel:

diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
index 723aafb1712..b1391749c0d 100644
--- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
+++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
@@ -58,12 +58,15 @@ class TestParallelExecutorBase(unittest.TestCase):
         startup = fluid.Program()
         startup.random_seed = 1  # Fix random seed
         main.random_seed = 1
+
         with fluid.program_guard(main, startup):
             if seed is not None:
                 startup.random_seed = seed
                 main.random_seed = seed

             loss = method(use_feed=feed_dict is not None)
+            loss.persistable = True
+
             if optimizer:
                 optimizer().minimize(loss)
-- 
GitLab
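As a usage note, the warnings added to compiler.py and executor.py together imply the following end-to-end workflow. This is a sketch against the fluid 1.x API used in this patch, not part of the change itself; the network (data, conv1, loss) and the hyperparameters are illustrative only:

import paddle.fluid as fluid
import paddle.fluid.compiler as compiler

main, startup = fluid.Program(), fluid.Program()
with fluid.program_guard(main, startup):
    data = fluid.layers.data(name='img', shape=[3, 32, 32], dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    conv1 = fluid.layers.conv2d(data, 4, 5, 1, act=None)
    pred = fluid.layers.fc(input=conv1, size=10, act='softmax')
    loss = fluid.layers.mean(
        fluid.layers.cross_entropy(input=pred, label=label))
    fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

# With memory_optimize/enable_inplace on, every variable in the fetch list
# must be persistable; otherwise its buffer may be reused before it is
# fetched, which is exactly what _check_fetch_vars_persistable warns about.
conv1.persistable = True
loss.persistable = True

build_strategy = compiler.BuildStrategy()
build_strategy.memory_optimize = True
build_strategy.enable_inplace = True

compiled_prog = compiler.CompiledProgram(main).with_data_parallel(
    loss_name=loss.name, build_strategy=build_strategy)

exe = fluid.Executor(fluid.CUDAPlace(0))
exe.run(startup)
# exe.run(compiled_prog, feed={...}, fetch_list=[loss, conv1]) now returns
# stable values for both fetched variables; real feed data is elided here.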