Unverified · Commit c7c6eeb4 · authored by Zeng Jinle, committed by GitHub

Merge pull request #16409 from sneaxiy/feature/advance_gc

Enhance gc to support deleting tensor buffer in advance
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -63,7 +63,7 @@ cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto
 cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
 nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
-cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memory)
+cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memory gflags glog)
 cc_library(reader SRCS reader.cc DEPS lod_tensor ddim)
 cc_test(reader_test SRCS reader_test.cc DEPS reader)
@@ -164,6 +164,8 @@ else()
   set(NGRAPH_EXE_DEPS)
 endif()
+cc_library(executor_gc_helper SRCS executor_gc_helper.cc DEPS scope proto_desc operator garbage_collector)
 if(WITH_DISTRIBUTE)
   cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog
     lod_rank_table feed_fetch_method sendrecvop_rpc ${GLOB_DISTRIBUTE_DEPS} graph_to_program_pass variable_helper ${NGRAPH_EXE_DEPS})
@@ -174,7 +176,7 @@ else()
   cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
 endif()
-target_link_libraries(executor garbage_collector while_op_helper)
+target_link_libraries(executor while_op_helper executor_gc_helper)
 cc_library(parallel_executor SRCS parallel_executor.cc DEPS
   threaded_ssa_graph_executor scope_buffered_ssa_graph_executor parallel_ssa_graph_executor
@@ -194,6 +196,7 @@ cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_con
 cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
         proto_desc)
 cc_test(inplace_op_inference_test SRCS inplace_op_inference_test.cc DEPS op_registry proto_desc op_info memory_optimize_helper)
 cc_library(selected_rows SRCS selected_rows.cc DEPS tensor)
 cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)
......
--- a/paddle/fluid/framework/details/eager_deletion_pass.cc
+++ b/paddle/fluid/framework/details/eager_deletion_pass.cc
@@ -22,14 +22,9 @@
 #include "paddle/fluid/framework/details/computation_op_handle.h"
 #include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
+#include "paddle/fluid/framework/garbage_collector.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"
-
-DEFINE_double(memory_fraction_of_eager_deletion, 1.0,
-              "Fraction of eager deletion. If less than 1.0, all variables in "
-              "the program would be sorted according to its memory size, and "
-              "only the FLAGS_memory_fraction_of_eager_deletion of the largest "
-              "variables would be deleted.");
 
 namespace paddle {
 namespace framework {
 namespace details {
@@ -206,8 +201,9 @@ std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl(
     }
   }
-  op_vars_map = ShrinkGCVars(op_vars_map, vars, places,
-                             FLAGS_memory_fraction_of_eager_deletion);
+  double memory_fraction = framework::GetEagerDeletionMemoryFraction();
+  op_vars_map = ShrinkGCVars(op_vars_map, vars, places, memory_fraction);
 
   for (auto &pair : op_vars_map) {
     auto *op = pair.first;
@@ -239,8 +235,7 @@ std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl(
     eager_deletion_op->AddOutput(dummy_leaf);
   }
-  VLOG(10) << "FLAGS_memory_fraction_of_eager_deletion = "
-           << FLAGS_memory_fraction_of_eager_deletion;
+  VLOG(10) << "FLAGS_memory_fraction_of_eager_deletion = " << memory_fraction;
   VLOG(10) << "Create " << op_vars_map.size() << " EagerDeletionOpHandle(s)";
 
   auto while_op_eager_deletion_pass =
......
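The memory fraction read above controls how aggressively the pass schedules eager deletion. As a minimal editorial sketch of the policy the flag text describes (ShrinkGCVars itself is outside this diff; CollectGcCandidates and GetMemorySize are hypothetical helpers, not PR code):

// Editorial sketch, not part of the PR: with fraction < 1.0, only the
// largest `fraction` of candidate variables are eagerly deleted.
std::vector<VarHandle *> candidates = CollectGcCandidates();  // hypothetical
std::sort(candidates.begin(), candidates.end(),
          [](VarHandle *a, VarHandle *b) {
            return GetMemorySize(a) > GetMemorySize(b);  // hypothetical
          });
candidates.resize(static_cast<size_t>(candidates.size() * memory_fraction));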
paddle/fluid/framework/details/early_delete_op_handle.h
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/details/var_handle.h"
#include "paddle/fluid/framework/garbage_collector.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/tensor.h"
namespace paddle {
namespace framework {
namespace details {
class EarlyDeleteOpHandle : public OpHandleBase {
public:
EarlyDeleteOpHandle(ir::Node* node, const Scope* scope,
const platform::Place& place,
const std::vector<std::string>& names,
GarbageCollector* gc)
: OpHandleBase(node),
scope_(scope),
place_(place),
names_(names),
gc_(gc) {
#ifdef PADDLE_WITH_CUDA
if (IsStreamGarabageCollector()) {
auto gpu_place = boost::get<platform::CUDAPlace>(place);
PADDLE_ENFORCE(cudaSetDevice(gpu_place.device));
PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
}
#endif
}
~EarlyDeleteOpHandle() {
#ifdef PADDLE_WITH_CUDA
if (IsStreamGarabageCollector()) {
auto gpu_place = boost::get<platform::CUDAPlace>(dev_ctx_->GetPlace());
PADDLE_ENFORCE(cudaSetDevice(gpu_place.device));
PADDLE_ENFORCE(cudaEventDestroy(event_));
}
#endif
}
std::string Name() const override { return "early_delete"; }
protected:
void RunImpl() override {
std::vector<std::shared_ptr<memory::Allocation>> tensors;
auto* local_scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope*>();
for (auto& var_name : names_) {
auto* var = local_scope->FindVar(var_name);
      PADDLE_ENFORCE(var != nullptr,
                     string::Sprintf("Local scope does not have var %s", var_name));
if (var->IsType<LoDTensor>()) {
tensors.emplace_back(var->GetMutable<LoDTensor>()->MoveMemoryHolder());
} else if (var->IsType<SelectedRows>()) {
tensors.emplace_back(var->GetMutable<SelectedRows>()
->mutable_value()
->MoveMemoryHolder());
} else if (var->IsType<LoDTensorArray>()) {
LoDTensorArray* tensor_array = var->GetMutable<LoDTensorArray>();
for (auto& tensor : *tensor_array) {
tensors.emplace_back(tensor.MoveMemoryHolder());
}
}
}
if (!tensors.empty()) {
ClearTensors(tensors);
}
}
private:
void ClearTensors(
const std::vector<std::shared_ptr<memory::Allocation>>& tensors) {
if (platform::is_cpu_place(place_)) {
ClearCPUTensors(tensors);
} else {
ClearGPUTensors(tensors);
}
}
void ClearCPUTensors(
const std::vector<std::shared_ptr<memory::Allocation>>& tensors) {
auto* gc = dynamic_cast<CPUGarbageCollector*>(gc_);
if (gc != nullptr) {
gc->Add(tensors);
}
}
void ClearGPUTensors(
const std::vector<std::shared_ptr<memory::Allocation>>& tensors) {
#ifdef PADDLE_WITH_CUDA
auto* gc = dynamic_cast<StreamGarbageCollector*>(gc_);
if (gc != nullptr) {
auto compute_stream = dev_ctx_->stream();
auto callback_stream = gc->stream();
auto callback_func = [=]() {
PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream));
PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0));
};
gc_->Add(tensors, callback_func);
} else {
gc_->Add(tensors);
}
}
bool IsStreamGarabageCollector() const {
return dynamic_cast<const StreamGarbageCollector*>(gc_) != nullptr;
#endif
}
const Scope* scope_;
const platform::Place place_;
std::vector<std::string> names_;
GarbageCollector* gc_;
#ifdef PADDLE_WITH_CUDA
platform::CUDADeviceContext* dev_ctx_;
cudaEvent_t event_;
#endif
};
} // namespace details
} // namespace framework
} // namespace paddle
--- a/paddle/fluid/framework/details/op_registry.h
+++ b/paddle/fluid/framework/details/op_registry.h
@@ -21,6 +21,7 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/framework/grad_op_desc_maker.h"
 #include "paddle/fluid/framework/inplace_op_inference.h"
+#include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
 #include "paddle/fluid/framework/operator.h"
@@ -36,27 +37,86 @@ enum OpInfoFillType {
   kGradOpDescMaker = 2,
   kVarTypeInference = 3,
   kShapeInference = 4,
-  kInplaceOpInference = 5
+  kInplaceOpInference = 5,
+  kNoNeedBufferVarsInference = 6,
+  kUnknown = -1
 };
+
+namespace internal {
+template <typename T, OpInfoFillType kType>
+struct TypePair {
+  using Type = T;
+  static constexpr OpInfoFillType kFillType = kType;
+};
+
+using OpRegistryClasses = std::tuple<                                // NOLINT
+    TypePair<OperatorBase, kOperator>,                               // NOLINT
+    TypePair<OpProtoAndCheckerMaker, kOpProtoAndCheckerMaker>,       // NOLINT
+    TypePair<GradOpDescMakerBase, kGradOpDescMaker>,                 // NOLINT
+    TypePair<VarTypeInference, kVarTypeInference>,                   // NOLINT
+    TypePair<InferShapeBase, kShapeInference>,                       // NOLINT
+    TypePair<InplaceOpInference, kInplaceOpInference>,               // NOLINT
+    TypePair<NoNeedBufferVarsInference, kNoNeedBufferVarsInference>  // NOLINT
+    >;
+
+static constexpr int kOpRegistryClassNumber =
+    std::tuple_size<OpRegistryClasses>::value;
+
+template <typename T, int kPos, bool kIsBounded /* = true*/>
+struct IsMatchedBaseTypeImpl {
+  using PairType = typename std::tuple_element<kPos, OpRegistryClasses>::type;
+  static constexpr bool kValue =
+      std::is_base_of<typename PairType::Type, T>::value;
+};
+
+template <typename T, int kPos>
+struct IsMatchedBaseTypeImpl<T, kPos, false> {
+  static constexpr bool kValue = false;
+};
+
+template <typename T, int kPos>
+static inline constexpr bool IsMatchedBaseType() {
+  return IsMatchedBaseTypeImpl<
+      T, kPos, (kPos >= 0 && kPos < kOpRegistryClassNumber)>::kValue;
+}
+
+template <typename T, int kStart, int kEnd, bool kIsEnd, bool kIsMatched>
+struct OpInfoFillTypeGetterImpl {};
+
+// This case should not happen
+template <typename T, int kStart, int kEnd>
+struct OpInfoFillTypeGetterImpl<T, kStart, kEnd, true, true> {};
+
+template <typename T, int kStart, int kEnd>
+struct OpInfoFillTypeGetterImpl<T, kStart, kEnd, true, false> {
+  static constexpr OpInfoFillType kType = kUnknown;
+};
+
+template <typename T, int kStart, int kEnd>
+struct OpInfoFillTypeGetterImpl<T, kStart, kEnd, false, false> {
+  static constexpr OpInfoFillType kType =
+      OpInfoFillTypeGetterImpl<T, kStart + 1, kEnd, kStart + 1 == kEnd,
+                               IsMatchedBaseType<T, kStart + 1>()>::kType;
+};
+
+template <typename T, int kStart, int kEnd>
+struct OpInfoFillTypeGetterImpl<T, kStart, kEnd, false, true> {
+  using PairType = typename std::tuple_element<kStart, OpRegistryClasses>::type;
+  static constexpr OpInfoFillType kType = PairType::kFillType;
+};
+
+template <typename T>
+using OpInfoFillTypeGetter =
+    OpInfoFillTypeGetterImpl<T, 0, kOpRegistryClassNumber,
+                             kOpRegistryClassNumber == 0,
+                             IsMatchedBaseType<T, 0>()>;
+}  // namespace internal
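The recursion above walks OpRegistryClasses at compile time and returns the tag of the first base class that T derives from. A self-contained editorial sketch of the same technique, reduced to toy types (Animal/Vehicle/Tag/TagOf are all hypothetical names, not part of the PR):

// Editorial sketch: compile-time scan over a std::tuple of (BaseType, Tag)
// pairs, mirroring OpRegistryClasses/OpInfoFillTypeGetter above.
#include <tuple>
#include <type_traits>

struct Animal {};
struct Vehicle {};
enum Tag { kAnimalTag = 0, kVehicleTag = 1, kUnknownTag = -1 };

template <typename T, Tag kTag>
struct Pair {
  using Type = T;
  static constexpr Tag tag = kTag;
};
using Registry = std::tuple<Pair<Animal, kAnimalTag>, Pair<Vehicle, kVehicleTag>>;

// Primary template: test position kPos, recurse on mismatch.
template <typename T, int kPos, int kSize>
struct TagOf {
  using P = typename std::tuple_element<kPos, Registry>::type;
  static constexpr Tag value = std::is_base_of<typename P::Type, T>::value
                                   ? P::tag
                                   : TagOf<T, kPos + 1, kSize>::value;
};
// End of tuple reached without a match.
template <typename T, int kSize>
struct TagOf<T, kSize, kSize> {
  static constexpr Tag value = kUnknownTag;
};

struct Dog : Animal {};
static_assert(TagOf<Dog, 0, 2>::value == kAnimalTag, "matched first base");
static_assert(TagOf<int, 0, 2>::value == kUnknownTag, "no base matched");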
 template <typename T>
 struct OpInfoFillTypeID {
   static constexpr OpInfoFillType ID() {
-    return std::is_base_of<OperatorBase, T>::value
-               ? kOperator
-               : (std::is_base_of<OpProtoAndCheckerMaker, T>::value
-                      ? kOpProtoAndCheckerMaker
-                      : (std::is_base_of<GradOpDescMakerBase, T>::value
-                             ? kGradOpDescMaker
-                             : (std::is_base_of<VarTypeInference, T>::value
-                                    ? kVarTypeInference
-                                    : (std::is_base_of<InferShapeBase, T>::value
-                                           ? kShapeInference
-                                           : (std::is_base_of<
-                                                  InplaceOpInference, T>::value
-                                                  ? kInplaceOpInference
-                                                  : static_cast<OpInfoFillType>(
-                                                        -1))))));
+    return internal::OpInfoFillTypeGetter<T>::kType;
   }
 };
@@ -156,6 +216,18 @@ struct OpInfoFiller<T, kInplaceOpInference> {
   }
 };
+
+template <typename T>
+struct OpInfoFiller<T, kNoNeedBufferVarsInference> {
+  void operator()(const char* op_type, OpInfo* info) const {
+    info->infer_no_need_buffer_vars_ = [](const VariableNameMap& inputs,
+                                          const VariableNameMap& outputs,
+                                          const AttributeMap& attrs) {
+      T infer(inputs, outputs, attrs);
+      return infer();
+    };
+  }
+};
+
 }  // namespace details
 }  // namespace framework
......
--- a/paddle/fluid/framework/details/reference_count_pass.cc
+++ b/paddle/fluid/framework/details/reference_count_pass.cc
@@ -193,6 +193,79 @@ ExtractComputationOpFromLastLivedVar(VarHandle *var, size_t scope_idx,
   return shrink_func(computation_op);
 }
+
+/**
+ * Shrink op dependencies according to no-need-buffer vars.
+ *
+ * If some ops do not need the Tensor buffer of any input,
+ * just remove the dependency of this op, i.e., decrease its reference count.
+ *
+ * For example, input Y of the elementwise_add_grad op is only used to infer
+ * the shape and lod of Y@GRAD, so we do not need the buffer of input Y. The
+ * data buffer of input Y can be collected before elementwise_add_grad runs.
+ *
+ * This method shrinks the op dependencies if possible, and returns whether
+ * the dependency count decreases to 0.
+ */
+static bool ShrinkNoNeedBufferVarOpDependency(
+    const std::string &var_name,
+    std::unordered_set<ComputationOpHandle *> *op_handles) {
+  std::vector<ComputationOpHandle *> skip_ops;
+  for (auto *op_handle : *op_handles) {
+    auto *op_base = op_handle->GetOp();
+    auto &inferer = op_base->Info().NoNeedBufferVarsInferer();
+    if (!inferer) {
+      continue;
+    }
+
+    std::unordered_set<std::string> no_need_buffer_vars =
+        inferer(op_base->Inputs(), op_base->Outputs(), op_base->Attrs());
+
+    // Check whether var_name occurs in other inputs or outputs of the op.
+    // If it occurs, we cannot decrease the dependency count.
+    bool occurred_in_other_vars = false;
+    for (auto &in_pair : op_base->Inputs()) {
+      if (no_need_buffer_vars.count(in_pair.first) > 0) {
+        continue;
+      }
+
+      auto &args = in_pair.second;
+      auto iter = std::find(args.begin(), args.end(), var_name);
+      if (iter != args.end()) {
+        occurred_in_other_vars = true;
+        break;
+      }
+    }
+
+    if (occurred_in_other_vars) {
+      continue;
+    }
+
+    for (auto &out_pair : op_base->Outputs()) {
+      auto &args = out_pair.second;
+      auto iter = std::find(args.begin(), args.end(), var_name);
+      if (iter != args.end()) {
+        occurred_in_other_vars = true;
+        break;
+      }
+    }
+
+    if (!occurred_in_other_vars) {
+      VLOG(2) << "Shrink var " << var_name << " in op " << op_handle->Name();
+      skip_ops.emplace_back(op_handle);
+    }
+  }
+
+  if (skip_ops.size() == op_handles->size()) {
+    op_handles->clear();
+    return true;
+  } else {
+    for (auto *skip_op : skip_ops) {
+      op_handles->erase(skip_op);
+    }
+    return false;
+  }
+}
+
 std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
     std::unique_ptr<ir::Graph> graph) const {
   auto &ref_cnts = Get<std::vector<ReferenceCountMap>>(kGlobalReferenceCount);
@@ -229,17 +302,43 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
         continue;
       }
 
-      bool ok;
-      auto result = ExtractComputationOpFromLastLivedVar(
-          name_var_pair.second.back(), i, shrink_func, &ok);
+      auto &var_name = name_var_pair.first;
+      auto &var_handles = name_var_pair.second;
+
+      for (auto iter = var_handles.rbegin(); iter != var_handles.rend();
+           ++iter) {
+        bool ok;
+        auto result =
+            ExtractComputationOpFromLastLivedVar(*iter, i, shrink_func, &ok);
+
+        // Rarely, some vars may have no pending or preceding computation ops.
+        // Just break.
+        if (!ok) break;
+
+        VLOG(10) << "Extract " << result.size() << " ops of var " << var_name;
+
+        size_t original_op_deps = result.size();
+        // If no op needs the buffer of var_name, compute the reference count
+        // of the previous version of var_name instead.
+        if (ShrinkNoNeedBufferVarOpDependency(var_name, &result)) {
+          VLOG(10) << "Try to precede reference count computing at var "
+                   << var_name;
+          continue;
+        }
+
+        size_t final_op_deps = result.size();
+        if (final_op_deps < original_op_deps) {
+          VLOG(5) << "Shrink op deps from " << original_op_deps << " to "
+                  << final_op_deps;
+        }
 
-      if (ok) {
-        auto &var_name = name_var_pair.first;
         PADDLE_ENFORCE(!result.empty(), "Last living ops of %s cannot be empty",
                        var_name);
         ref_cnts[i].emplace(var_name, result.size());
         last_live_ops_of_vars[i].emplace(var_name, std::move(result));
+        break;
       }
+
+      // Rarely, all the preceding tries may fail.
+      // Just skip this corner case.
     }
   }
......
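The elementwise_add_grad example in the comment above maps directly onto the DECLARE_NO_NEED_BUFFER_VARS_INFERENCE macro introduced later in this diff (see no_need_buffer_vars_inference.h below). A hedged sketch of how such an op could opt in, assuming it follows the same registration pattern as concat_op.cc below; the class name here is hypothetical and the elementwise registration is not part of this diff:

// Hypothetical: Y is only read for its shape/lod when producing Y@GRAD, so
// its buffer may be collected before elementwise_add_grad runs.
DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(ElementwiseAddGradNoNeedBufferVarsInference,
                                      "Y");

REGISTER_OPERATOR(elementwise_add_grad, ops::ElementwiseOpGrad,
                  ops::ElementwiseAddGradNoNeedBufferVarsInference);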
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <unordered_set>
 #include <utility>
 
+#include "paddle/fluid/framework/executor_gc_helper.h"
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/framework/lod_rank_table.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
@@ -48,97 +49,23 @@ namespace {
 int kProgramId = -1;
 }  // namespace
 
-static std::unordered_map<std::string, size_t> GetNonPersistableReferenceCounts(
-    const BlockDesc& block, const std::vector<std::string>& skip_var_list) {
-  std::unordered_map<std::string, size_t> ref_cnts;
-  std::unordered_set<std::string> skip_vars(skip_var_list.begin(),
-                                            skip_var_list.end());
-
-  auto update_ref_cnts = [&](OpDesc* op_desc, const VariableNameMap& name_map) {
-    for (auto& name_pair : name_map) {
-      for (auto& name : name_pair.second) {
-        if (skip_vars.count(name)) continue;
-        auto* var_desc = block.FindVar(name);
-        if (var_desc == nullptr || var_desc->Persistable()) continue;
-        auto type = var_desc->Proto()->type().type();
-        if (type != proto::VarType::LOD_TENSOR &&
-            type != proto::VarType::SELECTED_ROWS &&
-            type != proto::VarType::LOD_TENSOR_ARRAY) {
-          continue;
-        }
-        ++ref_cnts[name];
-      }
-    }
-  };
-
-  for (auto op_desc : block.AllOps()) {
-    update_ref_cnts(op_desc, op_desc->Inputs());
-    update_ref_cnts(op_desc, op_desc->Outputs());
-  }
-  return ref_cnts;
-}
-
 ExecutorPrepareContext::ExecutorPrepareContext(
-    const framework::ProgramDesc& prog, size_t block_id,
-    const std::vector<std::string>& keep_vars, bool force_disable_gc)
-    : prog_(prog), block_id_(block_id), force_disable_gc_(force_disable_gc) {
-  if (GetEagerDeletionThreshold() >= 0 && !force_disable_gc_) {
-    global_ref_cnts_ =
-        GetNonPersistableReferenceCounts(prog.Block(block_id), keep_vars);
+    const framework::ProgramDesc& prog, size_t block_id)
+    : prog_(prog), block_id_(block_id) {}
+
+void ExecutorPrepareContext::PrepareUnusedVars(
+    const std::vector<std::string>& keep_vars, bool force_disable_gc) {
+  force_disable_gc_ = force_disable_gc;
+  if (GetEagerDeletionThreshold() < 0 || force_disable_gc_) {
+    return;
   }
+  unused_vars_ = GetUnusedVars(prog_.Block(block_id_), ops_, keep_vars);
 }
 
 ExecutorPrepareContext::~ExecutorPrepareContext() {
   VLOG(5) << "destroy ExecutorPrepareContext";
 }
 
-static void DeleteUnusedTensors(
-    const Scope& scope, const OperatorBase* op, GarbageCollector* gc,
-    std::unordered_map<std::string, size_t>* ref_cnts) {
-  std::deque<std::shared_ptr<memory::Allocation>> garbages;
-
-  auto handler = [&](const VariableNameMap& name_map) {
-    for (auto& name_pair : name_map) {
-      for (auto& name : name_pair.second) {
-        auto it = ref_cnts->find(name);
-        if (it == ref_cnts->end()) continue;
-        if (--(it->second) != 0) {
-          continue;
-        }
-        auto* var = scope.FindVar(name);
-        if (var == nullptr) {
-          continue;
-        }
-
-        VLOG(2) << "Erase variable " << name;
-        if (var->IsType<LoDTensor>()) {
-          garbages.emplace_back(
-              var->GetMutable<LoDTensor>()->MoveMemoryHolder());
-        } else if (var->IsType<SelectedRows>()) {
-          garbages.emplace_back(var->GetMutable<SelectedRows>()
-                                    ->mutable_value()
-                                    ->MoveMemoryHolder());
-        } else if (var->IsType<LoDTensorArray>()) {
-          auto* lod_tensor_arr = var->GetMutable<LoDTensorArray>();
-          for (auto& t : *lod_tensor_arr) {
-            garbages.emplace_back(t.MoveMemoryHolder());
-          }
-        } else {
-          PADDLE_THROW("Type %s of %s is not supported eager deletion",
-                       framework::ToTypeName(var->Type()), name);
-        }
-      }
-    }
-  };
-
-  handler(op->Inputs());
-  handler(op->Outputs());
-
-  if (!garbages.empty()) {
-    gc->Add(std::move(garbages));
-  }
-}
-
 Executor::Executor(const platform::Place& place) : place_(place) {}
 
 void Executor::Close() {
@@ -362,8 +289,8 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
 std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
     const ProgramDesc& program, int block_id,
     const std::vector<std::string>& skip_ref_cnt_vars, bool force_disable_gc) {
-  std::unique_ptr<ExecutorPrepareContext> ctx(new ExecutorPrepareContext(
-      program, block_id, skip_ref_cnt_vars, force_disable_gc));
+  std::unique_ptr<ExecutorPrepareContext> ctx(
+      new ExecutorPrepareContext(program, block_id));
   PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), program.Size());
   auto& block = program.Block(block_id);
   for (auto& op_desc : block.AllOps()) {
@@ -375,6 +302,7 @@ std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
         ctx->prog_.Block(ctx->block_id_), &ctx->ops_);
   }
 #endif
+  ctx->PrepareUnusedVars(skip_ref_cnt_vars, force_disable_gc);
   return ctx;
 }
 
@@ -389,19 +317,17 @@ std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
   std::vector<std::shared_ptr<ExecutorPrepareContext>> result;
   size_t idx = 0;
   for (auto& bid : block_ids) {
-    ExecutorPrepareContext* ctx;
-    if (skip_ref_cnt_vars.empty()) {
-      ctx = new ExecutorPrepareContext(program, bid, std::vector<std::string>(),
-                                       force_disable_gc);
-    } else {
-      ctx = new ExecutorPrepareContext(program, bid, skip_ref_cnt_vars[idx],
-                                       force_disable_gc);
-    }
     PADDLE_ENFORCE_LT(static_cast<size_t>(bid), program.Size());
+    auto* ctx = new ExecutorPrepareContext(program, bid);
     auto& block = program.Block(bid);
     for (auto& op_desc : block.AllOps()) {
       ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
     }
+    if (skip_ref_cnt_vars.empty()) {
+      ctx->PrepareUnusedVars(std::vector<std::string>(), force_disable_gc);
+    } else {
+      ctx->PrepareUnusedVars(skip_ref_cnt_vars[idx], force_disable_gc);
+    }
     result.push_back(std::shared_ptr<ExecutorPrepareContext>(ctx));
     ++idx;
   }
@@ -425,7 +351,6 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
   // FIXME(zjl): recurrent_op is rather complex, we would
   // disable gc forcely in recurrent_op
   if (!ctx->force_disable_gc_ && max_memory_size >= 0) {
-    ctx->ResetReferenceCount();
 #ifdef PADDLE_WITH_CUDA
     if (platform::is_gpu_place(place_)) {
       if (IsFastEagerDeletionModeEnabled()) {
@@ -453,8 +378,7 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
     op->Run(*local_scope, place_);
 
     if (gc) {
-      DeleteUnusedTensors(*local_scope, op.get(), gc.get(),
-                          &(ctx->runtime_ref_cnts_));
+      DeleteUnusedTensors(*local_scope, op.get(), ctx->unused_vars_, gc.get());
     }
   }
......
--- a/paddle/fluid/framework/executor.h
+++ b/paddle/fluid/framework/executor.h
@@ -30,22 +30,20 @@ namespace paddle {
 namespace framework {
 
 struct ExecutorPrepareContext {
-  ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id,
-                         const std::vector<std::string>& skip_ref_cnt_vars =
-                             std::vector<std::string>(),
-                         bool force_disable_gc = false);
+  ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id);
 
   ~ExecutorPrepareContext();
 
-  void ResetReferenceCount() { runtime_ref_cnts_ = global_ref_cnts_; }
+  void PrepareUnusedVars(const std::vector<std::string>& keep_vars,
+                         bool force_disable_gc = false);
 
   const framework::ProgramDesc& prog_;
-  size_t block_id_;
-  bool force_disable_gc_;
+  const size_t block_id_;
+
   std::vector<std::unique_ptr<OperatorBase>> ops_;
 
-  std::unordered_map<std::string, size_t> global_ref_cnts_;
-  std::unordered_map<std::string, size_t> runtime_ref_cnts_;
+  std::unordered_map<OperatorBase*, std::vector<std::string>> unused_vars_;
+  bool force_disable_gc_{false};
 };
 
 class Executor {
......
--- /dev/null
+++ b/paddle/fluid/framework/executor_gc_helper.cc
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/executor_gc_helper.h"
#include <deque>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "glog/logging.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
struct OpInOutInfo {
public:
void Build(const OperatorBase *op) {
is_built_ = true;
auto &inferer = op->Info().NoNeedBufferVarsInferer();
if (inferer) {
no_need_buffer_ins_ = inferer(op->Inputs(), op->Outputs(), op->Attrs());
if (no_need_buffer_ins_.empty()) return;
for (auto &in_name_pair : op->Inputs()) {
if (no_need_buffer_ins_.count(in_name_pair.first) != 0) {
continue;
}
for (auto &in_arg_name : in_name_pair.second) {
other_args_set_.insert(in_arg_name);
}
}
for (auto &out_name_pair : op->Outputs()) {
for (auto &out_arg_name : out_name_pair.second) {
other_args_set_.insert(out_arg_name);
}
}
}
}
bool IsBuilt() const { return is_built_; }
bool IsInArgBufferNeeded(const std::string &in_arg_name) const {
return no_need_buffer_ins_.empty() ||
other_args_set_.count(in_arg_name) != 0;
}
private:
// A set to record unused buffer input vars of op
std::unordered_set<std::string> no_need_buffer_ins_;
// A set to record other args of op (including in, out)
std::unordered_set<std::string> other_args_set_;
bool is_built_{false};
};
static bool VarCanBeDeleted(const std::string &name, const BlockDesc &block,
const std::unordered_set<std::string> &skip_vars) {
if (skip_vars.count(name) != 0) {
return false;
}
auto *var_desc = block.FindVar(name);
if (var_desc == nullptr || var_desc->Persistable()) {
return false;
}
auto type = var_desc->Proto()->type().type();
return type == proto::VarType::LOD_TENSOR ||
type == proto::VarType::SELECTED_ROWS ||
type == proto::VarType::LOD_TENSOR_ARRAY;
}
std::unordered_map<OperatorBase *, std::vector<std::string>> GetUnusedVars(
const BlockDesc &block,
const std::vector<std::unique_ptr<OperatorBase>> &ops,
const std::vector<std::string> &skip_var_list) {
std::unordered_set<std::string> skip_vars(skip_var_list.begin(),
skip_var_list.end());
std::unordered_map<std::string, size_t> var_op_idx_map;
for (size_t i = 0; i < ops.size(); ++i) {
auto *op = ops[i].get();
OpInOutInfo info;
for (auto &name_pair : op->Inputs()) {
for (auto &name : name_pair.second) {
if (!VarCanBeDeleted(name, block, skip_vars)) {
continue;
}
// var can be gc-ed
if (!info.IsBuilt()) {
info.Build(op);
}
if (info.IsInArgBufferNeeded(name)) {
// Update the last living op of variable to current op
var_op_idx_map[name] = i;
} else {
VLOG(10) << "Skip reference count computing of variable "
<< name_pair.first << "(" << name << ") in Operator "
<< op->Type();
}
}
}
for (auto &name_pair : op->Outputs()) {
for (auto &name : name_pair.second) {
if (VarCanBeDeleted(name, block, skip_vars)) {
// Update the last living op of variable to current op
var_op_idx_map[name] = i;
}
}
}
}
std::unordered_map<OperatorBase *, std::vector<std::string>> result;
for (auto &name_op_idx_pair : var_op_idx_map) {
auto &name = name_op_idx_pair.first;
size_t op_idx = name_op_idx_pair.second;
result[ops[op_idx].get()].emplace_back(name);
}
return result;
}
void DeleteUnusedTensors(
const Scope &scope, OperatorBase *op,
const std::unordered_map<OperatorBase *, std::vector<std::string>>
&delete_vars_map,
GarbageCollector *gc) {
auto iter = delete_vars_map.find(op);
if (iter == delete_vars_map.end()) {
return;
}
auto &delete_vars = iter->second;
std::deque<std::shared_ptr<memory::Allocation>> garbages;
for (auto &var_name : delete_vars) {
auto *var = scope.FindVar(var_name);
if (var == nullptr) {
continue;
}
VLOG(2) << "Erase variable " << var_name;
if (var->IsType<LoDTensor>()) {
garbages.emplace_back(var->GetMutable<LoDTensor>()->MoveMemoryHolder());
} else if (var->IsType<SelectedRows>()) {
garbages.emplace_back(
var->GetMutable<SelectedRows>()->mutable_value()->MoveMemoryHolder());
} else if (var->IsType<LoDTensorArray>()) {
auto *lod_tensor_arr = var->GetMutable<LoDTensorArray>();
for (auto &t : *lod_tensor_arr) {
garbages.emplace_back(t.MoveMemoryHolder());
}
} else {
PADDLE_THROW("Type %s of %s is not supported eager deletion",
framework::ToTypeName(var->Type()), var_name);
}
}
if (!garbages.empty()) {
gc->Add(std::move(garbages));
}
}
} // namespace framework
} // namespace paddle
--- /dev/null
+++ b/paddle/fluid/framework/executor_gc_helper.h
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/garbage_collector.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/scope.h"
namespace paddle {
namespace framework {
// Result map: op -> variable names that can be deleted after op runs
std::unordered_map<OperatorBase *, std::vector<std::string>> GetUnusedVars(
const BlockDesc &block,
const std::vector<std::unique_ptr<OperatorBase>> &ops,
const std::vector<std::string> &skip_vars);
// Collect unused tensors after op runs
void DeleteUnusedTensors(
const Scope &scope, OperatorBase *op,
const std::unordered_map<OperatorBase *, std::vector<std::string>>
&delete_vars_map,
GarbageCollector *gc);
} // namespace framework
} // namespace paddle
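Taken together, these two declarations replace the per-run reference counting that executor.cc used to do: the op-to-dead-vars map is built once at prepare time, then consulted after each op runs. A minimal sketch of the wiring, mirroring the RunPreparedContext change above (variable names here are illustrative):

// Build the op -> deletable-vars map once, at Prepare() time.
auto unused_vars = GetUnusedVars(block, ops, /*skip_vars=*/{});

// At run time, free each op's dead variables right after it executes.
for (auto &op : ops) {
  op->Run(*local_scope, place);
  if (gc) {
    DeleteUnusedTensors(*local_scope, op.get(), unused_vars, gc.get());
  }
}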
--- a/paddle/fluid/framework/garbage_collector.cc
+++ b/paddle/fluid/framework/garbage_collector.cc
@@ -13,14 +13,36 @@
 // limitations under the License.
 
 #include <algorithm>
+#include <deque>
+#include <functional>
+#include <memory>
+#include <mutex>  // NOLINT
+#include <utility>
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #endif
+#include "gflags/gflags.h"
+#include "glog/logging.h"
 #include "paddle/fluid/framework/garbage_collector.h"
 
 namespace paddle {
 namespace framework {
 
+DEFINE_double(
+    eager_delete_tensor_gb, -1.0,
+    "Memory size threshold (GB) at which the garbage collector clears "
+    "tensors. Disabled when this value is less than 0");
+
+DEFINE_bool(fast_eager_deletion_mode, true,
+            "Fast eager deletion mode. If enabled, memory is released "
+            "immediately, without waiting for the GPU kernel to end.");
+
+DEFINE_double(memory_fraction_of_eager_deletion, 1.0,
+              "Fraction of eager deletion. If less than 1.0, all variables in "
+              "the program would be sorted according to its memory size, and "
+              "only the FLAGS_memory_fraction_of_eager_deletion of the largest "
+              "variables would be deleted.");
+
 GarbageCollector::GarbageCollector(const platform::Place &place,
                                    size_t max_memory_size)
     : max_memory_size_((std::max)(max_memory_size, static_cast<size_t>(1))) {
@@ -85,5 +107,25 @@ void StreamGarbageCollector::ClearCallback(
   callback_manager_->AddCallback(callback);
 }
 #endif
+
+int64_t GetEagerDeletionThreshold() {
+  return FLAGS_eager_delete_tensor_gb < 0
+             ? -1
+             : static_cast<int64_t>(FLAGS_eager_delete_tensor_gb *
+                                    (static_cast<int64_t>(1) << 30));
+}
+
+bool IsFastEagerDeletionModeEnabled() { return FLAGS_fast_eager_deletion_mode; }
+
+void SetEagerDeletionMode(double threshold, double fraction, bool fast_mode) {
+  FLAGS_eager_delete_tensor_gb = threshold;
+  FLAGS_memory_fraction_of_eager_deletion = fraction;
+  FLAGS_fast_eager_deletion_mode = fast_mode;
+}
+
+double GetEagerDeletionMemoryFraction() {
+  return FLAGS_memory_fraction_of_eager_deletion;
+}
+
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/garbage_collector.h
+++ b/paddle/fluid/framework/garbage_collector.h
@@ -18,6 +18,8 @@
 #include <functional>
 #include <memory>
 #include <mutex>  // NOLINT
+#include <utility>
+#include "gflags/gflags.h"
 #include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
@@ -126,5 +128,12 @@ void GarbageCollector::Add(Container &&objs, Callback &&callback) {
   }
 }
 
+int64_t GetEagerDeletionThreshold();
+bool IsFastEagerDeletionModeEnabled();
+
+void SetEagerDeletionMode(double threshold, double fraction, bool fast_mode);
+
+double GetEagerDeletionMemoryFraction();
+
 }  // namespace framework
 }  // namespace paddle
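A small worked example of the helpers declared above; the values are illustrative:

// Editorial sketch of the new API (declared in garbage_collector.h).
using namespace paddle::framework;

SetEagerDeletionMode(/*threshold=*/0.5, /*fraction=*/0.8, /*fast_mode=*/true);
int64_t bytes = GetEagerDeletionThreshold();     // 0.5 * 2^30 = 536870912 bytes
bool fast = IsFastEagerDeletionModeEnabled();    // true
double frac = GetEagerDeletionMemoryFraction();  // 0.8
// A negative threshold, e.g. SetEagerDeletionMode(-1.0, 1.0, false), makes
// GetEagerDeletionThreshold() return -1, which disables the GC entirely.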
--- /dev/null
+++ b/paddle/fluid/framework/no_need_buffer_vars_inference.h
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/op_desc.h"
namespace paddle {
namespace framework {
class NoNeedBufferVarsInference {
public:
NoNeedBufferVarsInference(const VariableNameMap &inputs,
const VariableNameMap &outputs,
const AttributeMap &attrs)
: inputs_(inputs), outputs_(outputs), attrs_(attrs) {}
virtual ~NoNeedBufferVarsInference() = default;
const VariableNameMap &Inputs() const { return inputs_; }
const VariableNameMap &Outputs() const { return outputs_; }
const AttributeMap &Attrs() const { return attrs_; }
virtual std::unordered_set<std::string> operator()() const = 0;
private:
const VariableNameMap &inputs_;
const VariableNameMap &outputs_;
const AttributeMap &attrs_;
};
#define DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(class_type, ...) \
class class_type : public ::paddle::framework::NoNeedBufferVarsInference { \
public: \
using ::paddle::framework::NoNeedBufferVarsInference:: \
NoNeedBufferVarsInference; \
\
std::unordered_set<std::string> operator()() const override { \
return {__VA_ARGS__}; \
} \
}
} // namespace framework
} // namespace paddle
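Usage is a one-liner per op; the concat change later in this diff is the canonical example. The macro expands to a functor whose operator() returns {"X"}, and passing the class to REGISTER_OPERATOR routes it through the OpInfoFiller<T, kNoNeedBufferVarsInference> specialization shown in op_registry.h above:

// From concat_op.cc below: concat_grad reads only the metadata of "X".
DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(ConcatOpGradNoNeedBufferVarInference,
                                      "X");

REGISTER_OPERATOR(concat_grad, ops::ConcatOpGrad,
                  ops::ConcatOpGradNoNeedBufferVarInference);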
--- a/paddle/fluid/framework/op_info.h
+++ b/paddle/fluid/framework/op_info.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <unordered_map>
 
 #include "paddle/fluid/framework/attribute.h"
+#include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
 #include "paddle/fluid/framework/type_defs.h"
 #include "paddle/fluid/platform/macros.h"
 
@@ -39,6 +40,7 @@ struct OpInfo {
   InferVarTypeFN infer_var_type_;
   InferShapeFN infer_shape_;
   InferInplaceOpFN infer_inplace_;
+  InferNoNeedBufferVarsFN infer_no_need_buffer_vars_;
 
   bool HasOpProtoAndChecker() const {
     return proto_ != nullptr && checker_ != nullptr;
@@ -64,6 +66,10 @@ struct OpInfo {
   }
 
   const OpAttrChecker* Checker() const { return checker_; }
+
+  const InferNoNeedBufferVarsFN& NoNeedBufferVarsInferer() const {
+    return infer_no_need_buffer_vars_;
+  }
 };
 
 class OpInfoMap {
......
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <algorithm>
 #include <sstream>
 #include <string>
+#include <unordered_set>
 #include <vector>
 #include "paddle/fluid/framework/data_transform.h"
 #include "paddle/fluid/framework/executor.h"
@@ -326,7 +327,12 @@ OperatorBase::OperatorBase(const std::string& type,
                            const VariableNameMap& inputs,
                            const VariableNameMap& outputs,
                            const AttributeMap& attrs)
-    : type_(type), inputs_(inputs), outputs_(outputs), attrs_(attrs) {
+    : type_(type),
+      inputs_(inputs),
+      outputs_(outputs),
+      attrs_(attrs),
+      // NOTE(zjl): why op_info may be nullptr?
+      info_(OpInfoMap::Instance().GetNullable(type)) {
   GenerateTemporaryNames();
   CheckAllInputOutputSet();
 }
@@ -350,7 +356,7 @@ std::vector<std::string> OperatorBase::OutputVars(bool has_intermediate) const {
     }
     return ret_val;
   }
-  auto& info = OpInfoMap::Instance().Get(Type());
+  auto& info = Info();
 
   // get all OpProto::Var for outputs
   for (auto& o : info.Proto().outputs()) {
@@ -366,18 +372,16 @@ std::vector<std::string> OperatorBase::OutputVars(bool has_intermediate) const {
 }
 
 void OperatorBase::CheckAllInputOutputSet() const {
-  auto& info_map = OpInfoMap::Instance();
-  auto* op_info = info_map.GetNullable(Type());
-  if (op_info == nullptr || op_info->proto_ == nullptr) return;
+  if (info_ == nullptr || info_->proto_ == nullptr) return;
 
-  for (auto& in : op_info->Proto().inputs()) {
+  for (auto& in : info_->Proto().inputs()) {
     if (!in.dispensable()) {
       PADDLE_ENFORCE(inputs_.find(in.name()) != inputs_.end(),
                      "Operator %s's input, %s, is not set", Type(), in.name());
     }
   }
 
-  for (auto& out : op_info->Proto().outputs()) {
+  for (auto& out : info_->Proto().outputs()) {
     if (!out.dispensable()) {
       PADDLE_ENFORCE(outputs_.find(out.name()) != outputs_.end(),
                      "Operator %s's output, %s, is not set", Type(),
@@ -997,7 +1001,27 @@ Scope* OperatorWithKernel::PrepareData(
     std::vector<std::string>* transfered_inplace_vars,
     RuntimeContext* ctx) const {
   Scope* new_scope = nullptr;
+
+  std::unordered_set<std::string> no_buffer_ins;
+  if (info_) {
+    auto& no_buffer_inferer = info_->NoNeedBufferVarsInferer();
+    // Some ops may not register a NoNeedBufferVarsInferer
+    if (no_buffer_inferer) {
+      no_buffer_ins = no_buffer_inferer(Inputs(), Outputs(), Attrs());
+    }
+  }
+
   for (auto& var_name_item : Inputs()) {
+    // NOTE(zjl): STL does not guarantee fast std::unordered_set::count when
+    // the set is empty. At least the STL implemented on my mac calculates
+    // the hash code of the search key even though the set is empty.
+    if (!no_buffer_ins.empty() &&
+        no_buffer_ins.count(var_name_item.first) > 0) {
+      VLOG(1) << "Skip scanning input " << var_name_item.first
+              << " in Operator " << type_;
+      continue;
+    }
+
     std::vector<Variable*>& input_vars = ctx->inputs[var_name_item.first];
     for (size_t i = 0; i < var_name_item.second.size(); ++i) {
......
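The NOTE about std::unordered_set::count in the PrepareData change above, distilled into a standalone illustration (this snippet is editorial, not PR code):

#include <string>
#include <unordered_set>

bool ShouldSkip(const std::unordered_set<std::string> &no_buffer_ins,
                const std::string &name) {
  // Checking empty() first short-circuits before count() hashes `name`;
  // some STL implementations hash the key even when the set is empty.
  return !no_buffer_ins.empty() && no_buffer_ins.count(name) > 0;
}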
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -160,6 +160,11 @@ class OperatorBase {
   const VariableNameMap& Inputs() const { return inputs_; }
   const VariableNameMap& Outputs() const { return outputs_; }
 
+  const OpInfo& Info() const {
+    PADDLE_ENFORCE_NOT_NULL(info_, "OpInfo of %s is not found", type_);
+    return *info_;
+  }
+
   bool HasInputs(const std::string& name) const;
   //! Get a input with argument's name described in `op_proto`
   std::string Input(const std::string& name) const;
@@ -194,6 +199,10 @@ class OperatorBase {
   // IG (Inputs Gradients)
   VariableNameMap outputs_;
   AttributeMap attrs_;
+
+  // OpInfo
+  const OpInfo* info_;
+
   // Whether this operator executes in an Executor.
   bool run_by_executor_{true};
@@ -444,7 +453,7 @@ class OperatorWithKernel : public OperatorBase {
   }
 
   virtual void InferShape(InferShapeContext* ctx) const {
-    OpInfoMap::Instance().Get(Type()).infer_shape_(ctx);
+    Info().infer_shape_(ctx);
   }
 
   void RuntimeInferShape(const Scope& scope, const platform::Place& place,
......
--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -29,15 +29,6 @@ DEFINE_bool(
     "Delete local scope eagerly. It will reduce GPU memory usage but "
     "slow down the destruction of variables.(around 1% performance harm)");
 
-DEFINE_double(
-    eager_delete_tensor_gb, -1.0,
-    "Memory size threshold (GB) when the garbage collector clear tensors."
-    "Disabled when this value is less than 0");
-
-DEFINE_bool(fast_eager_deletion_mode, true,
-            "Fast eager deletion mode. If enabled, memory would release "
-            "immediately without waiting GPU kernel ends.");
-
 // When in inference scenario, the scopes will not be written by two threads in
 // a mean time, but a scope may be read by multiple threads concurrently, and
 // the mutex will cause serious performance issue.
@@ -57,15 +48,6 @@ DEFINE_bool(fast_eager_deletion_mode, true,
 namespace paddle {
 namespace framework {
 
-int64_t GetEagerDeletionThreshold() {
-  return FLAGS_eager_delete_tensor_gb < 0
-             ? -1
-             : static_cast<int64_t>(FLAGS_eager_delete_tensor_gb *
-                                    (static_cast<int64_t>(1) << 30));
-}
-
-bool IsFastEagerDeletionModeEnabled() { return FLAGS_fast_eager_deletion_mode; }
-
 Scope::~Scope() { DropKids(); }
 
 Scope& Scope::NewScope() const {
......
...@@ -32,9 +32,6 @@ extern "C" { ...@@ -32,9 +32,6 @@ extern "C" {
namespace paddle { namespace paddle {
namespace framework { namespace framework {
int64_t GetEagerDeletionThreshold();
bool IsFastEagerDeletionModeEnabled();
class Scope; class Scope;
/** /**
......
--- a/paddle/fluid/framework/type_defs.h
+++ b/paddle/fluid/framework/type_defs.h
@@ -30,6 +30,7 @@ class InferShapeContext;
 class InferVarTypeContext;
 class BlockDesc;
 class Variable;
+class NoNeedBufferVarsInference;
 
 using VariableNameMap = std::map<std::string, std::vector<std::string>>;
 // TODO(panyx0718): Replace vector with something like gtl::Vector.
@@ -61,5 +62,9 @@ using InferShapeFN = std::function<void(InferShapeContext*)>;
 using InplacePair = std::unordered_map<std::string, std::string>;
 using InferInplaceOpFN = std::function<InplacePair(const OpDesc&)>;
 
+using InferNoNeedBufferVarsFN = std::function<std::unordered_set<std::string>(
+    const VariableNameMap& /*inputs*/, const VariableNameMap& /*outputs*/,
+    const AttributeMap& /*attrs*/)>;
+
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/operators/add_position_encoding_op.cc
+++ b/paddle/fluid/operators/add_position_encoding_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/add_position_encoding_op.h"
+#include <memory>
 
 namespace paddle {
 namespace operators {
@@ -39,13 +40,8 @@ class AddPositionEncodingOpGrad : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "X(Input) must not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Out"), "Out must not be null.");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Out@GRAD must not be null.");
-
-    auto out_dims = ctx->GetInputDim("Out");
     if (ctx->HasOutput(framework::GradVarName("X"))) {
+      auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
       ctx->SetOutputDim(framework::GradVarName("X"), out_dims);
     }
   }
@@ -75,6 +71,22 @@ class AddPositionEncodingOpMaker : public framework::OpProtoAndCheckerMaker {
   }
 };
 
+class AddPositionEncodingGradOpDescMaker
+    : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("add_position_encoding_grad");
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetAttrMap(Attrs());
+    return op;
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
@@ -83,7 +95,7 @@ namespace plt = paddle::platform;
 
 REGISTER_OPERATOR(add_position_encoding, ops::AddPositionEncodingOp,
                   ops::AddPositionEncodingOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::AddPositionEncodingGradOpDescMaker);
 REGISTER_OPERATOR(add_position_encoding_grad, ops::AddPositionEncodingOpGrad);
 
 REGISTER_OP_CPU_KERNEL(
......
--- a/paddle/fluid/operators/clip_op.cc
+++ b/paddle/fluid/operators/clip_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/clip_op.h"
+#include <memory>
 
 namespace paddle {
 namespace operators {
@@ -76,12 +77,28 @@ class ClipOpGrad : public framework::OperatorWithKernel {
   }
 };
 
+class ClipGradOpDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("clip_grad");
+    op->SetInput("X", Input("X"));
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetAttrMap(Attrs());
+    return op;
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(clip, ops::ClipOp, ops::ClipOpMaker<float>,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::ClipGradOpDescMaker);
 REGISTER_OPERATOR(clip_grad, ops::ClipOpGrad);
 REGISTER_OP_CPU_KERNEL(
     clip, ops::ClipKernel<paddle::platform::CPUDeviceContext, float>);
......
--- a/paddle/fluid/operators/concat_op.cc
+++ b/paddle/fluid/operators/concat_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/concat_op.h"
+#include <memory>
 #include <string>
 #include <vector>
@@ -120,11 +121,7 @@ Examples:
 class ConcatOpGrad : public framework::OperatorWithKernel {
  public:
-  ConcatOpGrad(const std::string &type,
-               const framework::VariableNameMap &inputs,
-               const framework::VariableNameMap &outputs,
-               const framework::AttributeMap &attrs)
-      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+  using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext *ctx) const override {
     auto in_x = "X";
@@ -142,6 +139,33 @@ class ConcatOpGrad : public framework::OperatorWithKernel {
       }
     }
   }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        ctx.Input<Tensor>(framework::GradVarName("Out"))->type(),
+        ctx.GetPlace());
+  }
+};
+
+DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(ConcatOpGradNoNeedBufferVarInference,
+                                      "X");
+
+class ConcatGradOpDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("concat_grad");
+    op->SetInput("X", Input("X"));
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X", false));
+    op->SetAttrMap(Attrs());
+    return op;
+  }
 };
 
 }  // namespace operators
@@ -149,9 +173,9 @@ class ConcatOpGrad : public framework::OperatorWithKernel {
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(concat, ops::ConcatOp, ops::ConcatOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<
-                      false> /* set false to disable empty grad */);
-REGISTER_OPERATOR(concat_grad, ops::ConcatOpGrad);
+                  ops::ConcatGradOpDescMaker);
+REGISTER_OPERATOR(concat_grad, ops::ConcatOpGrad,
+                  ops::ConcatOpGradNoNeedBufferVarInference);
 REGISTER_OP_CPU_KERNEL(
     concat, ops::ConcatKernel<paddle::platform::CPUDeviceContext, double>,
     ops::ConcatKernel<paddle::platform::CPUDeviceContext, float>,
......
@@ -455,13 +455,13 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
   return type;
 }

-class Conv2dGradMaker : public framework::SingleGradOpDescMaker {
+class Conv2DGradMaker : public framework::SingleGradOpDescMaker {
  public:
   using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

   std::unique_ptr<framework::OpDesc> Apply() const override {
     auto* op = new framework::OpDesc();
-    op->SetType(GradOpType());
+    op->SetType(this->ForwardOpType() + "_grad");
     op->SetInput("Input", Input("Input"));
     op->SetInput("Filter", Input("Filter"));
     op->SetInput("Bias", Input("Bias"));
@@ -470,14 +470,33 @@ class Conv2dGradMaker : public framework::SingleGradOpDescMaker {
     op->SetOutput(framework::GradVarName("Input"), InputGrad("Input"));
     op->SetOutput(framework::GradVarName("Filter"), InputGrad("Filter"));
     op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias"));
     op->SetAttrMap(Attrs());
     return std::unique_ptr<framework::OpDesc>(op);
   }
+};
+
+class Conv3DGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

-  virtual std::string GradOpType() const {
-    return this->ForwardOpType() + "_grad";
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto* op = new framework::OpDesc();
+    op->SetType(this->ForwardOpType() + "_grad");
+    op->SetInput("Input", Input("Input"));
+    op->SetInput("Filter", Input("Filter"));
+    op->SetInput(framework::GradVarName("Output"), OutputGrad("Output"));
+    op->SetOutput(framework::GradVarName("Input"), InputGrad("Input"));
+    op->SetOutput(framework::GradVarName("Filter"), InputGrad("Filter"));
+    if (ForwardOp().Inputs().count("ResidualData") != 0) {
+      op->SetInput("ResidualData", Input("ResidualData"));
+    }
+    op->SetAttrMap(Attrs());
+    return std::unique_ptr<framework::OpDesc>(op);
   }
 };
@@ -486,17 +505,16 @@ class Conv2dGradMaker : public framework::SingleGradOpDescMaker {
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(conv2d, ops::ConvOp, ops::Conv2DOpMaker,
-                  ops::ConvOpInferVarType, ops::Conv2dGradMaker);
+                  ops::ConvOpInferVarType, ops::Conv2DGradMaker);
 REGISTER_OPERATOR(conv2d_grad, ops::ConvOpGrad);

 // depthwise convolution op
 REGISTER_OPERATOR(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker,
-                  ops::ConvOpInferVarType, ops::Conv2dGradMaker);
+                  ops::ConvOpInferVarType, ops::Conv2DGradMaker);
 REGISTER_OPERATOR(depthwise_conv2d_grad, ops::ConvOpGrad);

 REGISTER_OPERATOR(conv3d, ops::ConvOp, ops::Conv3DOpMaker,
-                  ops::ConvOpInferVarType,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::ConvOpInferVarType, ops::Conv3DGradMaker);
 REGISTER_OPERATOR(conv3d_grad, ops::ConvOpGrad);

 // depthwise conv kernel
...
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/crop_op.h"
-#include <boost/lexical_cast.hpp>
+#include <memory>
+#include <string>
+#include <vector>

 namespace paddle {
 namespace operators {
@@ -178,12 +180,31 @@ class CropOpGrad : public framework::OperatorWithKernel {
   }
 };

+class CropGradOpDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("crop_grad");
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetInput("X", Input("X"));
+    if (ForwardOp().Inputs().count("Offsets") > 0) {
+      op->SetInput("Offsets", Input("Offsets"));
+    }
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetAttrMap(Attrs());
+    return op;
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle

 namespace ops = paddle::operators;
 REGISTER_OPERATOR(crop, ops::CropOp, ops::CropOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::CropGradOpDescMaker);
 REGISTER_OPERATOR(crop_grad, ops::CropOpGrad);
 REGISTER_OP_CPU_KERNEL(
     crop, ops::CropKernel<paddle::platform::CPUDeviceContext, float>);
...
@@ -238,6 +238,23 @@ class CrossEntropyGradientOp : public CrossEntropyGradientOpBase {
   }
 };

+class CrossEntropyGradOpDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("cross_entropy_grad");
+    op->SetInput("X", Input("X"));
+    op->SetInput("Label", Input("Label"));
+    op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetAttrMap(Attrs());
+    return op;
+  }
+};
+
 class CrossEntropyOp2 : public CrossEntropyOpBase {
  public:
   using CrossEntropyOpBase::CrossEntropyOpBase;
@@ -354,7 +371,7 @@ using CPUCtx = paddle::platform::CPUDeviceContext;
 REGISTER_OPERATOR(cross_entropy, ops::CrossEntropyOpBase,
                   ops::CrossEntropyOpMaker, ops::CrossEntropyOpInferVarType,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::CrossEntropyGradOpDescMaker);
 REGISTER_OPERATOR(cross_entropy_grad, ops::CrossEntropyGradientOp);
 REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel<CPUCtx, float>,
                        ops::CrossEntropyOpKernel<CPUCtx, double>);
...
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include <memory>
 #include <string>

 #include "paddle/fluid/framework/op_registry.h"
@@ -170,11 +171,6 @@ class CudnnLSTMGradOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(ctx->HasInput("Input"),
                    "Input(Input) of LSTM should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("W"), "Input(W) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("last_h"),
-                   "Input(last_h) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("last_c"),
-                   "Input(last_c) of LSTM should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Cache"),
                    "Input(last_c) of LSTM should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("InitH"),
@@ -197,6 +193,35 @@ class CudnnLSTMGradOp : public framework::OperatorWithKernel {
   }
 };

+class CudnnLSTMGradOpDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("cudnn_lstm_grad");
+    op->SetInput("Input", Input("Input"));
+    op->SetInput("InitH", Input("InitH"));
+    op->SetInput("InitC", Input("InitC"));
+    op->SetInput("W", Input("W"));
+    if (ForwardOp().Inputs().count("Cache") > 0) {
+      op->SetInput("Cache", Input("Cache"));
+    }
+    op->SetInput("Out", Output("Out"));
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetInput(framework::GradVarName("last_c"), OutputGrad("last_c"));
+    op->SetInput(framework::GradVarName("last_h"), OutputGrad("last_h"));
+    op->SetOutput(framework::GradVarName("Input"), InputGrad("Input"));
+    op->SetOutput(framework::GradVarName("W"), InputGrad("W"));
+    op->SetOutput(framework::GradVarName("InitH"), InputGrad("InitH"));
+    op->SetOutput(framework::GradVarName("InitC"), InputGrad("InitC"));
+    op->SetAttrMap(Attrs());
+    return op;
+  }
+};
+
 template <typename T>
 class NotImpleKernel : public framework::OpKernel<T> {
  public:
@@ -211,7 +236,7 @@ class NotImpleKernel : public framework::OpKernel<T> {
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(cudnn_lstm, ops::CudnnLSTMOp, ops::CudnnLSTMOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::CudnnLSTMGradOpDescMaker);
 REGISTER_OPERATOR(cudnn_lstm_grad, ops::CudnnLSTMGradOp);

 REGISTER_OP_CPU_KERNEL(cudnn_lstm, ops::NotImpleKernel<float>);
...
@@ -14,6 +14,7 @@

 #include <set>
 #include <string>
+#include <unordered_map>
 #include <vector>

 #include "paddle/fluid/operators/distributed/parameter_prefetch.h"
@@ -218,7 +219,7 @@ void prefetch(const std::string& id_name, const std::string& out_name,
         boost::get<platform::CUDAPlace>(id_tensor.place()),
         id_tensor.data<int64_t>(), sizeof(int64_t) * id_tensor.numel(),
         stream);
-    for (size_t i = 0; i < cpu_tensor.numel(); ++i) {
+    for (int64_t i = 0; i < cpu_tensor.numel(); ++i) {
       ids_vector.push_back(cpu_tensor_data[i]);
     }
 #endif
...
@@ -16,8 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/elementwise/elementwise_op.h"

 namespace ops = paddle::operators;
 REGISTER_ELEMWISE_GRAD_MAKER(elementwise_add, Add);
-REGISTER_ELEMWISE_EXPLICIT_OP(elementwise_add, "Add", "Out = X + Y", "Out",
-                              "X");
+REGISTER_ELEMWISE_EXPLICIT_OP(elementwise_add, "Add", "Out = X + Y");

 REGISTER_OP_CPU_KERNEL(
     elementwise_add,
...
@@ -272,12 +272,11 @@ class ElementwiseGradOpInplace : public framework::InplaceOpInference {
   }
 };

+DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(ElementwiseGradNoBufVarsInference, "Y");
+
 }  // namespace operators
 }  // namespace paddle

-/*
-*/
 #define REGISTER_ELEMWISE_GRAD_MAKER(kernel_type, op_name)                    \
   class kernel_type##GradMaker                                                \
       : public paddle::framework::SingleGradOpDescMaker {                     \
@@ -311,18 +310,19 @@ class ElementwiseGradOpInplace : public framework::InplaceOpInference {
                     ::paddle::framework::DefaultGradOpDescMaker<true>);       \
   REGISTER_OPERATOR(op_type##_grad, ::paddle::operators::ElementwiseOpGrad)

-#define REGISTER_ELEMWISE_EXPLICIT_OP(op_type, op_name, equation, ...)        \
+#define REGISTER_ELEMWISE_EXPLICIT_OP(op_type, op_name, equation)             \
   class __ElemwiseOp##op_type##Maker__                                        \
       : public ::paddle::operators::ElementwiseOpMaker {                      \
    protected:                                                                 \
     virtual std::string GetName() const { return op_name; }                   \
     virtual std::string GetEquation() const { return equation; }              \
   };                                                                          \
   REGISTER_OPERATOR(op_type, ::paddle::operators::ElementwiseOp,              \
                     __ElemwiseOp##op_type##Maker__,                           \
                     ::paddle::operators::ElementwiseOpInferVarType,           \
                     op_type##GradMaker,                                       \
                     ::paddle::operators::ElementwiseOpInplace);               \
   REGISTER_OPERATOR(op_type##_grad,                                           \
                     ::paddle::operators::ElementwiseOpExplicitGrad,           \
-                    ::paddle::operators::ElementwiseGradOpInplace)
+                    ::paddle::operators::ElementwiseGradOpInplace,            \
+                    ::paddle::operators::ElementwiseGradNoBufVarsInference)
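
The ElementwiseGradNoBufVarsInference declaration above is the second half of the mechanism: DECLARE_NO_NEED_BUFFER_VARS_INFERENCE marks variables whose metadata (dims, LoD) the grad op still needs but whose underlying buffer it never dereferences, so eager GC may free that memory right after the forward op runs. A minimal sketch of how such a declaration is attached at registration time, again for the hypothetical my_op from the earlier sketch (the concat, gather, and lod_reset hunks in this commit do the same for their "X"):

// Sketch, hypothetical op: "X" stays an input of my_op_grad because its dims
// are needed to size the output gradient, but its buffer is declared
// unnecessary, so X's memory can be collected as soon as the forward op is
// done with it.
DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(MyOpGradNoNeedBufferVarInference, "X");

REGISTER_OPERATOR(my_op, ops::MyOp, ops::MyOpMaker, ops::MyOpGradDescMaker);
REGISTER_OPERATOR(my_op_grad, ops::MyOpGrad,
                  ops::MyOpGradNoNeedBufferVarInference);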
@@ -16,8 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/elementwise/elementwise_op.h"

 namespace ops = paddle::operators;
 REGISTER_ELEMWISE_GRAD_MAKER(elementwise_sub, Sub);
-REGISTER_ELEMWISE_EXPLICIT_OP(elementwise_sub, "Sub", "Out = X - Y", "Out",
-                              "X");
+REGISTER_ELEMWISE_EXPLICIT_OP(elementwise_sub, "Sub", "Out = X - Y");

 REGISTER_OP_CPU_KERNEL(
     elementwise_sub,
...
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/gather_op.h"
+#include <memory>
+#include <string>
+#include <vector>
 #include "paddle/fluid/framework/ddim.h"

 namespace paddle {
@@ -59,8 +62,9 @@ class GatherGradOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
-                                   ctx.device_context());
+    return framework::OpKernelType(
+        ctx.Input<Tensor>(framework::GradVarName("Out"))->type(),
+        ctx.device_context());
   }
 };
@@ -94,13 +98,34 @@ Out = [[3, 4],
 )DOC");
   }
 };

+class GatherGradOpDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("gather_grad");
+    op->SetInput("Index", Input("Index"));
+    op->SetInput("X", Input("X"));
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetAttrMap(Attrs());
+    return op;
+  }
+};
+
+DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(GatherGradNoNeedBufferVarInference, "X");
+
 }  // namespace operators
 }  // namespace paddle

 namespace ops = paddle::operators;
 REGISTER_OPERATOR(gather, ops::GatherOp, ops::GatherOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(gather_grad, ops::GatherGradOp);
+                  ops::GatherGradOpDescMaker);
+REGISTER_OPERATOR(gather_grad, ops::GatherGradOp,
+                  ops::GatherGradNoNeedBufferVarInference);
 REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel<float>,
                        ops::GatherOpKernel<double>, ops::GatherOpKernel<int>,
                        ops::GatherOpKernel<uint8_t>,
...
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/lod_reset_op.h"
+#include <memory>

 namespace paddle {
 namespace operators {
@@ -146,18 +147,39 @@ class LoDResetGradOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
-                                   ctx.device_context());
+    return framework::OpKernelType(
+        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"))->type(),
+        ctx.device_context());
   }
 };

+class LoDResetGradDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("lod_reset_grad");
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetInput("X", Input("X"));
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetAttrMap(Attrs());
+    return op;
+  }
+};
+
+DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(LoDResetGradNoNeedBufferVarInference,
+                                      "X");
+
 }  // namespace operators
 }  // namespace paddle

 namespace ops = paddle::operators;
 REGISTER_OPERATOR(lod_reset, ops::LoDResetOp, ops::LoDResetOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(lod_reset_grad, ops::LoDResetGradOp);
+                  ops::LoDResetGradDescMaker);
+REGISTER_OPERATOR(lod_reset_grad, ops::LoDResetGradOp,
+                  ops::LoDResetGradNoNeedBufferVarInference);
 REGISTER_OP_CPU_KERNEL(
     lod_reset, ops::LoDResetKernel<paddle::platform::CPUPlace, float>,
     ops::LoDResetKernel<paddle::platform::CPUPlace, double>,
...
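A detail worth noting in the gather_op and lod_reset_op hunks above: the grad op's GetExpectedKernelType now derives its data type from GradVarName("Out") rather than from "X". This is forced by the no-need-buffer declarations: once "X" may have had its buffer collected in advance, calling type() on it is no longer safe, whereas the incoming output gradient is always a live, initialized input of the grad op. Condensed, the changed override looks like the sketch below, as it would appear inside any grad op class whose "X" is declared no-need-buffer:

framework::OpKernelType GetExpectedKernelType(
    const framework::ExecutionContext& ctx) const override {
  // dOut is guaranteed to hold memory when the grad op runs; "X" is not.
  return framework::OpKernelType(
      ctx.Input<framework::Tensor>(framework::GradVarName("Out"))->type(),
      ctx.device_context());
}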
@@ -21,6 +21,7 @@
 #include <cstdlib>
 #include <fstream>
 #include <iostream>
+#include <memory>
 #include <sstream>
 #include <string>
 #include <unordered_map>
@@ -152,7 +153,7 @@ class CTRReader : public framework::FileReader {
     queue_->ReOpen();
     VLOG(3) << "reopen success";
     VLOG(3) << "thread_num " << thread_num_;
-    for (int thread_id = 0; thread_id < thread_num_; thread_id++) {
+    for (size_t thread_id = 0; thread_id < thread_num_; thread_id++) {
       read_threads_.emplace_back(new std::thread(std::bind(
           &ReadThread, file_groups_[thread_id], data_desc_,
           static_cast<int>(thread_id), &read_thread_status_, queue_)));
...
@@ -24,6 +24,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/garbage_collector.h"
 #include "paddle/fluid/framework/ir/pass_builder.h"
 #include "paddle/fluid/framework/lod_rank_table.h"
 #include "paddle/fluid/framework/lod_tensor.h"
@@ -139,6 +140,7 @@ PYBIND11_MODULE(core, m) {
   paddle::platform::CpuTotalPhysicalMemory();
   paddle::memory::allocation::UseAllocatorStrategyGFlag();

   m.doc() = "C++ core of PaddlePaddle";

   // using framework in this function. Since it is inside a function, it will
@@ -153,6 +155,11 @@ PYBIND11_MODULE(core, m) {
     return paddle::operators::AppendPythonCallableObjectAndReturnId(py_obj);
   });

+  // NOTE(zjl): ctest would load environment variables at the beginning even
+  // though we have not `import paddle.fluid as fluid`. So we add this API
+  // to enable eager deletion mode in unittest.
+  m.def("_set_eager_deletion_mode", &paddle::framework::SetEagerDeletionMode);
+
   m.add_object("_cleanup",
                py::capsule([]() { ScopePool::Instance().Clear(); }));
@@ -281,6 +288,8 @@ PYBIND11_MODULE(core, m) {
   py::class_<Tensor>(m, "Tensor", py::buffer_protocol())
       .def_buffer(
           [](Tensor &self) -> py::buffer_info { return CastToPyBuffer(self); })
+      .def("_is_initialized",
+           [](const Tensor &self) { return self.IsInitialized(); })
       .def("_get_dims",
           [](const Tensor &self) { return vectorize(self.dims()); })
       .def("_set_dims",
@@ -681,7 +690,8 @@ All parameter, weight, gradient are variables in Paddle.
       .def("drop_kids", &Scope::DropKids,
            R"DOC(
            Delete all sub-scopes of the current scope.
-           )DOC");
+           )DOC")
+      .def("_kids", &Scope::kids);

   m.def("Scope",
         []() -> Scope * {
...
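The diff only shows the binding, not the body of paddle::framework::SetEagerDeletionMode. Judging from the call sites in the tests below (e.g. fluid.core._set_eager_deletion_mode(0.0, 1.0, True)) and from the three environment variables those tests stop setting, a plausible reconstruction is the sketch below; the parameter names are assumptions, not taken from this commit:

#include "gflags/gflags.h"

DECLARE_double(eager_delete_tensor_gb);
DECLARE_double(memory_fraction_of_eager_deletion);
DECLARE_bool(fast_eager_deletion_mode);

namespace paddle {
namespace framework {

// Presumed behavior: forward the three Python-visible knobs to the gflags
// that the unittests previously set through os.environ, so the mode can be
// switched after the process (and ctest's environment) has already started.
void SetEagerDeletionMode(double threshold_gb, double fraction,
                          bool fast_mode) {
  FLAGS_eager_delete_tensor_gb = threshold_gb;
  FLAGS_memory_fraction_of_eager_deletion = fraction;
  FLAGS_fast_eager_deletion_mode = fast_mode;
}

}  // namespace framework
}  // namespace paddle

On this reading, the p_gc_transformer test's call _set_eager_deletion_mode(0.0, 0.55, True) appears to ask the collector to consider only 55% of the eligible variables, exercising the partial-GC path this PR adds.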
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy as np
os.environ['FLAGS_use_ngraph'] = '0'
os.environ['FLAGS_use_mkldnn'] = '0'
os.environ['CPU_NUM'] = '4'
import paddle.fluid as fluid
import six
import unittest
import multiprocessing
fluid.core._set_eager_deletion_mode(0.0, 1.0, True)
def simple_fc_net():
image = fluid.layers.data(name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
hidden = image
for _ in range(4):
hidden = fluid.layers.fc(
hidden,
size=200,
act='tanh',
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=1.0)))
prediction = fluid.layers.fc(hidden, size=10, act='softmax')
loss = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.mean(loss)
optimizer = fluid.optimizer.Adam(learning_rate=1e-3)
optimizer.minimize(loss)
return image, label, loss

def get_persistables_and_non_persistables(prog, fetch_list):
    num_block = prog.num_blocks
    persistables = set()
    non_persistables = set()
    for bid in six.moves.range(num_block):
        block = prog.block(bid)
        for _, var in block.vars.items():
            if var.persistable or var.name in fetch_list:
                persistables.add(var.name)
            else:
                non_persistables.add(var.name)
    return persistables, non_persistables

class TestExecutor(unittest.TestCase):
def test_executor_main(self):
places = [fluid.CPUPlace()]
if fluid.core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0))
for p in places:
self.place = p
with fluid.program_guard(fluid.Program(), fluid.Program()):
with fluid.scope_guard(fluid.Scope()):
with fluid.unique_name.guard():
self.executor_main()
for p in places:
self.place = p
with fluid.program_guard(fluid.Program(), fluid.Program()):
with fluid.scope_guard(fluid.Scope()):
with fluid.unique_name.guard():
self.pe_main()
def prepare_feed(self, image, label, dev_cnt=1):
batch_size = 32 * dev_cnt
image_shape = (batch_size, ) + tuple(image.shape[1:])
label_shape = (batch_size, ) + tuple(label.shape[1:])
image_np = np.random.random(size=image_shape).astype('float32')
label_np = np.random.random_integers(
low=0, high=9, size=label_shape).astype('int64')
return image_np, label_np
    def assertScopeVar(self, scope, persistables, non_persistables):
        outline_p_vars = []
        for name in persistables:
            var = scope.find_var(name)
            self.assertTrue(var is not None)
            t = var.get_tensor()
            if not t._is_initialized():
                outline_p_vars.append(name)

        outline_np_vars = []
        for name in non_persistables:
            var = scope.find_var(name)
            self.assertTrue(var is not None)
            t = var.get_tensor()
            if t._is_initialized():
                outline_np_vars.append(name)

        print('Non-alive persistable vars {} in {}'.format(outline_p_vars,
                                                           persistables))
        print('Alive non-persistable vars {} in {}'.format(outline_np_vars,
                                                           non_persistables))
        self.assertEqual(len(outline_p_vars), 0)
        self.assertEqual(len(outline_np_vars), 0)
def executor_main(self):
image, label, loss = simple_fc_net()
loss.persistable = False
persistables, non_persistables = get_persistables_and_non_persistables(
fluid.default_main_program(), [loss.name])
print('Non-persistable var number {}'.format(len(non_persistables)))
print(non_persistables)
exe = fluid.Executor(self.place)
exe.run(fluid.default_startup_program())
p = fluid.core.Place()
p.set_place(self.place)
exe = fluid.core.Executor(p)
for _ in six.moves.range(10):
image_np, label_np = self.prepare_feed(image, label)
fluid.global_scope().var(image.name).get_tensor().set(image_np,
self.place)
fluid.global_scope().var(label.name).get_tensor().set(label_np,
self.place)
# exe.run would not create local scope
# so that we can detect whether gc clears temporary variables
exe.run(fluid.default_main_program().desc,
fluid.global_scope(), 0, False, True, [loss.name])
self.assertScopeVar(fluid.global_scope(), persistables,
non_persistables)
def pe_main(self):
image, label, loss = simple_fc_net()
loss.persistable = False
        persistables, non_persistables = get_persistables_and_non_persistables(
            fluid.default_main_program(), [loss.name])
exe = fluid.Executor(self.place)
exe.run(fluid.default_startup_program())
exec_strategy = fluid.ExecutionStrategy()
exec_strategy.num_iteration_per_drop_scope = 100
build_strategy = fluid.BuildStrategy()
build_strategy.memory_optimize = False
build_strategy.enable_inplace = False
prog = fluid.CompiledProgram(fluid.default_main_program(
)).with_data_parallel(
loss_name=loss.name, exec_strategy=exec_strategy)
dev_cnt = fluid.core.get_cuda_device_count() if isinstance(self.place, fluid.CUDAPlace) \
else int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
for idx in six.moves.range(10):
image_np, label_np = self.prepare_feed(image, label, dev_cnt)
feed = {image.name: image_np, label.name: label_np}
exe.run(program=prog, feed=feed, fetch_list=[loss])
local_scopes = prog._local_scopes
for scope in local_scopes:
kids = scope._kids()
self.assertTrue(len(kids) == 1)
self.assertScopeVar(kids[0], persistables, non_persistables)
if __name__ == '__main__':
unittest.main()
@@ -13,7 +13,6 @@
 # limitations under the License.

 import os
-os.environ['FLAGS_eager_delete_tensor_gb'] = '0.0'
 os.environ['CPU_NUM'] = '2'

 import six
...
@@ -16,6 +16,8 @@ import unittest
 from test_eager_deletion_dynamic_rnn_base import TestBase
 import paddle.fluid as fluid

+fluid.core._set_eager_deletion_mode(0.0, 1.0, True)
+
 def gru_net(data,
             label,
...
@@ -16,6 +16,8 @@ from test_eager_deletion_dynamic_rnn_base import TestBase
 import paddle.fluid as fluid
 import unittest

+fluid.core._set_eager_deletion_mode(0.0, 1.0, True)
+
 def lstm_net(data,
              label,
...
@@ -14,7 +14,9 @@
 import os
 import unittest

-os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0"
+import paddle.fluid as fluid
+
+fluid.core._set_eager_deletion_mode(0.0, 1.0, True)

 # FIXME(zjl): It seems that this unittest fails randomly
 # when comparing all reduce last loss and reduce last loss
...
@@ -14,7 +14,9 @@
 import os
 import unittest

-os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0"
+import paddle.fluid as fluid
+
+fluid.core._set_eager_deletion_mode(0.0, 1.0, True)

 os.environ['RECORDIO_FILENAME'] = './eager_deletion_transformer.wmt16.recordio'
...
@@ -16,8 +16,6 @@ from __future__ import print_function
 import os
 os.environ['CPU_NUM'] = '2'
-os.environ['FLAGS_eager_delete_tensor_gb'] = '0.0'
-os.environ['FLAGS_fast_eager_deletion_mode'] = '1'

 import unittest
 import paddle.fluid as fluid
@@ -29,6 +27,8 @@ import paddle.fluid.compiler as compiler
 import numpy
 import multiprocessing

+fluid.core._set_eager_deletion_mode(0.0, 1.0, True)
+
 class TestEagerDeletionWhileOpBase(unittest.TestCase):
     def test_main(self):
...
@@ -14,11 +14,12 @@
 import os
 import unittest

-os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0"
-os.environ['FLAGS_memory_fraction_of_eager_deletion'] = "0.55"
+import paddle.fluid as fluid

 os.environ['RECORDIO_FILENAME'] = './p_gc_transformer.wmt16.recordio'

+fluid.core._set_eager_deletion_mode(0.0, 0.55, True)
+
 from test_parallel_executor_transformer import TestTransformer

 if __name__ == '__main__':
...
@@ -168,3 +168,7 @@ class TestROIAlignOp(OpTest):
     def test_check_grad(self):
         self.check_grad(['X'], 'Out')
+
+
+if __name__ == '__main__':
+    unittest.main()