Run_program_op add scope cache & reuse (#45813) (#46223)

* add scope cache & reuse * add gc scope for end of each train step * del scope reuse for jit * refine code * test

Run_program_op add scope cache & reuse (#45813) (#46223)
* add scope cache & reuse * add gc scope for end of each train step * del scope reuse for jit * refine code * test
4f28a4c2 · zhangbo9674 · GitHub · c0324e82 · 4f28a4c2 · 4f28a4c2
4 changed files
--- a/paddle/fluid/eager/to_static/run_program_op_node.h
+++ b/paddle/fluid/eager/to_static/run_program_op_node.h
@@ -246,6 +246,34 @@ static void BuildScopeByBlock(
  }
 }
+// Drops the memory holders of every local variable in `scope` without
+// removing the variables themselves.
+//
+// For each non-null local variable:
+//   - LoDTensor:      its memory holder is moved out.
+//   - SelectedRows:   the memory holder of its value tensor is moved out.
+//   - LoDTensorArray: the holder of every element is moved out, then the
+//                     array is cleared.
+// NOTE(review): presumably MoveMemoryHolder() leaves the tensor without an
+// allocation -- confirm against the tensor implementation.
+//
+// `scope` must not be null; it is dereferenced unconditionally.
+static void GcScope(paddle::framework::Scope *scope) {
+  // The moved-out holders are parked in a function-local container, so they
+  // are all released when it goes out of scope on return -- including on an
+  // early exit by exception, where a manually new'ed/delete'd container
+  // would leak.
+  std::deque<std::shared_ptr<paddle::memory::Allocation>> garbages;
+  for (auto &var : scope->LocalVars()) {
+    if (var == nullptr) {
+      continue;
+    }
+    if (var->IsType<paddle::framework::LoDTensor>()) {
+      garbages.emplace_back(var->GetMutable<paddle::framework::LoDTensor>()
+                                ->MoveMemoryHolder());
+    }
+    if (var->IsType<phi::SelectedRows>()) {
+      garbages.emplace_back(var->GetMutable<phi::SelectedRows>()
+                                ->mutable_value()
+                                ->MoveMemoryHolder());
+    }
+    if (var->IsType<paddle::framework::LoDTensorArray>()) {
+      auto *lod_tensor_arr =
+          var->GetMutable<paddle::framework::LoDTensorArray>();
+      for (auto &t : *lod_tensor_arr) {
+        garbages.emplace_back(t.MoveMemoryHolder());
+      }
+      lod_tensor_arr->clear();
+    }
+  }
+  // `garbages` is destroyed here, dropping the collected holders.
+}
 }  // namespace details
 inline void RunProgramAPI(
@@ -274,16 +302,6 @@ inline void RunProgramAPI(
      1,
      paddle::platform::errors::InvalidArgument(
          "The OutScope of RunProgramGradOp should only hold one scope."));
-  // Step 2. prepare executor and init persistable variables
-  // NOTE(Aurelius84): While training some models, forward can be called many
-  // times and then apply backpropagation all at once, such as Reinforcement
-  // Learning. Tensor data in multi-step training should be saved into single
-  // scope separately. Otherwise, the gradients can be miscalculated because
-  // always using the Tensor data of the last step in forward.
-  paddle::framework::Scope *global_inner_scope = out_scope_vec->front();
-  VLOG(2) << "The number of sub scopes before forward: "
-          << out_scope_vec->front()->kids().size();
-  paddle::framework::Scope &scope = global_inner_scope->NewScope();
  bool use_interpretorcore =
      PADDLE_GET_CONST(bool, attrs.at("use_interpretorcore"));
@@ -291,6 +309,8 @@ inline void RunProgramAPI(
  if (use_interpretorcore) {
    VLOG(0) << "RunProgramOp use interpretercore to execute program.";
+    paddle::framework::Scope *global_inner_scope = out_scope_vec->front();
    auto input_names = details::GetTensorsName(x);
    auto output_names = details::GetTensorsName(out);
    auto dout_names = details::GetTensorsName(dout);
@@ -308,12 +328,16 @@ inline void RunProgramAPI(
    if (!interpretercore_info_cache.Has(program_id, /*is_grad=*/false)) {
      VLOG(2) << "No interpretercore cahce, so create a new interpretercore";
      // Step 1. share input_vars & parameters into scope
-      details::ShareTensorsIntoScope(x, &scope);
+      details::ShareTensorsIntoScope(x, global_inner_scope);
-      details::ShareTensorsIntoScope(params, &scope);
+      details::ShareTensorsIntoScope(params, global_inner_scope);
      // Step 2. create new interpretercore
      auto interpreter_core =
          paddle::framework::CreateInterpreterCoreInfoToCache(
-              *forward_program, place, /*is_grad=*/false, program_id, &scope);
+              *forward_program,
+              place,
+              /*is_grad=*/false,
+              program_id,
+              global_inner_scope);
      // Step 3. get all eager gc vars
      std::set<std::string> skip_eager_delete_vars =
          paddle::framework::details::ParseSafeEagerDeletionSkipVarsSet(
@@ -331,10 +355,14 @@ inline void RunProgramAPI(
        interpreter_core->Run({});
      }
      // Step 5. Get Output
-      details::ShareTensorsFromScopeWithPartialBlock(
+      details::ShareTensorsFromScopeWithPartialBlock(out,
-          out, *forward_global_block, *backward_global_block, &scope);
+                                                     *forward_global_block,
-      details::ShareTensorsFromScopeWithPartialBlock(
+                                                     *backward_global_block,
-          dout, *forward_global_block, *backward_global_block, &scope);
+                                                     global_inner_scope);
+      details::ShareTensorsFromScopeWithPartialBlock(dout,
+                                                     *forward_global_block,
+                                                     *backward_global_block,
+                                                     global_inner_scope);
    } else {
      VLOG(2) << "Get interpretercore cahce by program:" << program_id;
      // Step 1. get cache interpretercore
@@ -342,34 +370,55 @@ inline void RunProgramAPI(
          interpretercore_info_cache.GetMutable(program_id, /*is_grad=*/false);
      auto &interpreter_core = cached_value.core_;
      // Step 2. update scope for cache interpretercore
-      details::ShareTensorsIntoScope(x, &scope);
+      details::ShareTensorsIntoScope(x, global_inner_scope);
-      details::ShareTensorsIntoScope(params, &scope);
+      details::ShareTensorsIntoScope(params, global_inner_scope);
-      details::BuildScopeByBlock(
+      if (interpreter_core->GetVariableScope()->GetMutableScope() !=
-          *interpreter_core.get(), *forward_global_block, &scope);
+          global_inner_scope) {
-      interpreter_core->reset_scope(&scope);
+        details::BuildScopeByBlock(
+            *interpreter_core.get(), *forward_global_block, global_inner_scope);
+        interpreter_core->reset_scope(global_inner_scope);
+      }
      // Step 3. interpretercore run
      if (forward_global_block->OpSize() > 0) {
        interpreter_core->Run({});
      }
      // Step 4. Get Output
-      details::ShareTensorsFromScopeWithPartialBlock(
+      details::ShareTensorsFromScopeWithPartialBlock(out,
-          out, *forward_global_block, *backward_global_block, &scope);
+                                                     *forward_global_block,
-      details::ShareTensorsFromScopeWithPartialBlock(
+                                                     *backward_global_block,
-          dout, *forward_global_block, *backward_global_block, &scope);
+                                                     global_inner_scope);
+      details::ShareTensorsFromScopeWithPartialBlock(dout,
+                                                     *forward_global_block,
+                                                     *backward_global_block,
+                                                     global_inner_scope);
    }
    VLOG(3) << paddle::framework::GenScopeTreeDebugInfo(out_scope_vec->front());
    if (is_test) {
-      VLOG(1) << "is test, after forward, drop kids";
+      VLOG(4) << "is test, set this scope can reused";
-      out_scope_vec->front()->DropKids();
+      global_inner_scope->SetCanReuesd(true);
+      details::GcScope(global_inner_scope);
+    } else {
+      VLOG(4) << "not test, set this scope can not reused";
+      global_inner_scope->SetCanReuesd(false);
    }
-    VLOG(2) << "The number of sub scopes after forward: "
-            << out_scope_vec->front()->kids().size();
 #ifdef PADDLE_WITH_MKLDNN
    if (FLAGS_use_mkldnn) paddle::platform::DontClearMKLDNNCache(place);
 #endif
  } else {
    VLOG(2) << "RunProgramOp execute with parallel_executor.";
+    // Step 2. prepare executor and init persistable variables
+    // NOTE(Aurelius84): While training some models, forward can be called many
+    // times and then apply backpropagation all at once, such as Reinforcement
+    // Learning. Tensor data in multi-step training should be saved into single
+    // scope separately. Otherwise, the gradients can be miscalculated because
+    // always using the Tensor data of the last step in forward.
+    paddle::framework::Scope *global_inner_scope = out_scope_vec->front();
+    VLOG(2) << "The number of sub scopes before forward: "
+            << out_scope_vec->front()->kids().size();
+    paddle::framework::Scope &scope = global_inner_scope->NewScope();
    // share input_vars & parameters into scope
    details::ShareTensorsIntoScope(x, &scope);
    details::ShareTensorsIntoScope(params, &scope);
@@ -454,21 +503,14 @@ inline void RunProgramGradAPI(
      1,
      paddle::platform::errors::InvalidArgument(
          "The OutScope of RunProgramGradOp should only hold one scope."));
-  paddle::framework::Scope *global_inner_scope = out_scope_vec->front();
-  auto sub_scope_num = global_inner_scope->kids().size();
-  VLOG(2) << "The number of sub scopes before backward: " << sub_scope_num;
-  PADDLE_ENFORCE_GT(sub_scope_num,
-                    0,
-                    paddle::platform::errors::InvalidArgument(
-                        "The OutScope of RunProgramGradOp should hold at "
-                        "least one sub scope."));
-  auto &scope = *(global_inner_scope->kids().front());
  auto place = egr::Controller::Instance().GetExpectedPlace();
  if (use_interpretorcore) {
    VLOG(0) << "RunProgramGradOp use interpretercore to execute program.";
+    paddle::framework::Scope *global_inner_scope = out_scope_vec->front();
    auto *forward_global_block = PADDLE_GET_CONST(
        paddle::framework::BlockDesc *, attrs.at("forward_global_block"));
    auto *backward_global_block = PADDLE_GET_CONST(
@@ -490,10 +532,14 @@ inline void RunProgramGradAPI(
        paddle::framework::InterpreterCoreInfoCache::Instance();
    if (!interpretercore_info_cache.Has(program_id, /*is_grad=*/true)) {
      VLOG(2) << "No interpretercore cahce, so create a new interpretercore";
-      details::ShareTensorsIntoScope(out_grad, &scope);
+      details::ShareTensorsIntoScope(out_grad, global_inner_scope);
      auto interpreter_core =
          paddle::framework::CreateInterpreterCoreInfoToCache(
-              *backward_program, place, /*is_grad=*/true, program_id, &scope);
+              *backward_program,
+              place,
+              /*is_grad=*/true,
+              program_id,
+              global_inner_scope);
      // get all eager gc vars
      std::set<std::string> skip_eager_delete_vars;
@@ -518,10 +564,14 @@ inline void RunProgramGradAPI(
          interpretercore_info_cache.GetMutable(program_id, /*is_grad=*/true);
      auto &interpreter_core = cached_value.core_;
      // update scope
-      details::ShareTensorsIntoScope(out_grad, &scope);
+      details::ShareTensorsIntoScope(out_grad, global_inner_scope);
-      details::BuildScopeByBlock(
+      if (interpreter_core->GetVariableScope()->GetMutableScope() !=
-          *interpreter_core.get(), *backward_global_block, &scope);
+          global_inner_scope) {
-      interpreter_core->reset_scope(&scope);
+        details::BuildScopeByBlock(*interpreter_core.get(),
+                                   *backward_global_block,
+                                   global_inner_scope);
+        interpreter_core->reset_scope(global_inner_scope);
+      }
      if (backward_global_block->OpSize() > 0) {
        // Debug info: scope info when run end
@@ -531,16 +581,31 @@ inline void RunProgramGradAPI(
      }
    }
    // Step 4. get outputs
-    details::ShareTensorsFromScopeWithPartialBlock(
+    details::ShareTensorsFromScopeWithPartialBlock(x_grad,
-        x_grad, *forward_global_block, *backward_global_block, &scope);
+                                                   *forward_global_block,
-    details::ShareTensorsFromScopeWithPartialBlock(
+                                                   *backward_global_block,
-        params_grad, *forward_global_block, *backward_global_block, &scope);
+                                                   global_inner_scope);
+    details::ShareTensorsFromScopeWithPartialBlock(params_grad,
-    // Step5. drop current scope
+                                                   *forward_global_block,
-    global_inner_scope->DeleteScope(&scope);
+                                                   *backward_global_block,
-    VLOG(2) << "The number of sub scopes after backward: "
+                                                   global_inner_scope);
-            << global_inner_scope->kids().size();
+    VLOG(4) << "after backward gc all vars";
+    global_inner_scope->SetCanReuesd(true);
+    details::GcScope(global_inner_scope);
  } else {
+    VLOG(2) << "RunProgramGradOp use pe to execute program.";
+    paddle::framework::Scope *global_inner_scope = out_scope_vec->front();
+    auto sub_scope_num = global_inner_scope->kids().size();
+    VLOG(2) << "The number of sub scopes before backward: " << sub_scope_num;
+    PADDLE_ENFORCE_GT(sub_scope_num,
+                      0,
+                      paddle::platform::errors::InvalidArgument(
+                          "The OutScope of RunProgramGradOp should hold at "
+                          "least one sub scope."));
+    auto &scope = *(global_inner_scope->kids().front());
    auto *global_block = PADDLE_GET_CONST(paddle::framework::BlockDesc *,
                                          attrs.at("global_block"));
    auto orig_end_op_index =

--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@@ -132,6 +132,11 @@ class Scope : public ScopeBase {
  // Rename variable to a new name and return the new name
  std::string Rename(const std::string& origin_name) const;
+  // only for dygraph_to_static
+  bool CanReuesd() const { return can_reused_; }
+  void SetCanReuesd(bool can_reused) { can_reused_ = can_reused; }
 protected:
  struct KeyHasher {
    std::size_t operator()(const std::string& key) const {
@@ -169,6 +174,9 @@ class Scope : public ScopeBase {
  mutable std::list<Scope*> kids_;
  const Scope* parent_{nullptr};
+  // only for dygraph_to_static
+  bool can_reused_{false};
  DISABLE_COPY_AND_ASSIGN(Scope);
 private:

--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -1080,7 +1080,8 @@ All parameter, weight, gradient are variables in Paddle.
           R"DOC(
           Delete all sub-scopes of the current scope.
           )DOC")
-      .def("_kids", &Scope::kids);
+      .def("_kids", &Scope::kids)
+      .def_property("_can_reuesd", &Scope::CanReuesd, &Scope::SetCanReuesd);
  m.def(
      "Scope",

--- a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py
@@ -169,6 +169,25 @@ class PartialProgramLayer:
            custom_white_list=custom_white_list,
            custom_black_list=custom_black_list)
+        # program_id -> list(scope)
+        self._scope_cache = {}
+    def _get_scope(self, program_id=None, use_scope_cache=False):
+        if use_scope_cache:
+            if program_id not in self._scope_cache:
+                scope = core.Scope()
+                self._scope_cache[program_id] = [scope]
+                return scope
+            else:
+                for scope in self._scope_cache[program_id]:
+                    if scope._can_reuesd:
+                        return scope
+                scope = core.Scope()
+                self._scope_cache[program_id].append(scope)
+                return scope
+        else:
+            return core.Scope()
    @LazyInitialized
    def __fake_vars(self):
        return _create_fake_var()
@@ -555,11 +574,19 @@ class PartialProgramLayer:
                ('forward_global_block', self.forward_program.desc.block(0),
                 'backward_global_block', self.backward_program.desc.block(0)))
-        _legacy_C_ops.run_program(self._valid_vars(in_vars),
+            _legacy_C_ops.run_program(
-                                  self._valid_vars(self._params),
+                self._valid_vars(in_vars), self._valid_vars(self._params),
-                                  self._valid_vars(out_vars),
+                self._valid_vars(out_vars),
-                                  self._create_scope_vec(), self._double_grads,
+                self._create_scope_vec(program_id=self.program_id,
-                                  self._cuda_graph_vec, *attrs)
+                                       use_scope_cache=True),
+                self._double_grads, self._cuda_graph_vec, *attrs)
+        else:
+            _legacy_C_ops.run_program(self._valid_vars(in_vars),
+                                      self._valid_vars(self._params),
+                                      self._valid_vars(out_vars),
+                                      self._create_scope_vec(),
+                                      self._double_grads, self._cuda_graph_vec,
+                                      *attrs)
        restored_nest_out = self._restore_out(out_vars)
        return self._remove_no_value(restored_nest_out)
@@ -735,10 +762,11 @@ class PartialProgramLayer:
        return input_vars, out_vars
-    def _create_scope_vec(self):
+    def _create_scope_vec(self, program_id=None, use_scope_cache=False):
        # Hold forward variables
        tmp_scope_vec = None
-        inner_scope = core.Scope()
+        inner_scope = self._get_scope(program_id=program_id,
+                                      use_scope_cache=use_scope_cache)
        if not framework._in_eager_mode_:
            tmp_scope_vec = core.VarBase(core.VarDesc.VarType.FP32, [],
                                         "program_out_scope",