From 4f28a4c2929a2b37f220b0c4e6f720e5b558f083 Mon Sep 17 00:00:00 2001
From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com>
Date: Tue, 20 Sep 2022 10:50:16 +0800
Subject: [PATCH] Run_program_op add scope cache & reuse (#45813) (#46223)

* add scope cache & reuse

* add gc scope for end of each train step

* del scope reuse for jit

* refine code

* test
---
 .../eager/to_static/run_program_op_node.h    | 173 ++++++++++++------
 paddle/fluid/framework/scope.h               |   8 +
 paddle/fluid/pybind/pybind.cc                |   3 +-
 .../dygraph_to_static/partial_program.py     |  42 ++++-
 4 files changed, 164 insertions(+), 62 deletions(-)

diff --git a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h
index 93e957e20fa..b5f0278e2d0 100644
--- a/paddle/fluid/eager/to_static/run_program_op_node.h
+++ b/paddle/fluid/eager/to_static/run_program_op_node.h
@@ -246,6 +246,34 @@ static void BuildScopeByBlock(
   }
 }
 
+static void GcScope(paddle::framework::Scope *scope) {
+  std::deque<std::shared_ptr<paddle::memory::Allocation>> *garbages =
+      new std::deque<std::shared_ptr<paddle::memory::Allocation>>();
+
+  for (auto &var : scope->LocalVars()) {
+    if (var != nullptr) {
+      if (var->IsType<paddle::framework::LoDTensor>()) {
+        garbages->emplace_back(var->GetMutable<paddle::framework::LoDTensor>()
+                                   ->MoveMemoryHolder());
+      }
+      if (var->IsType<phi::SelectedRows>()) {
+        garbages->emplace_back(var->GetMutable<phi::SelectedRows>()
+                                   ->mutable_value()
+                                   ->MoveMemoryHolder());
+      }
+      if (var->IsType<paddle::framework::LoDTensorArray>()) {
+        auto *lod_tensor_arr =
+            var->GetMutable<paddle::framework::LoDTensorArray>();
+        for (auto &t : *lod_tensor_arr) {
+          garbages->emplace_back(t.MoveMemoryHolder());
+        }
+        lod_tensor_arr->clear();
+      }
+    }
+  }
+  delete garbages;  // free mem
+}
+
 }  // namespace details
 
 inline void RunProgramAPI(
@@ -274,16 +302,6 @@ inline void RunProgramAPI(
       1,
       paddle::platform::errors::InvalidArgument(
           "The OutScope of RunProgramGradOp should only hold one scope."));
-  // Step 2. prepare executor and init persistable variables
-  // NOTE(Aurelius84): While training some models, forward can be called many
-  // times and then apply backpropagation all at once, such as Reinforcement
-  // Learning. Tensor data in multi-step training should be saved into single
-  // scope separately. Otherwise, the gradients can be miscalculated because
-  // always using the Tensor data of the last step in forward.
-  paddle::framework::Scope *global_inner_scope = out_scope_vec->front();
-  VLOG(2) << "The number of sub scopes before forward: "
-          << out_scope_vec->front()->kids().size();
-  paddle::framework::Scope &scope = global_inner_scope->NewScope();
 
   bool use_interpretorcore =
       PADDLE_GET_CONST(bool, attrs.at("use_interpretorcore"));
@@ -291,6 +309,8 @@ inline void RunProgramAPI(
   if (use_interpretorcore) {
     VLOG(0) << "RunProgramOp use interpretercore to execute program.";
 
+    paddle::framework::Scope *global_inner_scope = out_scope_vec->front();
+
     auto input_names = details::GetTensorsName(x);
     auto output_names = details::GetTensorsName(out);
     auto dout_names = details::GetTensorsName(dout);
@@ -308,12 +328,16 @@ inline void RunProgramAPI(
     if (!interpretercore_info_cache.Has(program_id, /*is_grad=*/false)) {
       VLOG(2) << "No interpretercore cahce, so create a new interpretercore";
       // Step 1. share input_vars & parameters into scope
-      details::ShareTensorsIntoScope(x, &scope);
-      details::ShareTensorsIntoScope(params, &scope);
+      details::ShareTensorsIntoScope(x, global_inner_scope);
+      details::ShareTensorsIntoScope(params, global_inner_scope);
       // Step 2. create new interpretercore
       auto interpreter_core =
           paddle::framework::CreateInterpreterCoreInfoToCache(
-              *forward_program, place, /*is_grad=*/false, program_id, &scope);
+              *forward_program,
+              place,
+              /*is_grad=*/false,
+              program_id,
+              global_inner_scope);
       // Step 3. get all eager gc vars
       std::set<std::string> skip_eager_delete_vars =
           paddle::framework::details::ParseSafeEagerDeletionSkipVarsSet(
@@ -331,10 +355,14 @@ inline void RunProgramAPI(
         interpreter_core->Run({});
       }
       // Step 5. Get Output
-      details::ShareTensorsFromScopeWithPartialBlock(
-          out, *forward_global_block, *backward_global_block, &scope);
-      details::ShareTensorsFromScopeWithPartialBlock(
-          dout, *forward_global_block, *backward_global_block, &scope);
+      details::ShareTensorsFromScopeWithPartialBlock(out,
+                                                     *forward_global_block,
+                                                     *backward_global_block,
+                                                     global_inner_scope);
+      details::ShareTensorsFromScopeWithPartialBlock(dout,
+                                                     *forward_global_block,
+                                                     *backward_global_block,
+                                                     global_inner_scope);
     } else {
       VLOG(2) << "Get interpretercore cahce by program:" << program_id;
       // Step 1. get cache interpretercore
@@ -342,34 +370,55 @@ inline void RunProgramAPI(
           interpretercore_info_cache.GetMutable(program_id, /*is_grad=*/false);
       auto &interpreter_core = cached_value.core_;
       // Step 2. update scope for cache interpretercore
-      details::ShareTensorsIntoScope(x, &scope);
-      details::ShareTensorsIntoScope(params, &scope);
-      details::BuildScopeByBlock(
-          *interpreter_core.get(), *forward_global_block, &scope);
-      interpreter_core->reset_scope(&scope);
+      details::ShareTensorsIntoScope(x, global_inner_scope);
+      details::ShareTensorsIntoScope(params, global_inner_scope);
+      if (interpreter_core->GetVariableScope()->GetMutableScope() !=
+          global_inner_scope) {
+        details::BuildScopeByBlock(
+            *interpreter_core.get(), *forward_global_block, global_inner_scope);
+        interpreter_core->reset_scope(global_inner_scope);
+      }
       // Step 3. interpretercore run
       if (forward_global_block->OpSize() > 0) {
         interpreter_core->Run({});
       }
       // Step 4. Get Output
-      details::ShareTensorsFromScopeWithPartialBlock(
-          out, *forward_global_block, *backward_global_block, &scope);
-      details::ShareTensorsFromScopeWithPartialBlock(
-          dout, *forward_global_block, *backward_global_block, &scope);
+      details::ShareTensorsFromScopeWithPartialBlock(out,
+                                                     *forward_global_block,
+                                                     *backward_global_block,
+                                                     global_inner_scope);
+      details::ShareTensorsFromScopeWithPartialBlock(dout,
+                                                     *forward_global_block,
+                                                     *backward_global_block,
+                                                     global_inner_scope);
     }
     VLOG(3) << paddle::framework::GenScopeTreeDebugInfo(out_scope_vec->front());
     if (is_test) {
-      VLOG(1) << "is test, after forward, drop kids";
-      out_scope_vec->front()->DropKids();
+      VLOG(4) << "is test, set this scope can reused";
+      global_inner_scope->SetCanReuesd(true);
+      details::GcScope(global_inner_scope);
+    } else {
+      VLOG(4) << "not test, set this scope can not reused";
+      global_inner_scope->SetCanReuesd(false);
     }
-    VLOG(2) << "The number of sub scopes after forward: "
-            << out_scope_vec->front()->kids().size();
 #ifdef PADDLE_WITH_MKLDNN
     if (FLAGS_use_mkldnn) paddle::platform::DontClearMKLDNNCache(place);
 #endif
   } else {
     VLOG(2) << "RunProgramOp execute with parallel_executor.";
+
+    // Step 2. prepare executor and init persistable variables
+    // NOTE(Aurelius84): While training some models, forward can be called many
+    // times and then apply backpropagation all at once, such as Reinforcement
+    // Learning. Tensor data in multi-step training should be saved into single
+    // scope separately. Otherwise, the gradients can be miscalculated because
+    // always using the Tensor data of the last step in forward.
+    paddle::framework::Scope *global_inner_scope = out_scope_vec->front();
+    VLOG(2) << "The number of sub scopes before forward: "
+            << out_scope_vec->front()->kids().size();
+    paddle::framework::Scope &scope = global_inner_scope->NewScope();
+
     // share input_vars & parameters into scope
     details::ShareTensorsIntoScope(x, &scope);
     details::ShareTensorsIntoScope(params, &scope);
@@ -454,21 +503,14 @@ inline void RunProgramGradAPI(
       1,
       paddle::platform::errors::InvalidArgument(
           "The OutScope of RunProgramGradOp should only hold one scope."));
-  paddle::framework::Scope *global_inner_scope = out_scope_vec->front();
-  auto sub_scope_num = global_inner_scope->kids().size();
-  VLOG(2) << "The number of sub scopes before backward: " << sub_scope_num;
-  PADDLE_ENFORCE_GT(sub_scope_num,
-                    0,
-                    paddle::platform::errors::InvalidArgument(
-                        "The OutScope of RunProgramGradOp should hold at "
-                        "least one sub scope."));
-  auto &scope = *(global_inner_scope->kids().front());
 
   auto place = egr::Controller::Instance().GetExpectedPlace();
 
   if (use_interpretorcore) {
     VLOG(0) << "RunProgramGradOp use interpretercore to execute program.";
+
+    paddle::framework::Scope *global_inner_scope = out_scope_vec->front();
+
     auto *forward_global_block = PADDLE_GET_CONST(
         paddle::framework::BlockDesc *, attrs.at("forward_global_block"));
     auto *backward_global_block = PADDLE_GET_CONST(
@@ -490,10 +532,14 @@ inline void RunProgramGradAPI(
         paddle::framework::InterpreterCoreInfoCache::Instance();
     if (!interpretercore_info_cache.Has(program_id, /*is_grad=*/true)) {
       VLOG(2) << "No interpretercore cahce, so create a new interpretercore";
-      details::ShareTensorsIntoScope(out_grad, &scope);
+      details::ShareTensorsIntoScope(out_grad, global_inner_scope);
       auto interpreter_core =
           paddle::framework::CreateInterpreterCoreInfoToCache(
-              *backward_program, place, /*is_grad=*/true, program_id, &scope);
+              *backward_program,
+              place,
+              /*is_grad=*/true,
+              program_id,
+              global_inner_scope);
 
       // get all eager gc vars
       std::set<std::string> skip_eager_delete_vars;
@@ -518,10 +564,14 @@ inline void RunProgramGradAPI(
           interpretercore_info_cache.GetMutable(program_id, /*is_grad=*/true);
       auto &interpreter_core = cached_value.core_;
       // update scope
-      details::ShareTensorsIntoScope(out_grad, &scope);
-      details::BuildScopeByBlock(
-          *interpreter_core.get(), *backward_global_block, &scope);
-      interpreter_core->reset_scope(&scope);
+      details::ShareTensorsIntoScope(out_grad, global_inner_scope);
+      if (interpreter_core->GetVariableScope()->GetMutableScope() !=
+          global_inner_scope) {
+        details::BuildScopeByBlock(*interpreter_core.get(),
+                                   *backward_global_block,
+                                   global_inner_scope);
+        interpreter_core->reset_scope(global_inner_scope);
+      }
 
       if (backward_global_block->OpSize() > 0) {
         // Debug info: scope info when run end
@@ -531,16 +581,31 @@ inline void RunProgramGradAPI(
       }
     }
     // Step 4. get outputs
-    details::ShareTensorsFromScopeWithPartialBlock(
-        x_grad, *forward_global_block, *backward_global_block, &scope);
-    details::ShareTensorsFromScopeWithPartialBlock(
-        params_grad, *forward_global_block, *backward_global_block, &scope);
-
-    // Step5. drop current scope
-    global_inner_scope->DeleteScope(&scope);
-    VLOG(2) << "The number of sub scopes after backward: "
-            << global_inner_scope->kids().size();
+    details::ShareTensorsFromScopeWithPartialBlock(x_grad,
+                                                   *forward_global_block,
+                                                   *backward_global_block,
+                                                   global_inner_scope);
+    details::ShareTensorsFromScopeWithPartialBlock(params_grad,
+                                                   *forward_global_block,
+                                                   *backward_global_block,
+                                                   global_inner_scope);
+    VLOG(4) << "after backward gc all vars";
+    global_inner_scope->SetCanReuesd(true);
+    details::GcScope(global_inner_scope);
   } else {
+    VLOG(2) << "RunProgramGradOp use pe to execute program.";
+
+    paddle::framework::Scope *global_inner_scope = out_scope_vec->front();
+    auto sub_scope_num = global_inner_scope->kids().size();
+    VLOG(2) << "The number of sub scopes before backward: " << sub_scope_num;
+    PADDLE_ENFORCE_GT(sub_scope_num,
+                      0,
+                      paddle::platform::errors::InvalidArgument(
+                          "The OutScope of RunProgramGradOp should hold at "
+                          "least one sub scope."));
+
+    auto &scope = *(global_inner_scope->kids().front());
+
     auto *global_block = PADDLE_GET_CONST(paddle::framework::BlockDesc *,
                                           attrs.at("global_block"));
     auto orig_end_op_index =
diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h
index c560db03949..7f08fc9b4e2 100644
--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@@ -132,6 +132,11 @@ class Scope : public ScopeBase {
   // Rename variable to a new name and return the new name
   std::string Rename(const std::string& origin_name) const;
 
+  // only for dygraph_to_static
+  bool CanReuesd() const { return can_reused_; }
+
+  void SetCanReuesd(bool can_reused) { can_reused_ = can_reused; }
+
  protected:
   struct KeyHasher {
     std::size_t operator()(const std::string& key) const {
@@ -169,6 +174,9 @@ class Scope : public ScopeBase {
   mutable std::list<Scope*> kids_;
   const Scope* parent_{nullptr};
 
+  // only for dygraph_to_static
+  bool can_reused_{false};
+
   DISABLE_COPY_AND_ASSIGN(Scope);
 
  private:
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 4883183b383..ffb963055d4 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -1080,7 +1080,8 @@ All parameter, weight, gradient are variables in Paddle.
            R"DOC(
           Delete all sub-scopes of the current scope.
           )DOC")
-      .def("_kids", &Scope::kids);
+      .def("_kids", &Scope::kids)
+      .def_property("_can_reuesd", &Scope::CanReuesd, &Scope::SetCanReuesd);
 
   m.def(
       "Scope",
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py
index 1dfdda102c9..c9e659cb68b 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py
@@ -169,6 +169,25 @@ class PartialProgramLayer:
             custom_white_list=custom_white_list,
             custom_black_list=custom_black_list)
 
+        # program_id -> list(scope)
+        self._scope_cache = {}
+
+    def _get_scope(self, program_id=None, use_scope_cache=False):
+        if use_scope_cache:
+            if program_id not in self._scope_cache:
+                scope = core.Scope()
+                self._scope_cache[program_id] = [scope]
+                return scope
+            else:
+                for scope in self._scope_cache[program_id]:
+                    if scope._can_reuesd:
+                        return scope
+                scope = core.Scope()
+                self._scope_cache[program_id].append(scope)
+                return scope
+        else:
+            return core.Scope()
+
     @LazyInitialized
     def __fake_vars(self):
         return _create_fake_var()
@@ -555,11 +574,19 @@ class PartialProgramLayer:
                 ('forward_global_block', self.forward_program.desc.block(0),
                  'backward_global_block', self.backward_program.desc.block(0)))
 
-        _legacy_C_ops.run_program(self._valid_vars(in_vars),
-                                  self._valid_vars(self._params),
-                                  self._valid_vars(out_vars),
-                                  self._create_scope_vec(), self._double_grads,
-                                  self._cuda_graph_vec, *attrs)
+            _legacy_C_ops.run_program(
+                self._valid_vars(in_vars), self._valid_vars(self._params),
+                self._valid_vars(out_vars),
+                self._create_scope_vec(program_id=self.program_id,
+                                       use_scope_cache=True),
+                self._double_grads, self._cuda_graph_vec, *attrs)
+        else:
+            _legacy_C_ops.run_program(self._valid_vars(in_vars),
+                                      self._valid_vars(self._params),
+                                      self._valid_vars(out_vars),
+                                      self._create_scope_vec(),
+                                      self._double_grads, self._cuda_graph_vec,
+                                      *attrs)
 
         restored_nest_out = self._restore_out(out_vars)
         return self._remove_no_value(restored_nest_out)
@@ -735,10 +762,11 @@ class PartialProgramLayer:
 
         return input_vars, out_vars
 
-    def _create_scope_vec(self):
+    def _create_scope_vec(self, program_id=None, use_scope_cache=False):
         # Hold forward variables
         tmp_scope_vec = None
-        inner_scope = core.Scope()
+        inner_scope = self._get_scope(program_id=program_id,
+                                      use_scope_cache=use_scope_cache)
         if not framework._in_eager_mode_:
             tmp_scope_vec = core.VarBase(core.VarDesc.VarType.FP32, [],
                                          "program_out_scope",
--
GitLab
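
Note on the caching policy (not part of the patch): _get_scope keys cached scopes by program_id and only hands back a cached scope once it has been flagged reusable, which happens after backward (or an is_test forward) has run GcScope on it. The sketch below is a minimal, standalone illustration of that policy under those assumptions; it substitutes a stand-in FakeScope class for paddle's core.Scope, and the names FakeScope, ScopeCache, and the demo block are illustrative only, not part of the patch.

class FakeScope:
    """Stand-in for core.Scope plus the _can_reuesd flag added by this patch."""

    def __init__(self):
        # A freshly created scope is not reusable; a training forward keeps it
        # alive until backward (or an is_test forward) gc's it.
        self._can_reuesd = False


class ScopeCache:
    """program_id -> list(scope), mirroring PartialProgramLayer._scope_cache."""

    def __init__(self):
        self._scope_cache = {}

    def get_scope(self, program_id=None, use_scope_cache=False):
        if not use_scope_cache:
            return FakeScope()
        scopes = self._scope_cache.setdefault(program_id, [])
        # Reuse the first scope already marked reusable, otherwise grow the list.
        for scope in scopes:
            if scope._can_reuesd:
                return scope
        scope = FakeScope()
        scopes.append(scope)
        return scope


if __name__ == "__main__":
    cache = ScopeCache()
    s1 = cache.get_scope(program_id=1, use_scope_cache=True)
    # While a train step is still in flight the scope is not reusable, so a
    # second request for the same program gets a fresh scope.
    s2 = cache.get_scope(program_id=1, use_scope_cache=True)
    assert s1 is not s2
    # After backward, RunProgramGradAPI calls SetCanReuesd(true) and GcScope;
    # the next step for this program then picks the same scope up again.
    s1._can_reuesd = True
    s3 = cache.get_scope(program_id=1, use_scope_cache=True)
    assert s3 is s1

Reusing one global_inner_scope per program, instead of creating and dropping a kid scope every step, is also what lets the cached InterpreterCore keep pointing at the same scope, so BuildScopeByBlock and reset_scope only run when the scope actually changes.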