Unverified commit 369a235d, authored by zhangbo9674, committed by GitHub

Run_program_op add scope cache & reuse (#45813)

* add scope cache & reuse

* add gc scope for end of each train step

* del scope reuse for jit

* refine code

* test
Parent 1ed8e9b8
@@ -246,6 +246,34 @@ static void BuildScopeByBlock(
   }
 }
 
+static void GcScope(paddle::framework::Scope *scope) {
+  std::deque<std::shared_ptr<paddle::memory::Allocation>> *garbages =
+      new std::deque<std::shared_ptr<paddle::memory::Allocation>>();
+
+  for (auto &var : scope->LocalVars()) {
+    if (var != nullptr) {
+      if (var->IsType<paddle::framework::LoDTensor>()) {
+        garbages->emplace_back(var->GetMutable<paddle::framework::LoDTensor>()
+                                   ->MoveMemoryHolder());
+      }
+      if (var->IsType<phi::SelectedRows>()) {
+        garbages->emplace_back(var->GetMutable<phi::SelectedRows>()
+                                   ->mutable_value()
+                                   ->MoveMemoryHolder());
+      }
+      if (var->IsType<paddle::framework::LoDTensorArray>()) {
+        auto *lod_tensor_arr =
+            var->GetMutable<paddle::framework::LoDTensorArray>();
+        for (auto &t : *lod_tensor_arr) {
+          garbages->emplace_back(t.MoveMemoryHolder());
+        }
+        lod_tensor_arr->clear();
+      }
+    }
+  }
+  delete garbages;  // free mem
+}
+
 }  // namespace details
 
 inline void RunProgramAPI(
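Note on the `GcScope` helper added above: it does not delete variables or the scope itself; it only moves each variable's memory holder into a temporary garbage list and then deletes that list, so the underlying allocations are released while the variable objects (and the scope layout a cached interpreter core was built against) stay intact. The self-contained sketch below illustrates the same pattern with toy stand-in types; `Allocation`, `Tensor`, and `Scope` here are simplified placeholders, not Paddle's real classes.

```cpp
// Minimal sketch of the GcScope idea: tensors keep their metadata, but their
// memory holders are moved into a temporary garbage list whose destruction
// frees the underlying buffers.
#include <cstddef>
#include <deque>
#include <iostream>
#include <memory>
#include <utility>
#include <vector>

struct Allocation {                 // stand-in for paddle::memory::Allocation
  explicit Allocation(std::size_t n) : bytes(n) {}
  ~Allocation() { std::cout << "freed " << bytes << " bytes\n"; }
  std::size_t bytes;
};

struct Tensor {                     // stand-in for a LoDTensor
  std::shared_ptr<Allocation> holder;
  std::shared_ptr<Allocation> MoveMemoryHolder() { return std::move(holder); }
};

struct Scope {                      // stand-in for paddle::framework::Scope
  std::vector<Tensor> vars;
};

void GcScopeSketch(Scope *scope) {
  auto *garbages = new std::deque<std::shared_ptr<Allocation>>();
  for (auto &t : scope->vars) {
    if (t.holder != nullptr) {
      // the tensor object survives, only its memory moves out
      garbages->emplace_back(t.MoveMemoryHolder());
    }
  }
  delete garbages;  // last references dropped here -> buffers are freed
}

int main() {
  Scope scope;
  scope.vars.push_back({std::make_shared<Allocation>(1024)});
  scope.vars.push_back({std::make_shared<Allocation>(4096)});
  GcScopeSketch(&scope);            // prints the two "freed ..." lines
  std::cout << "scope still has " << scope.vars.size() << " variables\n";
}
```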
@@ -274,16 +302,6 @@ inline void RunProgramAPI(
       1,
       paddle::platform::errors::InvalidArgument(
           "The OutScope of RunProgramGradOp should only hold one scope."));
-  // Step 2. prepare executor and init persistable variables
-  // NOTE(Aurelius84): While training some models, forward can be called many
-  // times and then apply backpropagation all at once, such as Reinforcement
-  // Learning. Tensor data in multi-step training should be saved into single
-  // scope separately. Otherwise, the gradients can be miscalculated because
-  // always using the Tensor data of the last step in forward.
-  paddle::framework::Scope *global_inner_scope = out_scope_vec->front();
-  VLOG(2) << "The number of sub scopes before forward: "
-          << out_scope_vec->front()->kids().size();
-  paddle::framework::Scope &scope = global_inner_scope->NewScope();
 
   bool use_interpretorcore =
       PADDLE_GET_CONST(bool, attrs.at("use_interpretorcore"));
@@ -291,6 +309,8 @@ inline void RunProgramAPI(
   if (use_interpretorcore) {
     VLOG(0) << "RunProgramOp use interpretercore to execute program.";
 
+    paddle::framework::Scope *global_inner_scope = out_scope_vec->front();
+
     auto input_names = details::GetTensorsName(x);
     auto output_names = details::GetTensorsName(out);
     auto dout_names = details::GetTensorsName(dout);
@@ -308,12 +328,16 @@ inline void RunProgramAPI(
     if (!interpretercore_info_cache.Has(program_id, /*is_grad=*/false)) {
       VLOG(2) << "No interpretercore cahce, so create a new interpretercore";
       // Step 1. share input_vars & parameters into scope
-      details::ShareTensorsIntoScope(x, &scope);
-      details::ShareTensorsIntoScope(params, &scope);
+      details::ShareTensorsIntoScope(x, global_inner_scope);
+      details::ShareTensorsIntoScope(params, global_inner_scope);
       // Step 2. create new interpretercore
       auto interpreter_core =
           paddle::framework::CreateInterpreterCoreInfoToCache(
-              *forward_program, place, /*is_grad=*/false, program_id, &scope);
+              *forward_program,
+              place,
+              /*is_grad=*/false,
+              program_id,
+              global_inner_scope);
       // Step 3. get all eager gc vars
       std::set<std::string> skip_eager_delete_vars =
           paddle::framework::details::ParseSafeEagerDeletionSkipVarsSet(
@@ -331,10 +355,14 @@ inline void RunProgramAPI(
         interpreter_core->Run({});
       }
       // Step 5. Get Output
-      details::ShareTensorsFromScopeWithPartialBlock(
-          out, *forward_global_block, *backward_global_block, &scope);
-      details::ShareTensorsFromScopeWithPartialBlock(
-          dout, *forward_global_block, *backward_global_block, &scope);
+      details::ShareTensorsFromScopeWithPartialBlock(out,
+                                                     *forward_global_block,
+                                                     *backward_global_block,
+                                                     global_inner_scope);
+      details::ShareTensorsFromScopeWithPartialBlock(dout,
+                                                     *forward_global_block,
+                                                     *backward_global_block,
+                                                     global_inner_scope);
     } else {
       VLOG(2) << "Get interpretercore cahce by program:" << program_id;
       // Step 1. get cache interpretercore
@@ -342,34 +370,55 @@ inline void RunProgramAPI(
           interpretercore_info_cache.GetMutable(program_id, /*is_grad=*/false);
       auto &interpreter_core = cached_value.core_;
       // Step 2. update scope for cache interpretercore
-      details::ShareTensorsIntoScope(x, &scope);
-      details::ShareTensorsIntoScope(params, &scope);
-      details::BuildScopeByBlock(
-          *interpreter_core.get(), *forward_global_block, &scope);
-      interpreter_core->reset_scope(&scope);
+      details::ShareTensorsIntoScope(x, global_inner_scope);
+      details::ShareTensorsIntoScope(params, global_inner_scope);
+      if (interpreter_core->GetVariableScope()->GetMutableScope() !=
+          global_inner_scope) {
+        details::BuildScopeByBlock(
+            *interpreter_core.get(), *forward_global_block, global_inner_scope);
+        interpreter_core->reset_scope(global_inner_scope);
+      }
       // Step 3. interpretercore run
       if (forward_global_block->OpSize() > 0) {
         interpreter_core->Run({});
       }
       // Step 4. Get Output
-      details::ShareTensorsFromScopeWithPartialBlock(
-          out, *forward_global_block, *backward_global_block, &scope);
-      details::ShareTensorsFromScopeWithPartialBlock(
-          dout, *forward_global_block, *backward_global_block, &scope);
+      details::ShareTensorsFromScopeWithPartialBlock(out,
+                                                     *forward_global_block,
+                                                     *backward_global_block,
+                                                     global_inner_scope);
+      details::ShareTensorsFromScopeWithPartialBlock(dout,
+                                                     *forward_global_block,
+                                                     *backward_global_block,
+                                                     global_inner_scope);
     }
     VLOG(3) << paddle::framework::GenScopeTreeDebugInfo(out_scope_vec->front());
     if (is_test) {
-      VLOG(1) << "is test, after forward, drop kids";
-      out_scope_vec->front()->DropKids();
+      VLOG(4) << "is test, set this scope can reused";
+      global_inner_scope->SetCanReuesd(true);
+      details::GcScope(global_inner_scope);
+    } else {
+      VLOG(4) << "not test, set this scope can not reused";
+      global_inner_scope->SetCanReuesd(false);
     }
-    VLOG(2) << "The number of sub scopes after forward: "
-            << out_scope_vec->front()->kids().size();
 #ifdef PADDLE_WITH_MKLDNN
     if (FLAGS_use_mkldnn) paddle::platform::DontClearMKLDNNCache(place);
 #endif
   } else {
     VLOG(2) << "RunProgramOp execute with parallel_executor.";
+    // Step 2. prepare executor and init persistable variables
+    // NOTE(Aurelius84): While training some models, forward can be called many
+    // times and then apply backpropagation all at once, such as Reinforcement
+    // Learning. Tensor data in multi-step training should be saved into single
+    // scope separately. Otherwise, the gradients can be miscalculated because
+    // always using the Tensor data of the last step in forward.
+    paddle::framework::Scope *global_inner_scope = out_scope_vec->front();
+    VLOG(2) << "The number of sub scopes before forward: "
+            << out_scope_vec->front()->kids().size();
+    paddle::framework::Scope &scope = global_inner_scope->NewScope();
+
     // share input_vars & parameters into scope
     details::ShareTensorsIntoScope(x, &scope);
     details::ShareTensorsIntoScope(params, &scope);
@@ -454,21 +503,14 @@ inline void RunProgramGradAPI(
       1,
       paddle::platform::errors::InvalidArgument(
           "The OutScope of RunProgramGradOp should only hold one scope."));
-  paddle::framework::Scope *global_inner_scope = out_scope_vec->front();
-  auto sub_scope_num = global_inner_scope->kids().size();
-  VLOG(2) << "The number of sub scopes before backward: " << sub_scope_num;
-  PADDLE_ENFORCE_GT(sub_scope_num,
-                    0,
-                    paddle::platform::errors::InvalidArgument(
-                        "The OutScope of RunProgramGradOp should hold at "
-                        "least one sub scope."));
-  auto &scope = *(global_inner_scope->kids().front());
 
   auto place = egr::Controller::Instance().GetExpectedPlace();
 
   if (use_interpretorcore) {
     VLOG(0) << "RunProgramGradOp use interpretercore to execute program.";
 
+    paddle::framework::Scope *global_inner_scope = out_scope_vec->front();
+
     auto *forward_global_block = PADDLE_GET_CONST(
         paddle::framework::BlockDesc *, attrs.at("forward_global_block"));
     auto *backward_global_block = PADDLE_GET_CONST(
@@ -490,10 +532,14 @@ inline void RunProgramGradAPI(
         paddle::framework::InterpreterCoreInfoCache::Instance();
     if (!interpretercore_info_cache.Has(program_id, /*is_grad=*/true)) {
       VLOG(2) << "No interpretercore cahce, so create a new interpretercore";
-      details::ShareTensorsIntoScope(out_grad, &scope);
+      details::ShareTensorsIntoScope(out_grad, global_inner_scope);
       auto interpreter_core =
           paddle::framework::CreateInterpreterCoreInfoToCache(
-              *backward_program, place, /*is_grad=*/true, program_id, &scope);
+              *backward_program,
+              place,
+              /*is_grad=*/true,
+              program_id,
+              global_inner_scope);
       // get all eager gc vars
       std::set<std::string> skip_eager_delete_vars;
@@ -518,10 +564,14 @@ inline void RunProgramGradAPI(
           interpretercore_info_cache.GetMutable(program_id, /*is_grad=*/true);
       auto &interpreter_core = cached_value.core_;
       // update scope
-      details::ShareTensorsIntoScope(out_grad, &scope);
-      details::BuildScopeByBlock(
-          *interpreter_core.get(), *backward_global_block, &scope);
-      interpreter_core->reset_scope(&scope);
+      details::ShareTensorsIntoScope(out_grad, global_inner_scope);
+      if (interpreter_core->GetVariableScope()->GetMutableScope() !=
+          global_inner_scope) {
+        details::BuildScopeByBlock(*interpreter_core.get(),
+                                   *backward_global_block,
+                                   global_inner_scope);
+        interpreter_core->reset_scope(global_inner_scope);
+      }
 
       if (backward_global_block->OpSize() > 0) {
         // Debug info: scope info when run end
@@ -531,16 +581,31 @@ inline void RunProgramGradAPI(
       }
     }
     // Step 4. get outputs
-    details::ShareTensorsFromScopeWithPartialBlock(
-        x_grad, *forward_global_block, *backward_global_block, &scope);
-    details::ShareTensorsFromScopeWithPartialBlock(
-        params_grad, *forward_global_block, *backward_global_block, &scope);
-
-    // Step5. drop current scope
-    global_inner_scope->DeleteScope(&scope);
-    VLOG(2) << "The number of sub scopes after backward: "
-            << global_inner_scope->kids().size();
+    details::ShareTensorsFromScopeWithPartialBlock(x_grad,
+                                                   *forward_global_block,
+                                                   *backward_global_block,
+                                                   global_inner_scope);
+    details::ShareTensorsFromScopeWithPartialBlock(params_grad,
+                                                   *forward_global_block,
+                                                   *backward_global_block,
+                                                   global_inner_scope);
+    VLOG(4) << "after backward gc all vars";
+    global_inner_scope->SetCanReuesd(true);
+    details::GcScope(global_inner_scope);
   } else {
+    VLOG(2) << "RunProgramGradOp use pe to execute program.";
+
+    paddle::framework::Scope *global_inner_scope = out_scope_vec->front();
+    auto sub_scope_num = global_inner_scope->kids().size();
+    VLOG(2) << "The number of sub scopes before backward: " << sub_scope_num;
+    PADDLE_ENFORCE_GT(sub_scope_num,
+                      0,
+                      paddle::platform::errors::InvalidArgument(
+                          "The OutScope of RunProgramGradOp should hold at "
+                          "least one sub scope."));
+    auto &scope = *(global_inner_scope->kids().front());
+
     auto *global_block = PADDLE_GET_CONST(paddle::framework::BlockDesc *,
                                           attrs.at("global_block"));
     auto orig_end_op_index =
......
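Taken together, the forward and backward changes above replace the old per-step sub-scope (`NewScope()` plus `DropKids()`/`DeleteScope()`) with a single cached scope: a training forward marks the scope non-reusable until backward has consumed the activations, backward frees the step's tensor memory with `GcScope` and marks the scope reusable, and an `is_test` forward frees and releases it immediately. The toy model below sketches that lifecycle; `ToyScope`, `Forward`, and `Backward` are illustrative names only, not Paddle APIs.

```cpp
// Toy model of the per-step scope lifecycle introduced by this commit.
#include <cassert>

struct ToyScope {
  bool can_reused{false};   // mirrors Scope::can_reused_, default false
  bool holds_tensors{false};
};

void Forward(ToyScope *s, bool is_test) {
  s->holds_tensors = true;          // inputs/params/outputs live in the scope
  if (is_test) {
    s->can_reused = true;           // no backward will come
    s->holds_tensors = false;       // GcScope(...) frees the memory right away
  } else {
    s->can_reused = false;          // backward still needs the forward tensors
  }
}

void Backward(ToyScope *s) {
  s->can_reused = true;             // gradients have been shared out
  s->holds_tensors = false;         // GcScope(...) frees the step's memory
}

int main() {
  ToyScope s;
  Forward(&s, /*is_test=*/false);   // training step: locked until backward
  assert(!s.can_reused);
  Backward(&s);                     // after backward the scope can be reused
  assert(s.can_reused && !s.holds_tensors);

  Forward(&s, /*is_test=*/true);    // eval step: reusable immediately
  assert(s.can_reused && !s.holds_tensors);
}
```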
@@ -132,6 +132,11 @@ class Scope : public ScopeBase {
   // Rename variable to a new name and return the new name
   std::string Rename(const std::string& origin_name) const;
 
+  // only for dygraph_to_static
+  bool CanReuesd() const { return can_reused_; }
+
+  void SetCanReuesd(bool can_reused) { can_reused_ = can_reused; }
+
  protected:
   struct KeyHasher {
     std::size_t operator()(const std::string& key) const {
@@ -169,6 +174,9 @@ class Scope : public ScopeBase {
   mutable std::list<Scope*> kids_;
   const Scope* parent_{nullptr};
 
+  // only for dygraph_to_static
+  bool can_reused_{false};
+
   DISABLE_COPY_AND_ASSIGN(Scope);
 
 #ifndef PADDLE_ON_INFERENCE
......
@@ -1077,7 +1077,8 @@ All parameter, weight, gradient are variables in Paddle.
            R"DOC(
            Delete all sub-scopes of the current scope.
            )DOC")
-      .def("_kids", &Scope::kids);
+      .def("_kids", &Scope::kids)
+      .def_property("_can_reuesd", &Scope::CanReuesd, &Scope::SetCanReuesd);
 
   m.def(
       "Scope",
......
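The new `_can_reuesd` attribute is exposed through pybind11's `def_property`, which binds a C++ getter/setter pair to a single read/write Python attribute; this is what lets the Python-side scope cache check the flag. Below is a minimal standalone pybind11 sketch of the same binding pattern; the module name `toyscope` and class `ToyScope` are made up for illustration and are not part of Paddle.

```cpp
// Minimal pybind11 sketch of exposing a getter/setter pair as a Python
// attribute, the same mechanism used for Scope::_can_reuesd above.
#include <pybind11/pybind11.h>

namespace py = pybind11;

class ToyScope {
 public:
  bool CanReuesd() const { return can_reused_; }
  void SetCanReuesd(bool v) { can_reused_ = v; }

 private:
  bool can_reused_{false};
};

PYBIND11_MODULE(toyscope, m) {
  py::class_<ToyScope>(m, "ToyScope")
      .def(py::init<>())
      // read/write attribute backed by the C++ getter and setter
      .def_property("_can_reuesd", &ToyScope::CanReuesd, &ToyScope::SetCanReuesd);
}

// Python usage:
//   import toyscope
//   s = toyscope.ToyScope()
//   assert s._can_reuesd is False
//   s._can_reuesd = True
```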
@@ -169,6 +169,25 @@ class PartialProgramLayer:
             custom_white_list=custom_white_list,
             custom_black_list=custom_black_list)
 
+        # program_id -> list(scope)
+        self._scope_cache = {}
+
+    def _get_scope(self, program_id=None, use_scope_cache=False):
+        if use_scope_cache:
+            if program_id not in self._scope_cache:
+                scope = core.Scope()
+                self._scope_cache[program_id] = [scope]
+                return scope
+            else:
+                for scope in self._scope_cache[program_id]:
+                    if scope._can_reuesd:
+                        return scope
+                scope = core.Scope()
+                self._scope_cache[program_id].append(scope)
+                return scope
+        else:
+            return core.Scope()
+
     @LazyInitialized
     def __fake_vars(self):
         return _create_fake_var()
@@ -555,11 +574,19 @@ class PartialProgramLayer:
                 ('forward_global_block', self.forward_program.desc.block(0),
                  'backward_global_block', self.backward_program.desc.block(0)))
-        _legacy_C_ops.run_program(self._valid_vars(in_vars),
-                                  self._valid_vars(self._params),
-                                  self._valid_vars(out_vars),
-                                  self._create_scope_vec(), self._double_grads,
-                                  self._cuda_graph_vec, *attrs)
+            _legacy_C_ops.run_program(
+                self._valid_vars(in_vars), self._valid_vars(self._params),
+                self._valid_vars(out_vars),
+                self._create_scope_vec(program_id=self.program_id,
+                                       use_scope_cache=True),
+                self._double_grads, self._cuda_graph_vec, *attrs)
+        else:
+            _legacy_C_ops.run_program(self._valid_vars(in_vars),
+                                      self._valid_vars(self._params),
+                                      self._valid_vars(out_vars),
+                                      self._create_scope_vec(),
+                                      self._double_grads, self._cuda_graph_vec,
+                                      *attrs)
 
         restored_nest_out = self._restore_out(out_vars)
         return self._remove_no_value(restored_nest_out)
@@ -735,10 +762,11 @@ class PartialProgramLayer:
         return input_vars, out_vars
 
-    def _create_scope_vec(self):
+    def _create_scope_vec(self, program_id=None, use_scope_cache=False):
         # Hold forward variables
         tmp_scope_vec = None
-        inner_scope = core.Scope()
+        inner_scope = self._get_scope(program_id=program_id,
+                                      use_scope_cache=use_scope_cache)
         if not framework._in_eager_mode_:
             tmp_scope_vec = core.VarBase(core.VarDesc.VarType.FP32, [],
                                          "program_out_scope",
......
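The Python-side cache above keys scopes by `program_id` and hands back the first cached scope whose `_can_reuesd` flag is set, creating and remembering a new scope otherwise; `_create_scope_vec` only goes through the cache on the interpretercore path. For reference, the same lookup policy is sketched below in C++ with toy types; `ScopeCache` and `ToyScope` are illustrative, the real cache lives in `PartialProgramLayer._scope_cache` on the Python side.

```cpp
// Sketch of the scope-cache lookup policy implemented by _get_scope above.
#include <cstdint>
#include <memory>
#include <unordered_map>
#include <vector>

struct ToyScope {
  bool can_reused{false};
};

class ScopeCache {
 public:
  // Returns a reusable cached scope for this program, or creates a new one.
  std::shared_ptr<ToyScope> Get(int64_t program_id, bool use_cache) {
    if (!use_cache) return std::make_shared<ToyScope>();
    auto &scopes = cache_[program_id];          // creates the entry if missing
    for (auto &s : scopes) {
      if (s->can_reused) return s;              // same check as scope._can_reuesd
    }
    auto fresh = std::make_shared<ToyScope>();  // every in-flight step gets its own scope
    scopes.push_back(fresh);
    return fresh;
  }

 private:
  std::unordered_map<int64_t, std::vector<std::shared_ptr<ToyScope>>> cache_;
};

int main() {
  ScopeCache cache;
  auto a = cache.Get(/*program_id=*/7, /*use_cache=*/true);
  auto b = cache.Get(7, true);      // a is not reusable yet -> a second scope
  a->can_reused = true;             // e.g. after backward ran GcScope
  auto c = cache.Get(7, true);      // the first scope is handed out again
  return (a != b && a == c) ? 0 : 1;
}
```

Only recycling scopes whose previous step has finished (flag set after backward, or right after an `is_test` forward) is what keeps multi-step forward-then-backward training safe while still bounding the number of scopes per program.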