Unverified · Commit 4230bd87 · authored by Aurelius84 · committed by GitHub

[Dy2St]Remove PE logic in @to_static (#50512)

* [Dy2St]Remove PE logic in @to_static

* fix typo

* fix infer_program

* fix typo

* fix op_size
Parent bc731487
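For context, a minimal usage sketch (not part of this diff) of the entry point the change affects: a method decorated with @to_static is traced into a static program and executed through the run_program op, which after this commit always runs on the new interpreter core instead of ParallelExecutor. The toy model below is hypothetical.

import paddle
from paddle.jit import to_static

class SimpleNet(paddle.nn.Layer):
    def __init__(self):
        super().__init__()
        self.linear = paddle.nn.Linear(10, 1)

    @to_static  # dygraph forward is converted into a cached static program
    def forward(self, x):
        return self.linear(x)

net = SimpleNet()
out = net(paddle.randn([4, 10]))  # executes the static program via run_program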
......@@ -96,25 +96,16 @@ inline void run_program_ad_func(
grad_node->SetGradOutMeta(x, /*slot id*/ 0);
grad_node->SetGradOutMeta(params, /*slot id*/ 1);
bool use_interpretorcore =
PADDLE_GET_CONST(bool, attrs.at("use_interpretorcore"));
VLOG(2) << "clear_no_grad_edges.";
if (use_interpretorcore) {
auto* forward_global_block = PADDLE_GET_CONST(
paddle::framework::BlockDesc*, attrs.at("forward_global_block"));
auto* backward_global_block = PADDLE_GET_CONST(
paddle::framework::BlockDesc*, attrs.at("backward_global_block"));
clear_no_grad_edges_with_partial_block(params,
forward_global_block,
backward_global_block,
grad_node.get(),
/*slot id*/ 1);
} else {
auto* global_block = PADDLE_GET_CONST(paddle::framework::BlockDesc*,
attrs.at("global_block"));
clear_no_grad_edges(params, global_block, grad_node.get(), /*slot id*/ 1);
}
auto* forward_global_block = PADDLE_GET_CONST(
paddle::framework::BlockDesc*, attrs.at("forward_global_block"));
auto* backward_global_block = PADDLE_GET_CONST(
paddle::framework::BlockDesc*, attrs.at("backward_global_block"));
clear_no_grad_edges_with_partial_block(params,
forward_global_block,
backward_global_block,
grad_node.get(),
/*slot id*/ 1);
grad_node->SetGradInMeta(deref_out, 0);
......
......@@ -304,226 +304,145 @@ inline void RunProgramAPI(
paddle::platform::errors::InvalidArgument(
"The OutScope of RunProgramGradOp should only hold one scope."));
bool use_interpretorcore =
PADDLE_GET_CONST(bool, attrs.at("use_interpretorcore"));
VLOG(2) << "RunProgramOp use interpretercore to execute program.";
if (use_interpretorcore) {
VLOG(2) << "RunProgramOp use interpretercore to execute program.";
paddle::framework::Scope *global_inner_scope = out_scope_vec->front();
paddle::framework::Scope *global_inner_scope = out_scope_vec->front();
auto input_names = details::GetTensorsName(x);
auto output_names = details::GetTensorsName(out);
auto dout_names = details::GetTensorsName(dout);
auto input_names = details::GetTensorsName(x);
auto output_names = details::GetTensorsName(out);
auto dout_names = details::GetTensorsName(dout);
if (VLOG_IS_ON(6)) {
std::stringstream s;
s << "input_names: ";
for (auto name : input_names) {
s << name << " ";
}
s << std::endl;
s << "output_names: ";
for (auto name : output_names) {
s << name << " ";
}
s << std::endl;
s << "dout_names: ";
for (auto name : dout_names) {
s << name << " ";
}
s << std::endl;
VLOG(6) << s.str();
}
auto *forward_global_block = PADDLE_GET_CONST(
paddle::framework::BlockDesc *, attrs.at("forward_global_block"));
auto *backward_global_block = PADDLE_GET_CONST(
paddle::framework::BlockDesc *, attrs.at("backward_global_block"));
auto *forward_program = forward_global_block->Program();
auto *backward_program = backward_global_block->Program();
auto &interpretercore_info_cache =
paddle::framework::InterpreterCoreInfoCache::Instance();
std::shared_ptr<paddle::framework::InterpreterCore> interpreter_core =
nullptr;
if (!interpretercore_info_cache.Has(program_id, /*is_grad=*/false)) {
paddle::platform::RecordEvent record_event(
"create_new_interpretercore",
paddle::platform::TracerEventType::UserDefined,
1);
VLOG(2) << "No interpretercore cahce, so create a new interpretercore "
"for program: "
<< program_id;
// Step 1. share input_vars & parameters into scope
details::ShareTensorsIntoScope(x, global_inner_scope);
details::ShareTensorsIntoScope(params, global_inner_scope);
// Step 2. create new interpretercore
interpreter_core =
paddle::framework::CreateInterpreterCoreInfoToCache(*forward_program,
place,
/*is_grad=*/false,
program_id,
global_inner_scope);
// Step 3. get all eager gc vars
std::set<std::string> skip_eager_delete_vars =
paddle::framework::details::ParseSafeEagerDeletionSkipVarsSet(
*backward_program);
// all out_vars are skip_eager_var
skip_eager_delete_vars.insert(output_names.begin(), output_names.end());
skip_eager_delete_vars.insert(dout_names.begin(), dout_names.end());
// update interpretercore skip_gc_var
interpreter_core->SetSkipGcVars(skip_eager_delete_vars);
std::set<std::string> input_vars;
input_vars.insert(input_names.begin(), input_names.end());
interpreter_core->SetJitInputVars(input_vars);
if (VLOG_IS_ON(6)) {
std::stringstream s;
s << "input_names: ";
for (auto name : input_names) {
s << "skip_eager_delete_vars: ";
for (auto name : skip_eager_delete_vars) {
s << name << " ";
}
s << std::endl;
s << "output_names: ";
for (auto name : output_names) {
s << name << " ";
}
s << std::endl;
s << "dout_names: ";
for (auto name : dout_names) {
s << name << " ";
}
s << std::endl;
VLOG(6) << s.str();
}
auto *forward_global_block = PADDLE_GET_CONST(
paddle::framework::BlockDesc *, attrs.at("forward_global_block"));
auto *backward_global_block = PADDLE_GET_CONST(
paddle::framework::BlockDesc *, attrs.at("backward_global_block"));
auto *forward_program = forward_global_block->Program();
auto *backward_program = backward_global_block->Program();
auto &interpretercore_info_cache =
paddle::framework::InterpreterCoreInfoCache::Instance();
std::shared_ptr<paddle::framework::InterpreterCore> interpreter_core =
nullptr;
if (!interpretercore_info_cache.Has(program_id, /*is_grad=*/false)) {
paddle::platform::RecordEvent record_event(
"create_new_interpretercore",
paddle::platform::TracerEventType::UserDefined,
1);
VLOG(2) << "No interpretercore cahce, so create a new interpretercore "
"for program: "
<< program_id;
// Step 1. share input_vars & parameters into scope
details::ShareTensorsIntoScope(x, global_inner_scope);
details::ShareTensorsIntoScope(params, global_inner_scope);
// Step 2. create new interpretercore
interpreter_core = paddle::framework::CreateInterpreterCoreInfoToCache(
*forward_program,
place,
/*is_grad=*/false,
program_id,
global_inner_scope);
// Step 3. get all eager gc vars
std::set<std::string> skip_eager_delete_vars =
paddle::framework::details::ParseSafeEagerDeletionSkipVarsSet(
*backward_program);
// all out_vars are skip_eager_var
skip_eager_delete_vars.insert(output_names.begin(), output_names.end());
skip_eager_delete_vars.insert(dout_names.begin(), dout_names.end());
// update interpretercore skip_gc_var
interpreter_core->SetSkipGcVars(skip_eager_delete_vars);
std::set<std::string> input_vars;
input_vars.insert(input_names.begin(), input_names.end());
interpreter_core->SetJitInputVars(input_vars);
if (VLOG_IS_ON(6)) {
std::stringstream s;
s << "skip_eager_delete_vars: ";
for (auto name : skip_eager_delete_vars) {
s << name << " ";
}
VLOG(6) << s.str();
}
interpretercore_info_cache.UpdateSkipEagerDeleteVars(
program_id, false, skip_eager_delete_vars);
VLOG(2) << "Get skip GC vars size is: " << skip_eager_delete_vars.size();
} else {
paddle::platform::RecordEvent record_event(
"get_interpretercore_cahce",
paddle::platform::TracerEventType::UserDefined,
1);
VLOG(2) << "Get interpretercore cahce by program:" << program_id;
// Step 1. get cache interpretercore
auto &cached_value =
interpretercore_info_cache.GetMutable(program_id, /*is_grad=*/false);
interpreter_core = cached_value.core_;
// Step 2. update scope for cache interpretercore
details::ShareTensorsIntoScope(x, global_inner_scope);
details::ShareTensorsIntoScope(params, global_inner_scope);
if (interpreter_core->GetVariableScope()->GetMutableScope() !=
global_inner_scope) {
details::BuildScopeByBlock(
*interpreter_core.get(), *forward_global_block, global_inner_scope);
interpreter_core->reset_scope(global_inner_scope);
}
}
// interpretercore run
if (forward_global_block->OpSize() > 0) {
paddle::platform::RecordEvent record_event(
"interpreter_core_run",
paddle::platform::TracerEventType::UserDefined,
1);
interpreter_core->Run({});
}
{
paddle::platform::RecordEvent record_event(
"fetch_and_gc", paddle::platform::TracerEventType::UserDefined, 1);
// Get Output
details::ShareTensorsFromScopeWithPartialBlock(out,
*forward_global_block,
*backward_global_block,
global_inner_scope);
details::ShareTensorsFromScopeWithPartialBlock(dout,
*forward_global_block,
*backward_global_block,
global_inner_scope);
VLOG(3) << paddle::framework::GenScopeTreeDebugInfo(
out_scope_vec->front());
if (is_test || !egr::Controller::Instance().HasGrad()) {
VLOG(4) << "is test, set this scope can reused";
global_inner_scope->SetCanReuesd(true);
details::GcScope(global_inner_scope);
} else {
VLOG(4) << "not test, set this scope can not reused";
global_inner_scope->SetCanReuesd(false);
}
interpretercore_info_cache.UpdateSkipEagerDeleteVars(
program_id, false, skip_eager_delete_vars);
VLOG(2) << "Get skip GC vars size is: " << skip_eager_delete_vars.size();
} else {
paddle::platform::RecordEvent record_event(
"get_interpretercore_cahce",
paddle::platform::TracerEventType::UserDefined,
1);
VLOG(2) << "Get interpretercore cahce by program:" << program_id;
// Step 1. get cache interpretercore
auto &cached_value =
interpretercore_info_cache.GetMutable(program_id, /*is_grad=*/false);
interpreter_core = cached_value.core_;
// Step 2. update scope for cache interpretercore
details::ShareTensorsIntoScope(x, global_inner_scope);
details::ShareTensorsIntoScope(params, global_inner_scope);
if (interpreter_core->GetVariableScope()->GetMutableScope() !=
global_inner_scope) {
details::BuildScopeByBlock(
*interpreter_core.get(), *forward_global_block, global_inner_scope);
interpreter_core->reset_scope(global_inner_scope);
}
}
#ifdef PADDLE_WITH_MKLDNN
if (FLAGS_use_mkldnn) paddle::platform::DontClearMKLDNNCache(place);
#endif
} else {
VLOG(2) << "RunProgramOp execute with parallel_executor.";
// Step 2. prepare executor and init persistable variables
// NOTE(Aurelius84): While training some models, forward can be called many
// times and then apply backpropagation all at once, such as Reinforcement
// Learning. Tensor data in multi-step training should be saved into single
// scope separately. Otherwise, the gradients can be miscalculated because
// always using the Tensor data of the last step in forward.
paddle::framework::Scope *global_inner_scope = out_scope_vec->front();
VLOG(2) << "The number of sub scopes before forward: "
<< out_scope_vec->front()->kids().size();
paddle::framework::Scope &scope = global_inner_scope->NewScope();
// share input_vars & parameters into scope
details::ShareTensorsIntoScope(x, &scope);
details::ShareTensorsIntoScope(params, &scope);
const auto &place = egr::Controller::Instance().GetExpectedPlace();
auto *global_block = PADDLE_GET_CONST(paddle::framework::BlockDesc *,
attrs.at("global_block"));
auto start_op_index = PADDLE_GET_CONST(int64_t, attrs.at("start_op_index"));
auto end_op_index = PADDLE_GET_CONST(int64_t, attrs.at("end_op_index"));
if (end_op_index > start_op_index) {
auto input_names = details::GetTensorsName(x);
auto output_names = details::GetTensorsName(out);
auto dout_names = details::GetTensorsName(dout);
auto *program = global_block->Program();
auto cache_info =
paddle::framework::GetExecutorInfoFromCache(*program,
place,
start_op_index,
end_op_index,
/*is_grad=*/false,
program_id,
&scope);
auto &parallel_executor = cache_info.first;
// all out_vars are skip_eager_var
auto &skip_eager_delete_vars =
paddle::framework::ExecutorInfoCache::Instance().SkipEagerDeleteVars(
program_id, false);
if (cache_info.second /*is_new_created*/) {
parallel_executor->SkipMemoryReuse(/*scope_idx=*/0, input_names);
skip_eager_delete_vars.insert(skip_eager_delete_vars.end(),
output_names.begin(),
output_names.end());
skip_eager_delete_vars.insert(
skip_eager_delete_vars.end(), dout_names.begin(), dout_names.end());
paddle::framework::details::ParseSafeEagerDeletionSkipVars(
*program, end_op_index, output_names, &skip_eager_delete_vars);
}
// interpretercore run
if (forward_global_block->OpSize() > 0) {
paddle::platform::RecordEvent record_event(
"interpreter_core_run",
paddle::platform::TracerEventType::UserDefined,
1);
interpreter_core->Run({});
}
// Step 3. run ops
parallel_executor->RunWithoutFetch(skip_eager_delete_vars);
}
// Step 4. Get Output
details::ShareTensorsFromScope(out, *global_block, &scope);
details::ShareTensorsFromScope(dout, *global_block, &scope);
{
paddle::platform::RecordEvent record_event(
"fetch_and_gc", paddle::platform::TracerEventType::UserDefined, 1);
// Get Output
details::ShareTensorsFromScopeWithPartialBlock(
out, *forward_global_block, *backward_global_block, global_inner_scope);
details::ShareTensorsFromScopeWithPartialBlock(dout,
*forward_global_block,
*backward_global_block,
global_inner_scope);
// Debug info: scope info when run end
VLOG(3) << paddle::framework::GenScopeTreeDebugInfo(out_scope_vec->front());
// Step 5. Drop all children scopes while testing.
if (is_test || !egr::Controller::Instance().HasGrad()) {
out_scope_vec->front()->DropKids();
VLOG(4) << "is test, set this scope can reused";
global_inner_scope->SetCanReuesd(true);
details::GcScope(global_inner_scope);
} else {
VLOG(4) << "not test, set this scope can not reused";
global_inner_scope->SetCanReuesd(false);
}
VLOG(2) << "The number of sub scopes after forward: "
<< out_scope_vec->front()->kids().size();
}
#ifdef PADDLE_WITH_MKLDNN
if (FLAGS_use_mkldnn) paddle::platform::DontClearMKLDNNCache(place);
if (FLAGS_use_mkldnn) paddle::platform::DontClearMKLDNNCache(place);
#endif
}
}
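The forward path above keys its cache on (program_id, is_grad): each program_id owns one forward core (is_grad=false) and one backward core (is_grad=true), created lazily on the first miss and reused afterwards. A minimal, runnable sketch of that two-part-key cache, with hypothetical names standing in for InterpreterCoreInfoCache:

class CoreInfoCache:
    """Toy stand-in for InterpreterCoreInfoCache keyed by (program_id, is_grad)."""

    def __init__(self):
        self._cores = {}

    def has(self, program_id, is_grad):
        return (program_id, is_grad) in self._cores

    def get_or_create(self, program_id, is_grad, creator):
        key = (program_id, is_grad)
        if key not in self._cores:
            self._cores[key] = creator()   # cache miss: build a new core
        return self._cores[key]

cache = CoreInfoCache()
fwd = cache.get_or_create(42, False, lambda: object())  # forward core
bwd = cache.get_or_create(42, True, lambda: object())   # backward core, separate slot
assert fwd is not bwd
assert cache.get_or_create(42, False, lambda: object()) is fwd  # reused on later calls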
inline void RunProgramGradAPI(
......@@ -538,8 +457,6 @@ inline void RunProgramGradAPI(
// if all output vars are set to stop_gradient, grad op no need to executed
if (x_grad.empty() && params_grad.empty()) return;
bool use_interpretorcore =
PADDLE_GET_CONST(bool, attrs.at("use_interpretorcore"));
auto program_id = PADDLE_GET_CONST(int64_t, attrs.at("program_id"));
auto *out_scope_vec = &step_scope;
......@@ -550,196 +467,111 @@ inline void RunProgramGradAPI(
"The OutScope of RunProgramGradOp should only hold one scope."));
auto place = egr::Controller::Instance().GetExpectedPlace();
if (use_interpretorcore) {
VLOG(2) << "RunProgramGradOp use interpretercore to execute program.";
paddle::framework::Scope *global_inner_scope = out_scope_vec->front();
auto *forward_global_block = PADDLE_GET_CONST(
paddle::framework::BlockDesc *, attrs.at("forward_global_block"));
auto *backward_global_block = PADDLE_GET_CONST(
paddle::framework::BlockDesc *, attrs.at("backward_global_block"));
auto *backward_program = backward_global_block->Program();
auto out_grad_names = details::GetTensorsName(out_grad);
auto &interpretercore_info_cache =
paddle::framework::InterpreterCoreInfoCache::Instance();
std::shared_ptr<paddle::framework::InterpreterCore> interpreter_core =
nullptr;
if (!interpretercore_info_cache.Has(program_id, /*is_grad=*/true)) {
paddle::platform::RecordEvent record_event(
"create_new_interpretercore",
paddle::platform::TracerEventType::UserDefined,
1);
VLOG(2) << "No interpretercore cahce, so create a new interpretercore";
details::ShareTensorsIntoScope(out_grad, global_inner_scope);
interpreter_core = paddle::framework::CreateInterpreterCoreInfoToCache(
*backward_program,
place,
/*is_grad=*/true,
program_id,
global_inner_scope);
// share threadpool
// NOTE(zhiqiu): this only works interpreter_core is executed strictly
// after the related fwd_interpreter_core.
if (interpretercore_info_cache.Has(program_id, false)) {
auto fwd_interpreter_core =
interpretercore_info_cache.GetMutable(program_id, /*is_grad=*/false)
.core_;
interpreter_core->ShareWorkQueueFrom(fwd_interpreter_core);
VLOG(4) << "Share workqueue from " << fwd_interpreter_core.get()
<< " to " << interpreter_core.get();
}
std::vector<std::string> x_grad_names;
std::vector<std::string> param_grad_names;
if (!x_grad.empty()) {
x_grad_names = details::GetTensorsName(x_grad);
}
if (!params_grad.empty()) {
param_grad_names = details::GetTensorsName(params_grad);
}
// get all eager gc vars
std::set<std::string> skip_eager_delete_vars;
// all out_vars are skip_eager_var
skip_eager_delete_vars.insert(x_grad_names.begin(), x_grad_names.end());
// initialize skip gc vars by forward_program and backward_program
paddle::framework::details::AppendSkipDeletionVars(
param_grad_names, &skip_eager_delete_vars);
interpreter_core->SetSkipGcVars(skip_eager_delete_vars);
interpretercore_info_cache.UpdateSkipEagerDeleteVars(
program_id, /*is_grad=*/true, skip_eager_delete_vars);
VLOG(2) << "Get skip GC vars size is: " << skip_eager_delete_vars.size();
} else {
paddle::platform::RecordEvent record_event(
"get_interpretercore_cahce",
paddle::platform::TracerEventType::UserDefined,
1);
VLOG(2) << "Get interpretercore cahce by program:" << program_id;
auto &cached_value =
interpretercore_info_cache.GetMutable(program_id, /*is_grad=*/true);
interpreter_core = cached_value.core_;
// update scope
details::ShareTensorsIntoScope(out_grad, global_inner_scope);
if (interpreter_core->GetVariableScope()->GetMutableScope() !=
global_inner_scope) {
details::BuildScopeByBlock(*interpreter_core.get(),
*backward_global_block,
global_inner_scope);
interpreter_core->reset_scope(global_inner_scope);
}
VLOG(2) << "RunProgramGradOp use interpretercore to execute program.";
paddle::framework::Scope *global_inner_scope = out_scope_vec->front();
auto *forward_global_block = PADDLE_GET_CONST(
paddle::framework::BlockDesc *, attrs.at("forward_global_block"));
auto *backward_global_block = PADDLE_GET_CONST(
paddle::framework::BlockDesc *, attrs.at("backward_global_block"));
auto *backward_program = backward_global_block->Program();
auto out_grad_names = details::GetTensorsName(out_grad);
auto &interpretercore_info_cache =
paddle::framework::InterpreterCoreInfoCache::Instance();
std::shared_ptr<paddle::framework::InterpreterCore> interpreter_core =
nullptr;
if (!interpretercore_info_cache.Has(program_id, /*is_grad=*/true)) {
paddle::platform::RecordEvent record_event(
"create_new_interpretercore",
paddle::platform::TracerEventType::UserDefined,
1);
VLOG(2) << "No interpretercore cahce, so create a new interpretercore";
details::ShareTensorsIntoScope(out_grad, global_inner_scope);
interpreter_core =
paddle::framework::CreateInterpreterCoreInfoToCache(*backward_program,
place,
/*is_grad=*/true,
program_id,
global_inner_scope);
// share threadpool
// NOTE(zhiqiu): this only works interpreter_core is executed strictly
// after the related fwd_interpreter_core.
if (interpretercore_info_cache.Has(program_id, false)) {
auto fwd_interpreter_core =
interpretercore_info_cache.GetMutable(program_id, /*is_grad=*/false)
.core_;
interpreter_core->ShareWorkQueueFrom(fwd_interpreter_core);
VLOG(4) << "Share workqueue from " << fwd_interpreter_core.get() << " to "
<< interpreter_core.get();
}
if (backward_global_block->OpSize() > 0) {
paddle::platform::RecordEvent record_event(
"interpreter_core_run",
paddle::platform::TracerEventType::UserDefined,
1);
// Debug info: scope info when run end
VLOG(3) << paddle::framework::GenScopeTreeDebugInfo(
out_scope_vec->front());
interpreter_core->Run({});
std::vector<std::string> x_grad_names;
std::vector<std::string> param_grad_names;
if (!x_grad.empty()) {
x_grad_names = details::GetTensorsName(x_grad);
}
{
paddle::platform::RecordEvent record_event(
"fetch_and_gc", paddle::platform::TracerEventType::UserDefined, 1);
// Step 4. get outputs
details::ShareTensorsFromScopeWithPartialBlock(x_grad,
*forward_global_block,
*backward_global_block,
global_inner_scope);
details::ShareTensorsFromScopeWithPartialBlock(params_grad,
*forward_global_block,
*backward_global_block,
global_inner_scope);
VLOG(4) << "after backward gc all vars";
global_inner_scope->SetCanReuesd(true);
details::GcScope(global_inner_scope);
if (!params_grad.empty()) {
param_grad_names = details::GetTensorsName(params_grad);
}
// get all eager gc vars
std::set<std::string> skip_eager_delete_vars;
// all out_vars are skip_eager_var
skip_eager_delete_vars.insert(x_grad_names.begin(), x_grad_names.end());
// initialize skip gc vars by forward_program and backward_program
paddle::framework::details::AppendSkipDeletionVars(param_grad_names,
&skip_eager_delete_vars);
interpreter_core->SetSkipGcVars(skip_eager_delete_vars);
interpretercore_info_cache.UpdateSkipEagerDeleteVars(
program_id, /*is_grad=*/true, skip_eager_delete_vars);
VLOG(2) << "Get skip GC vars size is: " << skip_eager_delete_vars.size();
} else {
VLOG(2) << "RunProgramGradOp use pe to execute program.";
paddle::framework::Scope *global_inner_scope = out_scope_vec->front();
auto sub_scope_num = global_inner_scope->kids().size();
VLOG(2) << "The number of sub scopes before backward: " << sub_scope_num;
PADDLE_ENFORCE_GT(sub_scope_num,
0,
paddle::platform::errors::InvalidArgument(
"The OutScope of RunProgramGradOp should hold at "
"least one sub scope."));
auto &scope = *(global_inner_scope->kids().front());
auto *global_block = PADDLE_GET_CONST(paddle::framework::BlockDesc *,
attrs.at("global_block"));
auto orig_end_op_index =
PADDLE_GET_CONST(int64_t, attrs.at("end_op_index"));
// NOTE: skip `shape` and `fill_constant` op created by
// fluid.backward.gradients, one forward output will generate one `shape`
// and `fill_constant`
int64_t start_op_index = orig_end_op_index + (out_grad.size() * 2);
int64_t end_op_index = global_block->OpSize();
if (end_op_index > start_op_index) {
auto out_grad_names = details::GetTensorsName(out_grad);
// Step 2. prepare executor and scope
auto *program = global_block->Program();
auto cache_info =
paddle::framework::GetExecutorInfoFromCache(*program,
place,
start_op_index,
end_op_index,
/*is_grad*/ true,
program_id,
&scope);
auto &parallel_executor = cache_info.first;
auto &skip_eager_delete_vars =
paddle::framework::ExecutorInfoCache::Instance().SkipEagerDeleteVars(
program_id, true);
if (cache_info.second /*is_new_created*/) {
parallel_executor->SkipMemoryReuse(/*scope_idx=*/0, out_grad_names);
// NOTE: after PR22939 [Add double grad] merged, the grad op maker's
// SetOutput will set to None if the input var stop_gradient=True,
// it will cause an NotFound error when ctx.OutputNames() is called
std::vector<std::string> x_grad_names;
std::vector<std::string> param_grad_names;
if (!x_grad.empty()) {
x_grad_names = details::GetTensorsName(x_grad);
}
if (!params_grad.empty()) {
param_grad_names = details::GetTensorsName(params_grad);
}
skip_eager_delete_vars.insert(skip_eager_delete_vars.end(),
x_grad_names.begin(),
x_grad_names.end());
paddle::framework::details::AppendSkipDeletionVars(
param_grad_names, &skip_eager_delete_vars);
}
details::ShareTensorsIntoScope(out_grad, &scope);
// Debug info: scope info when run end
VLOG(3) << paddle::framework::GenScopeTreeDebugInfo(
out_scope_vec->front());
// Step 3. run ops
parallel_executor->RunWithoutFetch(
/*skip_eager_delete_vars=*/skip_eager_delete_vars);
paddle::platform::RecordEvent record_event(
"get_interpretercore_cahce",
paddle::platform::TracerEventType::UserDefined,
1);
VLOG(2) << "Get interpretercore cahce by program:" << program_id;
auto &cached_value =
interpretercore_info_cache.GetMutable(program_id, /*is_grad=*/true);
interpreter_core = cached_value.core_;
// update scope
details::ShareTensorsIntoScope(out_grad, global_inner_scope);
if (interpreter_core->GetVariableScope()->GetMutableScope() !=
global_inner_scope) {
details::BuildScopeByBlock(
*interpreter_core.get(), *backward_global_block, global_inner_scope);
interpreter_core->reset_scope(global_inner_scope);
}
}
// Step 4. get outputs
details::ShareTensorsFromScope(x_grad, *global_block, &scope);
details::ShareTensorsFromScope(params_grad, *global_block, &scope);
if (backward_global_block->OpSize() > 0) {
paddle::platform::RecordEvent record_event(
"interpreter_core_run",
paddle::platform::TracerEventType::UserDefined,
1);
// Debug info: scope info when run end
VLOG(3) << paddle::framework::GenScopeTreeDebugInfo(out_scope_vec->front());
interpreter_core->Run({});
}
// Step5. drop current scope
global_inner_scope->DeleteScope(&scope);
VLOG(2) << "The number of sub scopes after backward: "
<< global_inner_scope->kids().size();
{
paddle::platform::RecordEvent record_event(
"fetch_and_gc", paddle::platform::TracerEventType::UserDefined, 1);
// Step 4. get outputs
details::ShareTensorsFromScopeWithPartialBlock(x_grad,
*forward_global_block,
*backward_global_block,
global_inner_scope);
details::ShareTensorsFromScopeWithPartialBlock(params_grad,
*forward_global_block,
*backward_global_block,
global_inner_scope);
VLOG(4) << "after backward gc all vars";
global_inner_scope->SetCanReuesd(true);
details::GcScope(global_inner_scope);
}
}
......
......@@ -23,10 +23,6 @@ from paddle.fluid import backward, core, framework, program_guard
from paddle.fluid.compiler import BuildStrategy
from paddle.fluid.dygraph import layers
from paddle.fluid.dygraph.base import switch_to_static_graph
from paddle.fluid.executor import (
_is_dy2st_enable_standalone_executor,
_is_enable_standalone_executor,
)
from paddle.fluid.framework import _apply_pass
from paddle.fluid.layers.utils import _hash_with_id, flatten, pack_sequence_as
......@@ -128,14 +124,26 @@ class ProgramInfo:
A helper class to recoder Program information
"""
def __init__(self, mode='infer'):
def __init__(self):
self.op_size = {
'fp32': -1,
'amp': -1,
'fp16': -1,
}
assert mode in ['train', 'infer']
self.mode = mode
self.programs = {}
self.mode = "infer"
def __call__(self, key, prog_creator):
"""
Recoder infer program and op size.
"""
assert key in ['fp32', 'amp', 'fp16']
if key not in self.programs:
infer_prog = prog_creator(is_infer_mode=True)
self.programs[key] = infer_prog
self.op_size[key] = infer_prog.desc.block(0).op_size()
return self.programs[key], self.op_size[key]
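The refactored ProgramInfo is a small per-precision memo table: the first call for a key ('fp32', 'amp', or 'fp16') builds the infer program through the supplied creator and records its op count, and later calls return the cached pair, so the _infer_* properties and the _create_forward_backward_train_* methods share one source of truth for forward_end_op_index. A standalone sketch of the same pattern, with hypothetical names and a plain list standing in for the program desc:

class InferProgramCache:
    """Toy analogue of ProgramInfo: build once per key, remember the op count."""

    def __init__(self):
        self.op_size = {'fp32': -1, 'amp': -1, 'fp16': -1}
        self.programs = {}

    def __call__(self, key, prog_creator):
        assert key in ('fp32', 'amp', 'fp16')
        if key not in self.programs:
            prog = prog_creator()            # built only on the first request
            self.programs[key] = prog
            self.op_size[key] = len(prog)    # stand-in for desc.block(0).op_size()
        return self.programs[key], self.op_size[key]

cache = InferProgramCache()
prog, size = cache('fp32', lambda: ['matmul', 'relu'])         # creates the entry
same, _ = cache('fp32', lambda: ['creator', 'not', 'called'])  # cache hit
assert prog is same and size == 2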
class PartialProgramLayer:
......@@ -176,7 +184,7 @@ class PartialProgramLayer:
self._cuda_graph_pool_id = 0
# Set default mode to train
self.training = True
self._infer_info = ProgramInfo(mode='infer')
self._infer_info = ProgramInfo()
custom_white_list, custom_black_list = None, None
tracer = framework._dygraph_tracer()
......@@ -191,6 +199,28 @@ class PartialProgramLayer:
# program_id -> list(scope)
self._scope_cache = {}
def __call__(self, inputs):
"""
Execute static graph by Interpreter and Return dynamic Tensors.
"""
in_vars, out_vars = self._prepare(inputs)
self._cast_fp16_if_pure_fp16(in_vars)
attrs = self._prepare_attributes()
_legacy_C_ops.run_program(
self._valid_vars(in_vars),
self._valid_vars(self._params),
self._valid_vars(out_vars),
self._create_scope_vec(
program_id=self.program_id, use_scope_cache=True
),
self._double_grads,
self._cuda_graph_vec,
*attrs
)
restored_nest_out = self._restore_out(out_vars)
return self._remove_no_value(restored_nest_out)
def _get_scope(self, program_id=None, use_scope_cache=False):
if use_scope_cache:
if program_id not in self._scope_cache:
......@@ -259,8 +289,9 @@ class PartialProgramLayer:
@switch_to_static_graph
def _create_forward_backward_train_program(self):
whole_program = self._train_program
forward_end_op_index = self._infer_info.op_size['fp32']
_, forward_end_op_index = self._infer_info('fp32', self._create_program)
assert forward_end_op_index >= 0
return self._get_forward_backward_program_form(
whole_program, forward_end_op_index
)
......@@ -268,8 +299,11 @@ class PartialProgramLayer:
@switch_to_static_graph
def _create_forward_backward_train_amp_program(self):
whole_program = self._train_amp_program
forward_end_op_index = self._infer_info.op_size['amp']
_, forward_end_op_index = self._infer_info(
'amp', self._create_amp_program
)
assert forward_end_op_index >= 0
return self._get_forward_backward_program_form(
whole_program, forward_end_op_index
)
......@@ -277,8 +311,11 @@ class PartialProgramLayer:
@switch_to_static_graph
def _create_forward_backward_train_pure_fp16_program(self):
whole_program = self._train_pure_fp16_program
forward_end_op_index = self._infer_info.op_size['fp16']
_, forward_end_op_index = self._infer_info(
'fp16', self._create_pure_fp16_program
)
assert forward_end_op_index >= 0
return self._get_forward_backward_program_form(
whole_program, forward_end_op_index
)
......@@ -289,11 +326,8 @@ class PartialProgramLayer:
@LazyInitialized
def _infer_program(self):
program = self._create_program(is_infer_mode=True)
self._infer_info.op_size['fp32'] = program.desc.block(0).op_size()
return self._build_infer_program(
program, self._infer_info.op_size['fp32']
)
program, op_size = self._infer_info('fp32', self._create_program)
return self._build_infer_program(program, op_size)
@LazyInitialized
def _train_amp_program(self):
......@@ -301,11 +335,8 @@ class PartialProgramLayer:
@LazyInitialized
def _infer_amp_program(self):
program = self._create_amp_program(is_infer_mode=True)
self._infer_info.op_size['amp'] = program.desc.block(0).op_size()
return self._build_infer_program(
program, self._infer_info.op_size['amp']
)
program, op_size = self._infer_info('amp', self._create_amp_program)
return self._build_infer_program(program, op_size)
@LazyInitialized
def _train_pure_fp16_program(self):
......@@ -313,11 +344,10 @@ class PartialProgramLayer:
@LazyInitialized
def _infer_pure_fp16_program(self):
program = self._create_pure_fp16_program(is_infer_mode=True)
self._infer_info.op_size['fp16'] = program.desc.block(0).op_size()
return self._build_infer_program(
program, self._infer_info.op_size['fp16']
program, op_size = self._infer_info(
'fp16', self._create_pure_fp16_program
)
return self._build_infer_program(program, op_size)
@LazyInitialized
def _train_forward_backward_program(self):
......@@ -632,27 +662,24 @@ class PartialProgramLayer:
double_grads.append(var_base)
return self._valid_vars(double_grads)
def _get_end_op_index(self):
if _in_amp_guard():
infer_program = self._infer_amp_program
elif _in_pure_fp16_guard():
infer_program = self._infer_pure_fp16_program
else:
infer_program = self._infer_program
return infer_program.desc.block(0).op_size()
def __call__(self, inputs):
in_vars, out_vars = self._prepare(inputs)
self._cast_fp16_if_pure_fp16(in_vars)
def _cast_fp16_if_pure_fp16(self, in_vars):
if _in_pure_fp16_guard():
for i, var in enumerate(in_vars):
name = var.name
if (
self.program.global_block().has_var(name)
and self.program.global_block().var(name).dtype
== paddle.float16
):
in_vars[i] = var.astype('float16')
in_vars[i].name = name
def _prepare_attributes(self):
attrs = [
'global_block',
self.program.desc.block(0),
'start_op_index',
0,
'end_op_index',
self._get_end_op_index(),
'forward_global_block',
self.forward_program.desc.block(0),
'backward_global_block',
self.backward_program.desc.block(0),
'is_test',
not self.training,
'program_id',
......@@ -679,57 +706,7 @@ class PartialProgramLayer:
self._cuda_graph_pool_id,
)
)
use_interpretorcore = (
_is_enable_standalone_executor()
and _is_dy2st_enable_standalone_executor()
)
attrs.extend(('use_interpretorcore', use_interpretorcore))
if use_interpretorcore:
attrs.extend(
(
'forward_global_block',
self.forward_program.desc.block(0),
'backward_global_block',
self.backward_program.desc.block(0),
)
)
_legacy_C_ops.run_program(
self._valid_vars(in_vars),
self._valid_vars(self._params),
self._valid_vars(out_vars),
self._create_scope_vec(
program_id=self.program_id, use_scope_cache=True
),
self._double_grads,
self._cuda_graph_vec,
*attrs
)
else:
_legacy_C_ops.run_program(
self._valid_vars(in_vars),
self._valid_vars(self._params),
self._valid_vars(out_vars),
self._create_scope_vec(),
self._double_grads,
self._cuda_graph_vec,
*attrs
)
restored_nest_out = self._restore_out(out_vars)
return self._remove_no_value(restored_nest_out)
def _cast_fp16_if_pure_fp16(self, in_vars):
if _in_pure_fp16_guard():
for i, var in enumerate(in_vars):
name = var.name
if (
self.program.global_block().has_var(name)
and self.program.global_block().var(name).dtype
== paddle.float16
):
in_vars[i] = var.astype('float16')
in_vars[i].name = name
return attrs
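With the PE branch removed, _prepare_attributes always returns a single flat, alternating key/value list (forward_global_block, backward_global_block, is_test, program_id, plus optional CUDA-graph fields) that is splatted into _legacy_C_ops.run_program as *attrs and read on the C++ side via attrs.at("name"). A toy illustration of that flat layout, with hypothetical placeholder values:

# Hypothetical placeholder values; the real entries are BlockDesc objects
# and the ids built above.
attrs = [
    'forward_global_block', '<forward BlockDesc>',
    'backward_global_block', '<backward BlockDesc>',
    'is_test', False,
    'program_id', 140234871234,
]
as_dict = dict(zip(attrs[0::2], attrs[1::2]))  # how the flat list pairs up
assert as_dict['is_test'] is False
assert 'use_interpretorcore' not in as_dict    # flag removed by this commit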
@switch_to_static_graph
def _build_infer_program(self, infer_program, forward_end_op_index):
......