From 4230bd87ff0ac843851f31cd849b13aa4068e9b2 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Fri, 17 Feb 2023 10:27:41 +0800 Subject: [PATCH] [Dy2St]Remove PE logic in @to_static (#50512) * [Dy2St]Remove PE logic in @to_static * fix typo * fix infer_program * fix typo * fix op_size --- .../eager/to_static/run_program_op_func.h | 27 +- .../eager/to_static/run_program_op_node.h | 600 +++++++----------- .../paddle/jit/dy2static/partial_program.py | 167 +++-- 3 files changed, 297 insertions(+), 497 deletions(-) diff --git a/paddle/fluid/eager/to_static/run_program_op_func.h b/paddle/fluid/eager/to_static/run_program_op_func.h index 7305e79cd73..6bbf62ea6c0 100644 --- a/paddle/fluid/eager/to_static/run_program_op_func.h +++ b/paddle/fluid/eager/to_static/run_program_op_func.h @@ -96,25 +96,16 @@ inline void run_program_ad_func( grad_node->SetGradOutMeta(x, /*slot id*/ 0); grad_node->SetGradOutMeta(params, /*slot id*/ 1); - bool use_interpretorcore = - PADDLE_GET_CONST(bool, attrs.at("use_interpretorcore")); VLOG(2) << "clear_no_grad_edges."; - if (use_interpretorcore) { - auto* forward_global_block = PADDLE_GET_CONST( - paddle::framework::BlockDesc*, attrs.at("forward_global_block")); - auto* backward_global_block = PADDLE_GET_CONST( - paddle::framework::BlockDesc*, attrs.at("backward_global_block")); - clear_no_grad_edges_with_partial_block(params, - forward_global_block, - backward_global_block, - grad_node.get(), - /*slot id*/ 1); - - } else { - auto* global_block = PADDLE_GET_CONST(paddle::framework::BlockDesc*, - attrs.at("global_block")); - clear_no_grad_edges(params, global_block, grad_node.get(), /*slot id*/ 1); - } + auto* forward_global_block = PADDLE_GET_CONST( + paddle::framework::BlockDesc*, attrs.at("forward_global_block")); + auto* backward_global_block = PADDLE_GET_CONST( + paddle::framework::BlockDesc*, attrs.at("backward_global_block")); + clear_no_grad_edges_with_partial_block(params, + forward_global_block, + backward_global_block, + grad_node.get(), + /*slot id*/ 1); grad_node->SetGradInMeta(deref_out, 0); diff --git a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h index a5936989631..fd1f5acdf21 100644 --- a/paddle/fluid/eager/to_static/run_program_op_node.h +++ b/paddle/fluid/eager/to_static/run_program_op_node.h @@ -304,226 +304,145 @@ inline void RunProgramAPI( paddle::platform::errors::InvalidArgument( "The OutScope of RunProgramGradOp should only hold one scope.")); - bool use_interpretorcore = - PADDLE_GET_CONST(bool, attrs.at("use_interpretorcore")); + VLOG(2) << "RunProgramOp use interpretercore to execute program."; - if (use_interpretorcore) { - VLOG(2) << "RunProgramOp use interpretercore to execute program."; + paddle::framework::Scope *global_inner_scope = out_scope_vec->front(); - paddle::framework::Scope *global_inner_scope = out_scope_vec->front(); + auto input_names = details::GetTensorsName(x); + auto output_names = details::GetTensorsName(out); + auto dout_names = details::GetTensorsName(dout); - auto input_names = details::GetTensorsName(x); - auto output_names = details::GetTensorsName(out); - auto dout_names = details::GetTensorsName(dout); + if (VLOG_IS_ON(6)) { + std::stringstream s; + s << "input_names: "; + for (auto name : input_names) { + s << name << " "; + } + s << std::endl; + s << "output_names: "; + for (auto name : output_names) { + s << name << " "; + } + s << std::endl; + s << "dout_names: "; + for (auto name : dout_names) { + s << name << " "; + } + s << std::endl; + VLOG(6) << 
s.str(); + } + + auto *forward_global_block = PADDLE_GET_CONST( + paddle::framework::BlockDesc *, attrs.at("forward_global_block")); + auto *backward_global_block = PADDLE_GET_CONST( + paddle::framework::BlockDesc *, attrs.at("backward_global_block")); + auto *forward_program = forward_global_block->Program(); + auto *backward_program = backward_global_block->Program(); + + auto &interpretercore_info_cache = + paddle::framework::InterpreterCoreInfoCache::Instance(); + std::shared_ptr interpreter_core = + nullptr; + if (!interpretercore_info_cache.Has(program_id, /*is_grad=*/false)) { + paddle::platform::RecordEvent record_event( + "create_new_interpretercore", + paddle::platform::TracerEventType::UserDefined, + 1); + VLOG(2) << "No interpretercore cahce, so create a new interpretercore " + "for program: " + << program_id; + // Step 1. share input_vars & parameters into scope + details::ShareTensorsIntoScope(x, global_inner_scope); + details::ShareTensorsIntoScope(params, global_inner_scope); + // Step 2. create new interpretercore + interpreter_core = + paddle::framework::CreateInterpreterCoreInfoToCache(*forward_program, + place, + /*is_grad=*/false, + program_id, + global_inner_scope); + // Step 3. get all eager gc vars + std::set skip_eager_delete_vars = + paddle::framework::details::ParseSafeEagerDeletionSkipVarsSet( + *backward_program); + // all out_vars are skip_eager_var + skip_eager_delete_vars.insert(output_names.begin(), output_names.end()); + skip_eager_delete_vars.insert(dout_names.begin(), dout_names.end()); + // update interpretercore skip_gc_var + interpreter_core->SetSkipGcVars(skip_eager_delete_vars); + + std::set input_vars; + input_vars.insert(input_names.begin(), input_names.end()); + interpreter_core->SetJitInputVars(input_vars); if (VLOG_IS_ON(6)) { std::stringstream s; - s << "input_names: "; - for (auto name : input_names) { + s << "skip_eager_delete_vars: "; + for (auto name : skip_eager_delete_vars) { s << name << " "; } - s << std::endl; - s << "output_names: "; - for (auto name : output_names) { - s << name << " "; - } - s << std::endl; - s << "dout_names: "; - for (auto name : dout_names) { - s << name << " "; - } - s << std::endl; VLOG(6) << s.str(); } - auto *forward_global_block = PADDLE_GET_CONST( - paddle::framework::BlockDesc *, attrs.at("forward_global_block")); - auto *backward_global_block = PADDLE_GET_CONST( - paddle::framework::BlockDesc *, attrs.at("backward_global_block")); - auto *forward_program = forward_global_block->Program(); - auto *backward_program = backward_global_block->Program(); - - auto &interpretercore_info_cache = - paddle::framework::InterpreterCoreInfoCache::Instance(); - std::shared_ptr interpreter_core = - nullptr; - if (!interpretercore_info_cache.Has(program_id, /*is_grad=*/false)) { - paddle::platform::RecordEvent record_event( - "create_new_interpretercore", - paddle::platform::TracerEventType::UserDefined, - 1); - VLOG(2) << "No interpretercore cahce, so create a new interpretercore " - "for program: " - << program_id; - // Step 1. share input_vars & parameters into scope - details::ShareTensorsIntoScope(x, global_inner_scope); - details::ShareTensorsIntoScope(params, global_inner_scope); - // Step 2. create new interpretercore - interpreter_core = paddle::framework::CreateInterpreterCoreInfoToCache( - *forward_program, - place, - /*is_grad=*/false, - program_id, - global_inner_scope); - // Step 3. 
get all eager gc vars - std::set skip_eager_delete_vars = - paddle::framework::details::ParseSafeEagerDeletionSkipVarsSet( - *backward_program); - // all out_vars are skip_eager_var - skip_eager_delete_vars.insert(output_names.begin(), output_names.end()); - skip_eager_delete_vars.insert(dout_names.begin(), dout_names.end()); - // update interpretercore skip_gc_var - interpreter_core->SetSkipGcVars(skip_eager_delete_vars); - - std::set input_vars; - input_vars.insert(input_names.begin(), input_names.end()); - interpreter_core->SetJitInputVars(input_vars); - - if (VLOG_IS_ON(6)) { - std::stringstream s; - s << "skip_eager_delete_vars: "; - for (auto name : skip_eager_delete_vars) { - s << name << " "; - } - VLOG(6) << s.str(); - } - - interpretercore_info_cache.UpdateSkipEagerDeleteVars( - program_id, false, skip_eager_delete_vars); - VLOG(2) << "Get skip GC vars size is: " << skip_eager_delete_vars.size(); - } else { - paddle::platform::RecordEvent record_event( - "get_interpretercore_cahce", - paddle::platform::TracerEventType::UserDefined, - 1); - VLOG(2) << "Get interpretercore cahce by program:" << program_id; - // Step 1. get cache interpretercore - auto &cached_value = - interpretercore_info_cache.GetMutable(program_id, /*is_grad=*/false); - interpreter_core = cached_value.core_; - // Step 2. update scope for cache interpretercore - details::ShareTensorsIntoScope(x, global_inner_scope); - details::ShareTensorsIntoScope(params, global_inner_scope); - if (interpreter_core->GetVariableScope()->GetMutableScope() != - global_inner_scope) { - details::BuildScopeByBlock( - *interpreter_core.get(), *forward_global_block, global_inner_scope); - interpreter_core->reset_scope(global_inner_scope); - } - } - - // interpretercore run - if (forward_global_block->OpSize() > 0) { - paddle::platform::RecordEvent record_event( - "interpreter_core_run", - paddle::platform::TracerEventType::UserDefined, - 1); - interpreter_core->Run({}); - } - - { - paddle::platform::RecordEvent record_event( - "fetch_and_gc", paddle::platform::TracerEventType::UserDefined, 1); - // Get Output - details::ShareTensorsFromScopeWithPartialBlock(out, - *forward_global_block, - *backward_global_block, - global_inner_scope); - details::ShareTensorsFromScopeWithPartialBlock(dout, - *forward_global_block, - *backward_global_block, - global_inner_scope); - - VLOG(3) << paddle::framework::GenScopeTreeDebugInfo( - out_scope_vec->front()); - - if (is_test || !egr::Controller::Instance().HasGrad()) { - VLOG(4) << "is test, set this scope can reused"; - global_inner_scope->SetCanReuesd(true); - details::GcScope(global_inner_scope); - } else { - VLOG(4) << "not test, set this scope can not reused"; - global_inner_scope->SetCanReuesd(false); - } + interpretercore_info_cache.UpdateSkipEagerDeleteVars( + program_id, false, skip_eager_delete_vars); + VLOG(2) << "Get skip GC vars size is: " << skip_eager_delete_vars.size(); + } else { + paddle::platform::RecordEvent record_event( + "get_interpretercore_cahce", + paddle::platform::TracerEventType::UserDefined, + 1); + VLOG(2) << "Get interpretercore cahce by program:" << program_id; + // Step 1. get cache interpretercore + auto &cached_value = + interpretercore_info_cache.GetMutable(program_id, /*is_grad=*/false); + interpreter_core = cached_value.core_; + // Step 2. 
update scope for cache interpretercore + details::ShareTensorsIntoScope(x, global_inner_scope); + details::ShareTensorsIntoScope(params, global_inner_scope); + if (interpreter_core->GetVariableScope()->GetMutableScope() != + global_inner_scope) { + details::BuildScopeByBlock( + *interpreter_core.get(), *forward_global_block, global_inner_scope); + interpreter_core->reset_scope(global_inner_scope); } + } -#ifdef PADDLE_WITH_MKLDNN - if (FLAGS_use_mkldnn) paddle::platform::DontClearMKLDNNCache(place); -#endif - } else { - VLOG(2) << "RunProgramOp execute with parallel_executor."; - - // Step 2. prepare executor and init persistable variables - // NOTE(Aurelius84): While training some models, forward can be called many - // times and then apply backpropagation all at once, such as Reinforcement - // Learning. Tensor data in multi-step training should be saved into single - // scope separately. Otherwise, the gradients can be miscalculated because - // always using the Tensor data of the last step in forward. - paddle::framework::Scope *global_inner_scope = out_scope_vec->front(); - VLOG(2) << "The number of sub scopes before forward: " - << out_scope_vec->front()->kids().size(); - paddle::framework::Scope &scope = global_inner_scope->NewScope(); - - // share input_vars & parameters into scope - details::ShareTensorsIntoScope(x, &scope); - details::ShareTensorsIntoScope(params, &scope); - - const auto &place = egr::Controller::Instance().GetExpectedPlace(); - - auto *global_block = PADDLE_GET_CONST(paddle::framework::BlockDesc *, - attrs.at("global_block")); - auto start_op_index = PADDLE_GET_CONST(int64_t, attrs.at("start_op_index")); - auto end_op_index = PADDLE_GET_CONST(int64_t, attrs.at("end_op_index")); - - if (end_op_index > start_op_index) { - auto input_names = details::GetTensorsName(x); - auto output_names = details::GetTensorsName(out); - auto dout_names = details::GetTensorsName(dout); - auto *program = global_block->Program(); - - auto cache_info = - paddle::framework::GetExecutorInfoFromCache(*program, - place, - start_op_index, - end_op_index, - /*is_grad=*/false, - program_id, - &scope); - auto ¶llel_executor = cache_info.first; - // all out_vars are skip_eager_var - auto &skip_eager_delete_vars = - paddle::framework::ExecutorInfoCache::Instance().SkipEagerDeleteVars( - program_id, false); - if (cache_info.second /*is_new_created*/) { - parallel_executor->SkipMemoryReuse(/*scope_idx=*/0, input_names); - skip_eager_delete_vars.insert(skip_eager_delete_vars.end(), - output_names.begin(), - output_names.end()); - skip_eager_delete_vars.insert( - skip_eager_delete_vars.end(), dout_names.begin(), dout_names.end()); - paddle::framework::details::ParseSafeEagerDeletionSkipVars( - *program, end_op_index, output_names, &skip_eager_delete_vars); - } + // interpretercore run + if (forward_global_block->OpSize() > 0) { + paddle::platform::RecordEvent record_event( + "interpreter_core_run", + paddle::platform::TracerEventType::UserDefined, + 1); + interpreter_core->Run({}); + } - // Step 3. run ops - parallel_executor->RunWithoutFetch(skip_eager_delete_vars); - } - // Step 4. 
Get Output - details::ShareTensorsFromScope(out, *global_block, &scope); - details::ShareTensorsFromScope(dout, *global_block, &scope); + { + paddle::platform::RecordEvent record_event( + "fetch_and_gc", paddle::platform::TracerEventType::UserDefined, 1); + // Get Output + details::ShareTensorsFromScopeWithPartialBlock( + out, *forward_global_block, *backward_global_block, global_inner_scope); + details::ShareTensorsFromScopeWithPartialBlock(dout, + *forward_global_block, + *backward_global_block, + global_inner_scope); - // Debug info: scope info when run end VLOG(3) << paddle::framework::GenScopeTreeDebugInfo(out_scope_vec->front()); - // Step 5. Drop all children scopes while testing. + if (is_test || !egr::Controller::Instance().HasGrad()) { - out_scope_vec->front()->DropKids(); + VLOG(4) << "is test, set this scope can reused"; + global_inner_scope->SetCanReuesd(true); + details::GcScope(global_inner_scope); + } else { + VLOG(4) << "not test, set this scope can not reused"; + global_inner_scope->SetCanReuesd(false); } - VLOG(2) << "The number of sub scopes after forward: " - << out_scope_vec->front()->kids().size(); + } + #ifdef PADDLE_WITH_MKLDNN - if (FLAGS_use_mkldnn) paddle::platform::DontClearMKLDNNCache(place); + if (FLAGS_use_mkldnn) paddle::platform::DontClearMKLDNNCache(place); #endif - } } inline void RunProgramGradAPI( @@ -538,8 +457,6 @@ inline void RunProgramGradAPI( // if all output vars are set to stop_gradient, grad op no need to executed if (x_grad.empty() && params_grad.empty()) return; - bool use_interpretorcore = - PADDLE_GET_CONST(bool, attrs.at("use_interpretorcore")); auto program_id = PADDLE_GET_CONST(int64_t, attrs.at("program_id")); auto *out_scope_vec = &step_scope; @@ -550,196 +467,111 @@ inline void RunProgramGradAPI( "The OutScope of RunProgramGradOp should only hold one scope.")); auto place = egr::Controller::Instance().GetExpectedPlace(); - - if (use_interpretorcore) { - VLOG(2) << "RunProgramGradOp use interpretercore to execute program."; - - paddle::framework::Scope *global_inner_scope = out_scope_vec->front(); - - auto *forward_global_block = PADDLE_GET_CONST( - paddle::framework::BlockDesc *, attrs.at("forward_global_block")); - auto *backward_global_block = PADDLE_GET_CONST( - paddle::framework::BlockDesc *, attrs.at("backward_global_block")); - auto *backward_program = backward_global_block->Program(); - - auto out_grad_names = details::GetTensorsName(out_grad); - auto &interpretercore_info_cache = - paddle::framework::InterpreterCoreInfoCache::Instance(); - std::shared_ptr interpreter_core = - nullptr; - if (!interpretercore_info_cache.Has(program_id, /*is_grad=*/true)) { - paddle::platform::RecordEvent record_event( - "create_new_interpretercore", - paddle::platform::TracerEventType::UserDefined, - 1); - VLOG(2) << "No interpretercore cahce, so create a new interpretercore"; - details::ShareTensorsIntoScope(out_grad, global_inner_scope); - interpreter_core = paddle::framework::CreateInterpreterCoreInfoToCache( - *backward_program, - place, - /*is_grad=*/true, - program_id, - global_inner_scope); - - // share threadpool - // NOTE(zhiqiu): this only works interpreter_core is executed strictly - // after the related fwd_interpreter_core. 
- if (interpretercore_info_cache.Has(program_id, false)) { - auto fwd_interpreter_core = - interpretercore_info_cache.GetMutable(program_id, /*is_grad=*/false) - .core_; - interpreter_core->ShareWorkQueueFrom(fwd_interpreter_core); - VLOG(4) << "Share workqueue from " << fwd_interpreter_core.get() - << " to " << interpreter_core.get(); - } - - std::vector x_grad_names; - std::vector param_grad_names; - if (!x_grad.empty()) { - x_grad_names = details::GetTensorsName(x_grad); - } - if (!params_grad.empty()) { - param_grad_names = details::GetTensorsName(params_grad); - } - // get all eager gc vars - std::set skip_eager_delete_vars; - // all out_vars are skip_eager_var - skip_eager_delete_vars.insert(x_grad_names.begin(), x_grad_names.end()); - // initialize skip gc vars by forward_program and backward_program - paddle::framework::details::AppendSkipDeletionVars( - param_grad_names, &skip_eager_delete_vars); - interpreter_core->SetSkipGcVars(skip_eager_delete_vars); - interpretercore_info_cache.UpdateSkipEagerDeleteVars( - program_id, /*is_grad=*/true, skip_eager_delete_vars); - VLOG(2) << "Get skip GC vars size is: " << skip_eager_delete_vars.size(); - } else { - paddle::platform::RecordEvent record_event( - "get_interpretercore_cahce", - paddle::platform::TracerEventType::UserDefined, - 1); - VLOG(2) << "Get interpretercore cahce by program:" << program_id; - auto &cached_value = - interpretercore_info_cache.GetMutable(program_id, /*is_grad=*/true); - interpreter_core = cached_value.core_; - - // update scope - details::ShareTensorsIntoScope(out_grad, global_inner_scope); - if (interpreter_core->GetVariableScope()->GetMutableScope() != - global_inner_scope) { - details::BuildScopeByBlock(*interpreter_core.get(), - *backward_global_block, - global_inner_scope); - interpreter_core->reset_scope(global_inner_scope); - } + VLOG(2) << "RunProgramGradOp use interpretercore to execute program."; + + paddle::framework::Scope *global_inner_scope = out_scope_vec->front(); + + auto *forward_global_block = PADDLE_GET_CONST( + paddle::framework::BlockDesc *, attrs.at("forward_global_block")); + auto *backward_global_block = PADDLE_GET_CONST( + paddle::framework::BlockDesc *, attrs.at("backward_global_block")); + auto *backward_program = backward_global_block->Program(); + + auto out_grad_names = details::GetTensorsName(out_grad); + auto &interpretercore_info_cache = + paddle::framework::InterpreterCoreInfoCache::Instance(); + std::shared_ptr interpreter_core = + nullptr; + if (!interpretercore_info_cache.Has(program_id, /*is_grad=*/true)) { + paddle::platform::RecordEvent record_event( + "create_new_interpretercore", + paddle::platform::TracerEventType::UserDefined, + 1); + VLOG(2) << "No interpretercore cahce, so create a new interpretercore"; + details::ShareTensorsIntoScope(out_grad, global_inner_scope); + interpreter_core = + paddle::framework::CreateInterpreterCoreInfoToCache(*backward_program, + place, + /*is_grad=*/true, + program_id, + global_inner_scope); + + // share threadpool + // NOTE(zhiqiu): this only works interpreter_core is executed strictly + // after the related fwd_interpreter_core. 
+ if (interpretercore_info_cache.Has(program_id, false)) { + auto fwd_interpreter_core = + interpretercore_info_cache.GetMutable(program_id, /*is_grad=*/false) + .core_; + interpreter_core->ShareWorkQueueFrom(fwd_interpreter_core); + VLOG(4) << "Share workqueue from " << fwd_interpreter_core.get() << " to " + << interpreter_core.get(); } - if (backward_global_block->OpSize() > 0) { - paddle::platform::RecordEvent record_event( - "interpreter_core_run", - paddle::platform::TracerEventType::UserDefined, - 1); - // Debug info: scope info when run end - VLOG(3) << paddle::framework::GenScopeTreeDebugInfo( - out_scope_vec->front()); - interpreter_core->Run({}); + std::vector x_grad_names; + std::vector param_grad_names; + if (!x_grad.empty()) { + x_grad_names = details::GetTensorsName(x_grad); } - - { - paddle::platform::RecordEvent record_event( - "fetch_and_gc", paddle::platform::TracerEventType::UserDefined, 1); - // Step 4. get outputs - details::ShareTensorsFromScopeWithPartialBlock(x_grad, - *forward_global_block, - *backward_global_block, - global_inner_scope); - details::ShareTensorsFromScopeWithPartialBlock(params_grad, - *forward_global_block, - *backward_global_block, - global_inner_scope); - VLOG(4) << "after backward gc all vars"; - global_inner_scope->SetCanReuesd(true); - details::GcScope(global_inner_scope); + if (!params_grad.empty()) { + param_grad_names = details::GetTensorsName(params_grad); } + // get all eager gc vars + std::set skip_eager_delete_vars; + // all out_vars are skip_eager_var + skip_eager_delete_vars.insert(x_grad_names.begin(), x_grad_names.end()); + // initialize skip gc vars by forward_program and backward_program + paddle::framework::details::AppendSkipDeletionVars(param_grad_names, + &skip_eager_delete_vars); + interpreter_core->SetSkipGcVars(skip_eager_delete_vars); + interpretercore_info_cache.UpdateSkipEagerDeleteVars( + program_id, /*is_grad=*/true, skip_eager_delete_vars); + VLOG(2) << "Get skip GC vars size is: " << skip_eager_delete_vars.size(); } else { - VLOG(2) << "RunProgramGradOp use pe to execute program."; - - paddle::framework::Scope *global_inner_scope = out_scope_vec->front(); - auto sub_scope_num = global_inner_scope->kids().size(); - VLOG(2) << "The number of sub scopes before backward: " << sub_scope_num; - PADDLE_ENFORCE_GT(sub_scope_num, - 0, - paddle::platform::errors::InvalidArgument( - "The OutScope of RunProgramGradOp should hold at " - "least one sub scope.")); - - auto &scope = *(global_inner_scope->kids().front()); - - auto *global_block = PADDLE_GET_CONST(paddle::framework::BlockDesc *, - attrs.at("global_block")); - auto orig_end_op_index = - PADDLE_GET_CONST(int64_t, attrs.at("end_op_index")); - - // NOTE: skip `shape` and `fill_constant` op created by - // fluid.backward.gradients, one forward output will generate one `shape` - // and `fill_constant` - int64_t start_op_index = orig_end_op_index + (out_grad.size() * 2); - int64_t end_op_index = global_block->OpSize(); - - if (end_op_index > start_op_index) { - auto out_grad_names = details::GetTensorsName(out_grad); - // Step 2. 
prepare executor and scope - auto *program = global_block->Program(); - auto cache_info = - paddle::framework::GetExecutorInfoFromCache(*program, - place, - start_op_index, - end_op_index, - /*is_grad*/ true, - program_id, - &scope); - auto ¶llel_executor = cache_info.first; - - auto &skip_eager_delete_vars = - paddle::framework::ExecutorInfoCache::Instance().SkipEagerDeleteVars( - program_id, true); - if (cache_info.second /*is_new_created*/) { - parallel_executor->SkipMemoryReuse(/*scope_idx=*/0, out_grad_names); - // NOTE: after PR22939 [Add double grad] merged, the grad op maker's - // SetOutput will set to None if the input var stop_gradient=True, - // it will cause an NotFound error when ctx.OutputNames() is called - std::vector x_grad_names; - std::vector param_grad_names; - if (!x_grad.empty()) { - x_grad_names = details::GetTensorsName(x_grad); - } - if (!params_grad.empty()) { - param_grad_names = details::GetTensorsName(params_grad); - } - skip_eager_delete_vars.insert(skip_eager_delete_vars.end(), - x_grad_names.begin(), - x_grad_names.end()); - paddle::framework::details::AppendSkipDeletionVars( - param_grad_names, &skip_eager_delete_vars); - } - - details::ShareTensorsIntoScope(out_grad, &scope); - // Debug info: scope info when run end - VLOG(3) << paddle::framework::GenScopeTreeDebugInfo( - out_scope_vec->front()); - - // Step 3. run ops - parallel_executor->RunWithoutFetch( - /*skip_eager_delete_vars=*/skip_eager_delete_vars); + paddle::platform::RecordEvent record_event( + "get_interpretercore_cahce", + paddle::platform::TracerEventType::UserDefined, + 1); + VLOG(2) << "Get interpretercore cahce by program:" << program_id; + auto &cached_value = + interpretercore_info_cache.GetMutable(program_id, /*is_grad=*/true); + interpreter_core = cached_value.core_; + + // update scope + details::ShareTensorsIntoScope(out_grad, global_inner_scope); + if (interpreter_core->GetVariableScope()->GetMutableScope() != + global_inner_scope) { + details::BuildScopeByBlock( + *interpreter_core.get(), *backward_global_block, global_inner_scope); + interpreter_core->reset_scope(global_inner_scope); } + } - // Step 4. get outputs - details::ShareTensorsFromScope(x_grad, *global_block, &scope); - details::ShareTensorsFromScope(params_grad, *global_block, &scope); + if (backward_global_block->OpSize() > 0) { + paddle::platform::RecordEvent record_event( + "interpreter_core_run", + paddle::platform::TracerEventType::UserDefined, + 1); + // Debug info: scope info when run end + VLOG(3) << paddle::framework::GenScopeTreeDebugInfo(out_scope_vec->front()); + interpreter_core->Run({}); + } - // Step5. drop current scope - global_inner_scope->DeleteScope(&scope); - VLOG(2) << "The number of sub scopes after backward: " - << global_inner_scope->kids().size(); + { + paddle::platform::RecordEvent record_event( + "fetch_and_gc", paddle::platform::TracerEventType::UserDefined, 1); + // Step 4. 
get outputs + details::ShareTensorsFromScopeWithPartialBlock(x_grad, + *forward_global_block, + *backward_global_block, + global_inner_scope); + details::ShareTensorsFromScopeWithPartialBlock(params_grad, + *forward_global_block, + *backward_global_block, + global_inner_scope); + VLOG(4) << "after backward gc all vars"; + global_inner_scope->SetCanReuesd(true); + details::GcScope(global_inner_scope); } } diff --git a/python/paddle/jit/dy2static/partial_program.py b/python/paddle/jit/dy2static/partial_program.py index 626bdab2f5a..b8a4808f412 100644 --- a/python/paddle/jit/dy2static/partial_program.py +++ b/python/paddle/jit/dy2static/partial_program.py @@ -23,10 +23,6 @@ from paddle.fluid import backward, core, framework, program_guard from paddle.fluid.compiler import BuildStrategy from paddle.fluid.dygraph import layers from paddle.fluid.dygraph.base import switch_to_static_graph -from paddle.fluid.executor import ( - _is_dy2st_enable_standalone_executor, - _is_enable_standalone_executor, -) from paddle.fluid.framework import _apply_pass from paddle.fluid.layers.utils import _hash_with_id, flatten, pack_sequence_as @@ -128,14 +124,26 @@ class ProgramInfo: A helper class to recoder Program information """ - def __init__(self, mode='infer'): + def __init__(self): self.op_size = { 'fp32': -1, 'amp': -1, 'fp16': -1, } - assert mode in ['train', 'infer'] - self.mode = mode + self.programs = {} + self.mode = "infer" + + def __call__(self, key, prog_creator): + """ + Recoder infer program and op size. + """ + assert key in ['fp32', 'amp', 'fp16'] + if key not in self.programs: + infer_prog = prog_creator(is_infer_mode=True) + self.programs[key] = infer_prog + self.op_size[key] = infer_prog.desc.block(0).op_size() + + return self.programs[key], self.op_size[key] class PartialProgramLayer: @@ -176,7 +184,7 @@ class PartialProgramLayer: self._cuda_graph_pool_id = 0 # Set default mode to train self.training = True - self._infer_info = ProgramInfo(mode='infer') + self._infer_info = ProgramInfo() custom_white_list, custom_black_list = None, None tracer = framework._dygraph_tracer() @@ -191,6 +199,28 @@ class PartialProgramLayer: # program_id -> list(scope) self._scope_cache = {} + def __call__(self, inputs): + """ + Execute static graph by Interpreter and Return dynamic Tensors. 
+ """ + in_vars, out_vars = self._prepare(inputs) + self._cast_fp16_if_pure_fp16(in_vars) + attrs = self._prepare_attributes() + + _legacy_C_ops.run_program( + self._valid_vars(in_vars), + self._valid_vars(self._params), + self._valid_vars(out_vars), + self._create_scope_vec( + program_id=self.program_id, use_scope_cache=True + ), + self._double_grads, + self._cuda_graph_vec, + *attrs + ) + restored_nest_out = self._restore_out(out_vars) + return self._remove_no_value(restored_nest_out) + def _get_scope(self, program_id=None, use_scope_cache=False): if use_scope_cache: if program_id not in self._scope_cache: @@ -259,8 +289,9 @@ class PartialProgramLayer: @switch_to_static_graph def _create_forward_backward_train_program(self): whole_program = self._train_program - forward_end_op_index = self._infer_info.op_size['fp32'] + _, forward_end_op_index = self._infer_info('fp32', self._create_program) assert forward_end_op_index >= 0 + return self._get_forward_backward_program_form( whole_program, forward_end_op_index ) @@ -268,8 +299,11 @@ class PartialProgramLayer: @switch_to_static_graph def _create_forward_backward_train_amp_program(self): whole_program = self._train_amp_program - forward_end_op_index = self._infer_info.op_size['amp'] + _, forward_end_op_index = self._infer_info( + 'amp', self._create_amp_program + ) assert forward_end_op_index >= 0 + return self._get_forward_backward_program_form( whole_program, forward_end_op_index ) @@ -277,8 +311,11 @@ class PartialProgramLayer: @switch_to_static_graph def _create_forward_backward_train_pure_fp16_program(self): whole_program = self._train_pure_fp16_program - forward_end_op_index = self._infer_info.op_size['fp16'] + _, forward_end_op_index = self._infer_info( + 'fp16', self._create_pure_fp16_program + ) assert forward_end_op_index >= 0 + return self._get_forward_backward_program_form( whole_program, forward_end_op_index ) @@ -289,11 +326,8 @@ class PartialProgramLayer: @LazyInitialized def _infer_program(self): - program = self._create_program(is_infer_mode=True) - self._infer_info.op_size['fp32'] = program.desc.block(0).op_size() - return self._build_infer_program( - program, self._infer_info.op_size['fp32'] - ) + program, op_size = self._infer_info('fp32', self._create_program) + return self._build_infer_program(program, op_size) @LazyInitialized def _train_amp_program(self): @@ -301,11 +335,8 @@ class PartialProgramLayer: @LazyInitialized def _infer_amp_program(self): - program = self._create_amp_program(is_infer_mode=True) - self._infer_info.op_size['amp'] = program.desc.block(0).op_size() - return self._build_infer_program( - program, self._infer_info.op_size['amp'] - ) + program, op_size = self._infer_info('amp', self._create_amp_program) + return self._build_infer_program(program, op_size) @LazyInitialized def _train_pure_fp16_program(self): @@ -313,11 +344,10 @@ class PartialProgramLayer: @LazyInitialized def _infer_pure_fp16_program(self): - program = self._create_pure_fp16_program(is_infer_mode=True) - self._infer_info.op_size['fp16'] = program.desc.block(0).op_size() - return self._build_infer_program( - program, self._infer_info.op_size['fp16'] + program, op_size = self._infer_info( + 'fp16', self._create_pure_fp16_program ) + return self._build_infer_program(program, op_size) @LazyInitialized def _train_forward_backward_program(self): @@ -632,27 +662,24 @@ class PartialProgramLayer: double_grads.append(var_base) return self._valid_vars(double_grads) - def _get_end_op_index(self): - if _in_amp_guard(): - infer_program = 
self._infer_amp_program - elif _in_pure_fp16_guard(): - infer_program = self._infer_pure_fp16_program - else: - infer_program = self._infer_program - return infer_program.desc.block(0).op_size() - - def __call__(self, inputs): - in_vars, out_vars = self._prepare(inputs) - - self._cast_fp16_if_pure_fp16(in_vars) + def _cast_fp16_if_pure_fp16(self, in_vars): + if _in_pure_fp16_guard(): + for i, var in enumerate(in_vars): + name = var.name + if ( + self.program.global_block().has_var(name) + and self.program.global_block().var(name).dtype + == paddle.float16 + ): + in_vars[i] = var.astype('float16') + in_vars[i].name = name + def _prepare_attributes(self): attrs = [ - 'global_block', - self.program.desc.block(0), - 'start_op_index', - 0, - 'end_op_index', - self._get_end_op_index(), + 'forward_global_block', + self.forward_program.desc.block(0), + 'backward_global_block', + self.backward_program.desc.block(0), 'is_test', not self.training, 'program_id', @@ -679,57 +706,7 @@ class PartialProgramLayer: self._cuda_graph_pool_id, ) ) - - use_interpretorcore = ( - _is_enable_standalone_executor() - and _is_dy2st_enable_standalone_executor() - ) - attrs.extend(('use_interpretorcore', use_interpretorcore)) - if use_interpretorcore: - attrs.extend( - ( - 'forward_global_block', - self.forward_program.desc.block(0), - 'backward_global_block', - self.backward_program.desc.block(0), - ) - ) - - _legacy_C_ops.run_program( - self._valid_vars(in_vars), - self._valid_vars(self._params), - self._valid_vars(out_vars), - self._create_scope_vec( - program_id=self.program_id, use_scope_cache=True - ), - self._double_grads, - self._cuda_graph_vec, - *attrs - ) - else: - _legacy_C_ops.run_program( - self._valid_vars(in_vars), - self._valid_vars(self._params), - self._valid_vars(out_vars), - self._create_scope_vec(), - self._double_grads, - self._cuda_graph_vec, - *attrs - ) - restored_nest_out = self._restore_out(out_vars) - return self._remove_no_value(restored_nest_out) - - def _cast_fp16_if_pure_fp16(self, in_vars): - if _in_pure_fp16_guard(): - for i, var in enumerate(in_vars): - name = var.name - if ( - self.program.global_block().has_var(name) - and self.program.global_block().var(name).dtype - == paddle.float16 - ): - in_vars[i] = var.astype('float16') - in_vars[i].name = name + return attrs @switch_to_static_graph def _build_infer_program(self, infer_program, forward_end_op_index): -- GitLab
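
Note on the Python-side change: the patch folds the per-precision infer-program bookkeeping into `ProgramInfo.__call__`, which builds each infer program at most once and memoizes both the program and its block-0 op count, so `_infer_program` / `_create_forward_backward_train_program` and their amp/fp16 variants can all share one lookup. The standalone sketch below mirrors only that caching behaviour so it runs without Paddle; `_FakeBlock`, `_FakeDesc`, `_FakeProgram`, and `creator` are hypothetical stand-ins for the real `Program` objects and `self._create_program`-style factories, not part of the patch.

    # Sketch of the memoization pattern added to ProgramInfo.__call__ in
    # python/paddle/jit/dy2static/partial_program.py. The _Fake* classes are
    # stand-ins so the example runs without Paddle.

    class _FakeBlock:
        def __init__(self, n_ops):
            self._n_ops = n_ops

        def op_size(self):
            return self._n_ops


    class _FakeDesc:
        def __init__(self, n_ops):
            self._block = _FakeBlock(n_ops)

        def block(self, idx):
            return self._block


    class _FakeProgram:
        def __init__(self, n_ops):
            self.desc = _FakeDesc(n_ops)


    class ProgramInfo:
        """Caches one infer program and its forward op count per precision key."""

        def __init__(self):
            self.op_size = {'fp32': -1, 'amp': -1, 'fp16': -1}
            self.programs = {}
            self.mode = "infer"

        def __call__(self, key, prog_creator):
            # Build the infer program at most once per key, then reuse it.
            assert key in ['fp32', 'amp', 'fp16']
            if key not in self.programs:
                infer_prog = prog_creator(is_infer_mode=True)
                self.programs[key] = infer_prog
                self.op_size[key] = infer_prog.desc.block(0).op_size()
            return self.programs[key], self.op_size[key]


    if __name__ == "__main__":
        calls = []

        def creator(is_infer_mode=False):
            calls.append(is_infer_mode)
            return _FakeProgram(n_ops=7)

        info = ProgramInfo()
        prog_a, size_a = info('fp32', creator)
        prog_b, size_b = info('fp32', creator)  # cache hit: creator not called again
        assert prog_a is prog_b and size_a == size_b == 7 and len(calls) == 1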
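
Note on the C++ side: with the ParallelExecutor branch gone, `RunProgramAPI` and `RunProgramGradAPI` rely on `InterpreterCoreInfoCache` keyed by `(program_id, is_grad)`; the skip-GC variable set is fixed when a core is first created, and the backward core shares the forward core's work queue when that core already exists. The sketch below (Python, for brevity) illustrates only that keyed-cache shape; `_DummyCore`, `set_skip_gc_vars`, and `share_work_queue_from` are illustrative stand-ins, not the real `paddle::framework` API.

    # Illustrative keyed cache mirroring how the patch reuses interpreter cores:
    # one core per (program_id, is_grad), skip-GC vars set at creation time, and
    # the backward core reusing the forward core's work queue.

    class _DummyCore:
        def __init__(self):
            self.skip_gc_vars = set()
            self.work_queue = object()  # pretend thread pool

        def set_skip_gc_vars(self, names):
            self.skip_gc_vars = set(names)

        def share_work_queue_from(self, other):
            self.work_queue = other.work_queue


    class _CoreCache:
        def __init__(self):
            self._cores = {}  # (program_id, is_grad) -> _DummyCore

        def has(self, program_id, is_grad):
            return (program_id, is_grad) in self._cores

        def get_or_create(self, program_id, is_grad, skip_gc_vars):
            key = (program_id, is_grad)
            if key not in self._cores:
                core = _DummyCore()
                core.set_skip_gc_vars(skip_gc_vars)
                # Backward core reuses the forward core's work queue when the
                # forward core was created first (cf. the NOTE(zhiqiu) comment).
                fwd = self._cores.get((program_id, False))
                if is_grad and fwd is not None:
                    core.share_work_queue_from(fwd)
                self._cores[key] = core
            return self._cores[key]


    if __name__ == "__main__":
        cache = _CoreCache()
        fwd = cache.get_or_create(1, False, {"out_0", "dout_0"})
        bwd = cache.get_or_create(1, True, {"x_0@GRAD"})
        assert bwd.work_queue is fwd.work_queue              # shared thread pool
        assert cache.get_or_create(1, False, set()) is fwd   # cache hit keeps old core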