From c4de127757192e4c6f6fc2819bd55f5bdbed3337 Mon Sep 17 00:00:00 2001
From: Ruibiao Chen
Date: Fri, 23 Sep 2022 20:15:33 +0800
Subject: [PATCH] Add ExecutionConfig and fix last-live-op bug for standalone
 executor (#46405)

* Add ExecutionConfig and fix last-live-op bug for standalone executor

* Improve code design
---
 .../framework/new_executor/CMakeLists.txt      |   1 +
 .../new_executor/interpreter/CMakeLists.txt    |   5 +
 .../interpreter/dependency_builder.cc          |  16 +-
 .../interpreter/dependency_builder.h           |   2 +-
 .../execution_config.cc}                       |  25 ++-
 .../interpreter/execution_config.h             |  42 ++++
 .../framework/new_executor/interpretercore.cc  | 197 ++++++++++--------
 .../framework/new_executor/interpretercore.h   |  11 +-
 .../new_executor/interpretercore_util.cc       |  29 ++-
 .../new_executor/interpretercore_util.h        |   3 +-
 10 files changed, 215 insertions(+), 116 deletions(-)
 rename paddle/fluid/framework/new_executor/{threadpool_config.h => interpreter/execution_config.cc} (85%)
 create mode 100644 paddle/fluid/framework/new_executor/interpreter/execution_config.h

diff --git a/paddle/fluid/framework/new_executor/CMakeLists.txt b/paddle/fluid/framework/new_executor/CMakeLists.txt
index 98ec60f8d7..2e6f273490 100644
--- a/paddle/fluid/framework/new_executor/CMakeLists.txt
+++ b/paddle/fluid/framework/new_executor/CMakeLists.txt
@@ -14,6 +14,7 @@ set(STANDALONE_EXECUTOR_SRCS
 set(STANDALONE_EXECUTOR_DEPS
   dependency_builder
   device_context
+  execution_config
   op_registry
   scope
   framework_proto
diff --git a/paddle/fluid/framework/new_executor/interpreter/CMakeLists.txt b/paddle/fluid/framework/new_executor/interpreter/CMakeLists.txt
index 86d34bf983..dc4b2e6407 100644
--- a/paddle/fluid/framework/new_executor/interpreter/CMakeLists.txt
+++ b/paddle/fluid/framework/new_executor/interpreter/CMakeLists.txt
@@ -2,3 +2,8 @@ cc_library(
   dependency_builder
   SRCS dependency_builder.cc
   DEPS operator)
+
+cc_library(
+  execution_config
+  SRCS execution_config.cc
+  DEPS phi_backends)
diff --git a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc
index f12fc719d4..0c094917f6 100644
--- a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc
+++ b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc
@@ -16,16 +16,6 @@
 
 #include <queue>
 
-// The difference between "sequential_run" and "serial_run":
-// "sequential_run" dispatches OPs one by one according to the sequence in the
-// Program, while "serial_run" ensures that all Ops are scheduled in a singal
-// thread. In standalone executor, "sequential_run" is also "serial_run", while
-// "serial_run" is not necessarily "sequential_run".
-PADDLE_DEFINE_EXPORTED_bool(new_executor_sequential_run,
-                            false,
-                            "Enable sequential execution for standalone "
-                            "executor, only applied to GPU OPs.");
-
 namespace paddle {
 namespace framework {
 namespace interpreter {
@@ -67,7 +57,7 @@ const std::string StringizeDownstreamMap(
 }
 
 const std::map<int, std::set<int>>& DependencyBuilder::Build(
-    const std::vector<Instruction>& instructions) {
+    const std::vector<Instruction>& instructions, bool is_sequential_run) {
   PADDLE_ENFORCE_EQ(
       is_build_,
       false,
@@ -85,7 +75,7 @@ const std::map<int, std::set<int>>& DependencyBuilder::Build(
   AddDependencyForRandomOp();
   AddDependencyForReadOp();
 
-  if (FLAGS_new_executor_sequential_run) {
+  if (is_sequential_run) {
     AddDependencyForSequentialRun();
   }
 
@@ -505,7 +495,7 @@ void DependencyBuilder::BuildOpHappensBefore() {
                           next,
                           op_idx));
         op_happens_before_[op_idx][next] = true;
-        VLOG(8) << "happens before: " << op_idx << " " << next;
+        VLOG(10) << "happens before: " << op_idx << " " << next;
         q.push(next);
       }
     }
diff --git a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.h b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.h
index 9d579014a1..ef2cfdc296 100644
--- a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.h
+++ b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.h
@@ -36,7 +36,7 @@ class DependencyBuilder {
   // build op dependencies and return the mapping from op to its downstream-op
   // set
   const std::map<int, std::set<int>>& Build(
-      const std::vector<Instruction>& instructions);
+      const std::vector<Instruction>& instructions, bool is_sequential_run);
 
   bool OpHappensBefore(int prior_op_idx, int posterior_op_idx);
 
diff --git a/paddle/fluid/framework/new_executor/threadpool_config.h b/paddle/fluid/framework/new_executor/interpreter/execution_config.cc
similarity index 85%
rename from paddle/fluid/framework/new_executor/threadpool_config.h
rename to paddle/fluid/framework/new_executor/interpreter/execution_config.cc
index 0270aa7d1e..0a7257ecb8 100644
--- a/paddle/fluid/framework/new_executor/threadpool_config.h
+++ b/paddle/fluid/framework/new_executor/interpreter/execution_config.cc
@@ -12,12 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#pragma once
+#include "paddle/fluid/framework/new_executor/interpreter/execution_config.h"
 
+#include <set>
 #include <thread>
+
 #include "paddle/fluid/platform/device/ipu/ipu_info.h"
 #include "paddle/fluid/platform/device/npu/npu_info.h"
-#include "paddle/fluid/platform/place.h"
 #include "paddle/phi/backends/device_manager.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
 #include "paddle/phi/backends/xpu/xpu_info.h"
@@ -48,7 +49,7 @@ static constexpr size_t kMinOpNumForAsyncPrepare = 1000;
 
 // Note that the purpose of the config is to limit the total 'possible'
 // threads introduced by interpretercore to avoid hurting performance.
-inline std::tuple<int, int, int> GetThreadPoolConfig(const phi::Place place,
+inline std::tuple<int, int, int> GetThreadPoolConfig(const phi::Place& place,
                                                      size_t op_num) {
   int num_device_threads = kDeviceNumThreads,
       num_host_threads = kHostNumThreads,
@@ -131,6 +132,24 @@ inline std::tuple<int, int, int> GetThreadPoolConfig(const phi::Place place,
       num_host_threads, num_device_threads, num_prepare_threads);
 }
 
+ExecutionConfig::ExecutionConfig(const phi::Place& place, size_t op_num) {
+  std::tie(host_num_threads, deivce_num_threads, prepare_num_threads) =
+      GetThreadPoolConfig(place, op_num);
+}
+
+void ExecutionConfig::Log(int log_level) {
+  VLOG(log_level) << "ExecutionConfig:";
+  VLOG(log_level) << "used_for_jit = " << used_for_jit;
+  VLOG(log_level) << "create_local_scope = " << create_local_scope;
+  VLOG(log_level) << "host_num_threads = " << host_num_threads;
+  VLOG(log_level) << "deivce_num_threads = " << deivce_num_threads;
+  VLOG(log_level) << "prepare_num_threads = " << prepare_num_threads;
+  VLOG(log_level) << "skip_gc_vars = ";
+  for (const std::string& var : skip_gc_vars) {
+    VLOG(log_level) << var;
+  }
+}
+
 }  // namespace interpreter
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/new_executor/interpreter/execution_config.h b/paddle/fluid/framework/new_executor/interpreter/execution_config.h
new file mode 100644
index 0000000000..e637cce7b3
--- /dev/null
+++ b/paddle/fluid/framework/new_executor/interpreter/execution_config.h
@@ -0,0 +1,42 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <set>
+#include <string>
+
+#include "paddle/fluid/platform/place.h"
+
+namespace paddle {
+namespace framework {
+namespace interpreter {
+
+struct ExecutionConfig {
+  bool used_for_jit{false};
+  bool create_local_scope{true};
+
+  size_t host_num_threads;
+  size_t deivce_num_threads;
+  size_t prepare_num_threads;
+
+  std::set<std::string> skip_gc_vars;
+
+  ExecutionConfig(const phi::Place& place, size_t op_num);
+  void Log(int log_level);
+};
+
+}  // namespace interpreter
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc
index 854f2da0d2..dd725cff66 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore.cc
@@ -19,7 +19,6 @@
 #include "paddle/fluid/framework/details/nan_inf_utils.h"
 #include "paddle/fluid/framework/details/share_tensor_buffer_functor.h"
 #include "paddle/fluid/framework/new_executor/interpretercore_util.h"
-#include "paddle/fluid/framework/new_executor/threadpool_config.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/os_info.h"
@@ -32,6 +31,15 @@
 #endif
 #include "paddle/phi/backends/device_manager.h"
 
+// The difference between "sequential_run" and "serial_run":
+// "sequential_run" dispatches OPs one by one according to the sequence in the
+// Program, while "serial_run" ensures that all OPs are scheduled in a single
+// thread. In standalone executor, "sequential_run" is also "serial_run", while
+// "serial_run" is not necessarily "sequential_run".
+PADDLE_DEFINE_EXPORTED_bool(new_executor_sequential_run,
+                            false,
+                            "Enable sequential execution for standalone "
+                            "executor, only applied to GPU OPs.");
 PADDLE_DEFINE_EXPORTED_bool(new_executor_use_inplace,
                             false,
                             "Use inplace in new executor");
@@ -49,6 +57,8 @@ constexpr const char* kTaskCompletion = "TaskCompletion";
 namespace paddle {
 namespace framework {
 
+// TODO(Ruibiao): Pass skip_gc_vars, used_for_jit, and other config messages by
+// constructing an interpreter::ExecutionConfig
 InterpreterCore::InterpreterCore(const platform::Place& place,
                                  const BlockDesc& block,
                                  const std::set<std::string>& skip_gc_vars,
                                  Scope* scope,
@@ -56,10 +66,9 @@ InterpreterCore::InterpreterCore(const platform::Place& place,
                                  bool used_for_jit)
     : place_(place),
       block_(block),
-      skip_gc_vars_(skip_gc_vars),
+      execution_config_(place, block.OpSize()),
       var_scope_(scope),
-      stream_analyzer_(place),
-      used_for_jit_(used_for_jit) {
+      stream_analyzer_(place) {
   VLOG(4) << "InterpreterCore(): " << this << " on " << place_;
 
   is_build_ = false;
@@ -67,14 +76,13 @@ InterpreterCore::InterpreterCore(const platform::Place& place,
   exception_notifier_ = main_thread_blocker_.RegisterEvent(kExceptionCaught);
   completion_notifier_ = main_thread_blocker_.RegisterEvent(kTaskCompletion);
 
-  create_local_scope_ = FLAGS_new_executor_use_local_scope;
-
-  if (used_for_jit_) {
-    create_local_scope_ = false;
-  }
-  VLOG(4) << "create_local_scope_ is " << create_local_scope_;
+  execution_config_.used_for_jit = used_for_jit;
+  execution_config_.create_local_scope =
+      !used_for_jit && FLAGS_new_executor_use_local_scope;
+  execution_config_.skip_gc_vars = skip_gc_vars;
+  execution_config_.Log(/*log_level=*/8);
 
-  if (create_local_scope_) {
+  if (execution_config_.create_local_scope) {
     auto local_scope = &var_scope_.GetMutableScope()->NewScope();
     local_scope_ = local_scope;
   }
@@ -127,7 +135,7 @@ interpreter::CostInfo InterpreterCore::DryRun(
     platform::DeviceContextPool::Instance().Get(place_)->Wait();
   }
 
-  if (create_local_scope_) {
+  if (execution_config_.create_local_scope) {
     ClearLoDTensorArrayInLocalScope();
   }
 
@@ -173,7 +181,7 @@ paddle::framework::FetchList InterpreterCore::Run(
     }
 #endif
   }
-  if (create_local_scope_) {
+  if (execution_config_.create_local_scope) {
     ClearLoDTensorArrayInLocalScope();
   }
 
@@ -201,16 +209,17 @@ paddle::framework::FetchList InterpreterCore::Run(
   if (!is_build_) {
     LOG_FIRST_N(INFO, 1) << "New Executor is Running.";
     paddle::framework::interpreter::build_variable_scope(
-        block_, &var_scope_, create_local_scope_);
+        block_, &var_scope_, execution_config_.create_local_scope);
 
     std::vector<paddle::framework::OpFuncNode> op_func_nodes;
-    paddle::framework::interpreter::build_op_func_list(place_,
-                                                       block_,
-                                                       skip_gc_vars_,
-                                                       &op_func_nodes,
-                                                       &var_scope_,
-                                                       create_local_scope_,
-                                                       used_for_jit_);
+    paddle::framework::interpreter::build_op_func_list(
+        place_,
+        block_,
+        execution_config_.skip_gc_vars,
+        &op_func_nodes,
+        &var_scope_,
+        execution_config_.create_local_scope,
+        execution_config_.used_for_jit);
     is_build_ = true;
     SetFeedVarsInplaceSkip(feed_names);
     // convert vec func_list to graph
@@ -239,12 +248,13 @@ paddle::framework::FetchList InterpreterCore::Run(
 #endif
   }
 
-  if (create_local_scope_) {
+  if (execution_config_.create_local_scope) {
     ClearLoDTensorArrayInLocalScope();
   }
   // return Fetch Tensors
-  Scope* inner_scope =
-      create_local_scope_ ? local_scope_ : var_scope_.GetMutableScope();
+  Scope* inner_scope = execution_config_.create_local_scope
+                           ? local_scope_
+                           : var_scope_.GetMutableScope();
   auto* fetch_var = inner_scope->FindVar(interpreter::kFetchVarName);
   if (fetch_var) {
     return std::move(*fetch_var->GetMutable<framework::FetchList>());
@@ -259,12 +269,13 @@ void InterpreterCore::SetCopyProgram(std::shared_ptr<ProgramDesc> prog) {
 
 void InterpreterCore::SetSkipGcVars(const std::set<std::string>& skip_gc_vars) {
   PADDLE_ENFORCE_EQ(
-      skip_gc_vars_.empty(),
+      execution_config_.skip_gc_vars.empty(),
       true,
       platform::errors::PreconditionNotMet(
-          "Skip_gc_vars_ can only be initialized once, now skip_gc_vars_ is "
+          "execution_config_.skip_gc_vars can only be initialized once, now "
+          "execution_config_.skip_gc_vars is "
          "not empty, do not call SetSkipGcVars method repeatedly."));
-  skip_gc_vars_ = skip_gc_vars;
+  execution_config_.skip_gc_vars = skip_gc_vars;
 }
 
 const VariableScope* InterpreterCore::GetVariableScope() const {
@@ -288,12 +299,13 @@ void InterpreterCore::ShareWorkQueueFrom(std::shared_ptr<InterpreterCore> src) {
                  << ") to InterpreterCore(" << this << ")";
 }
 
-bool InterpreterCore::BuildInplaceCheckVarIsOnlyInput(size_t var_index) {
+bool InterpreterCore::BuildInplaceCheckVarIsOnlyInput(
+    const std::vector<std::vector<size_t>>& input_var2op, size_t var_index) {
   if (!var_scope_.VarDesc(var_index)) {
-    return input_var2op_info_.at(var_index).size() == 1;
+    return input_var2op.at(var_index).size() == 1;
   } else {
     int is_input_cnt = 0;
-    for (auto inst_id : input_var2op_info_.at(var_index)) {
+    for (auto inst_id : input_var2op.at(var_index)) {
       OpInOutInfo info;
       info.Build(vec_instruction_.at(inst_id).OpBase());
       if (info.IsInArgBufferNeeded(var_scope_.VarDesc(var_index)->Name())) {
@@ -306,21 +318,19 @@ bool InterpreterCore::BuildInplaceCheckVarIsOnlyInput(size_t var_index) {
 
 std::shared_ptr<interpreter::AsyncWorkQueue> InterpreterCore::GetWorkQueue() {
   if (async_work_queue_ == nullptr) {
-    int host_num_threads = 1, deivce_num_threads = 1, prepare_num_threads = 1;
-    std::tie(host_num_threads, deivce_num_threads, prepare_num_threads) =
-        interpreter::GetThreadPoolConfig(place_, vec_instruction_.size());
-    async_work_queue_ =
-        std::make_shared<interpreter::AsyncWorkQueue>(host_num_threads,
-                                                      deivce_num_threads,
-                                                      prepare_num_threads,
-                                                      &main_thread_blocker_);
+    async_work_queue_ = std::make_shared<interpreter::AsyncWorkQueue>(
+        execution_config_.host_num_threads,
+        execution_config_.deivce_num_threads,
+        execution_config_.prepare_num_threads,
+        &main_thread_blocker_);
   }
   return async_work_queue_;
 }
 
 void InterpreterCore::BuildAndCacheInstructionCtx(Instruction* instr_node) {
-  Scope* inner_scope =
-      create_local_scope_ ? local_scope_ : var_scope_.GetMutableScope();
+  Scope* inner_scope = execution_config_.create_local_scope
+                           ? local_scope_
+                           : var_scope_.GetMutableScope();
   VariableValueMap ins_map;
   for (auto& var_name_item : instr_node->Inputs()) {
     std::vector<Variable*> input_vars;
@@ -346,8 +356,9 @@ void InterpreterCore::BuildAndCacheInstructionCtx(Instruction* instr_node) {
   // set runtime_ctx and infershape_ctx_
   if (instr_node->OpBase()->Type() == "cinn_launch") {  // OP use scope in
                                                         // kernel
-    Scope* local_scope = create_local_scope_ ? var_scope_.GetMutableLocalScope()
-                                             : var_scope_.GetMutableScope();
+    Scope* local_scope = execution_config_.create_local_scope
+                             ? var_scope_.GetMutableLocalScope()
+                             : var_scope_.GetMutableScope();
     instr_node->ResetContextWithScope(ins_map, outs_map, *local_scope);
   } else {
     instr_node->ResetContext(ins_map, outs_map);
@@ -377,8 +388,19 @@ void InterpreterCore::BuildInplace() {
     }
   }
 
-  Scope* local_scope = create_local_scope_ ? var_scope_.GetMutableLocalScope()
-                                           : var_scope_.GetMutableScope();
+  Scope* local_scope = execution_config_.create_local_scope
+                           ? var_scope_.GetMutableLocalScope()
+                           : var_scope_.GetMutableScope();
+  std::vector<std::vector<size_t>> input_var2op(var_scope_.VarSize());
+  for (Instruction& instr : vec_instruction_) {
+    for (auto& item : instr.Inputs()) {
+      for (int var_id : item.second) {
+        if (var_id != kEmptyVarIndex) {
+          input_var2op.at(var_id).push_back(instr.Id());
+        }
+      }
+    }
+  }
 
   for (size_t i = 0; i < vec_instruction_.size(); ++i) {
     auto& instr = vec_instruction_[i];
@@ -402,7 +424,7 @@ void InterpreterCore::BuildInplace() {
       if (var_scope_.GetVarSikpInplace(iter->second[0])) {
         continue;
       }
-      if (BuildInplaceCheckVarIsOnlyInput(iter->second[0])) {
+      if (BuildInplaceCheckVarIsOnlyInput(input_var2op, iter->second[0])) {
        auto iterout = outputs.find(pair.second);
         if (iterout != outputs.end() && !iterout->second.empty()) {
           const std::string& invar_name =
@@ -432,7 +454,9 @@ void InterpreterCore::BuildOperatorDependences() {
   // Schedule
   auto op_nums = vec_instruction_.size();
   dependecy_count_.resize(op_nums);
-  auto op2downstream = dependency_builder_.Build(vec_instruction_);
+  auto op2downstream = dependency_builder_.Build(
+      vec_instruction_,
+      /*is_sequential_run=*/FLAGS_new_executor_sequential_run);
   for (size_t op = 0; op < vec_instruction_.size(); ++op) {
     auto op_list = op2downstream[op];
     std::vector<size_t> downsteam_vector(op_list.begin(), op_list.end());
@@ -460,10 +484,7 @@ void InterpreterCore::ClearLoDTensorArrayInLocalScope() {
 void InterpreterCore::Convert(
     std::vector<paddle::framework::OpFuncNode>* op_func_nodes) {
   auto& vec_meta_info = var_scope_.MutableVecMetaInfo();
-  auto var_nums = var_scope_.VarSize();
-  input_var2op_info_.resize(var_nums);
   auto nodes = *op_func_nodes;
-
   auto op_nums = nodes.size();
   vec_instruction_.reserve(op_nums);
   for (size_t op_idx = 0; op_idx < op_nums; ++op_idx) {
@@ -471,35 +492,42 @@ void InterpreterCore::Convert(
     auto* dev_ctx_ = stream_analyzer_.ParseDeviceContext(op_func_node);
     vec_instruction_.emplace_back(op_idx, std::move(op_func_node), *dev_ctx_);
   }
+
   BuildOperatorDependences();
+
   // calculate last_live_ops_
   for (size_t op_idx = 0; op_idx < op_nums; ++op_idx) {
-    auto& instr = vec_instruction_[op_idx];
+    Instruction& instr = vec_instruction_[op_idx];
     OpInOutInfo info;
-    std::set<size_t> gc_check_inputs;
-    for (auto& item : instr.Inputs()) {
+    info.Build(instr.OpBase());
+
+    std::set<size_t> gc_check_vars;
+
+    const std::map<std::string, std::vector<int>>& ins = instr.Inputs();
+    const std::map<std::string, std::vector<int>>& outs = instr.Outputs();
+    std::multimap<std::string, std::vector<int>> ins_and_outs{ins.begin(),
+                                                              ins.end()};
+    ins_and_outs.insert(outs.begin(), outs.end());
+
+    for (auto& item : ins_and_outs) {
       for (auto id : item.second) {
         if (id == kEmptyVarIndex) {
           continue;
         }
-        input_var2op_info_.at(id).push_back(op_idx);
-        // var can be gc-ed
-        if (!info.IsBuilt()) {
-          info.Build(instr.OpBase());
-        }
         auto* var_desc = var_scope_.VarDesc(id);
-        if (var_desc) {
-          if (info.IsInArgBufferNeeded(var_desc->Name())) {
-            gc_check_inputs.insert(id);
-          }
-        } else {
-          gc_check_inputs.insert(id);
+        // skip no_need_buffer input vars
+        if (var_desc && ins.count(item.first) &&
+            !info.IsInArgBufferNeeded(var_desc->Name())) {
+          continue;
         }
+        gc_check_vars.insert(id);
       }
     }
-    for (auto var_id : gc_check_inputs) {
-      Scope* inner_scope =
-          create_local_scope_ ? local_scope_ : var_scope_.GetMutableScope();
+
+    for (auto var_id : gc_check_vars) {
+      Scope* inner_scope = execution_config_.create_local_scope
+                               ? local_scope_
+                               : var_scope_.GetMutableScope();
       paddle::framework::Variable* var =
           inner_scope->FindVar(var_scope_.GetNameById(var_id));
       if (var->IsType<LoDTensor>() || var->IsType<phi::SelectedRows>() ||
@@ -512,18 +540,9 @@ void InterpreterCore::Convert(
       }
     }
   }
-  for (size_t i = 0; i < vec_instruction_.size(); ++i) {
-    // checkout output
-    for (auto& item : vec_instruction_[i].Outputs()) {
-      for (auto var_id : item.second) {
-        if (input_var2op_info_.at(var_id).size() == 0) {
-          last_live_ops_[var_id].insert(i);
-        }
-      }
-    }
-  }
+
   // clear the last_live_ops list for all vars in skip_gc_vars
-  for (const std::string& skip_gc_var : skip_gc_vars_) {
+  for (const std::string& skip_gc_var : execution_config_.skip_gc_vars) {
     int var_id = var_scope_.GetIdByName(skip_gc_var);
     if (var_id != -1) {
       last_live_ops_[var_id].clear();
@@ -662,8 +681,9 @@ inline void SetDeviceId(const platform::Place& place) {
 void InterpreterCore::RunInstruction(const Instruction& instr_node) {
   auto* op = instr_node.OpBase();
   auto place = instr_node.DeviceContext().GetPlace();
-  Scope* local_scope = create_local_scope_ ? var_scope_.GetMutableLocalScope()
-                                           : var_scope_.GetMutableScope();
+  Scope* local_scope = execution_config_.create_local_scope
+                           ? var_scope_.GetMutableLocalScope()
+                           : var_scope_.GetMutableScope();
   VLOG(4) << "Start run " << place << " " << op->DebugStringEx(local_scope_);
 
   SetDeviceId(place);
@@ -947,6 +967,8 @@ void InterpreterCore::RunInstructionAsync(
 
       CheckGC(instr_node, atomic_var_ref);
 
+      interpreter::LogDeviceMemoryStats(place_);
+
       interpreter::RecordEvent(instr_node, place_);
     } catch (platform::EnforceNotMet& ex) {
       framework::InsertCallStackInfo(op->Type(), op->Attrs(), &ex);
@@ -1135,16 +1157,17 @@ void InterpreterCore::Prepare(
 
   if (!is_build_) {
     paddle::framework::interpreter::build_variable_scope(
-        block_, &var_scope_, create_local_scope_);
+        block_, &var_scope_, execution_config_.create_local_scope);
     FeedInput();
     std::vector<paddle::framework::OpFuncNode> op_func_nodes;
-    paddle::framework::interpreter::build_op_func_list(place_,
-                                                       block_,
-                                                       skip_gc_vars_,
-                                                       &op_func_nodes,
-                                                       &var_scope_,
-                                                       create_local_scope_,
-                                                       used_for_jit_);
+    paddle::framework::interpreter::build_op_func_list(
+        place_,
+        block_,
+        execution_config_.skip_gc_vars,
+        &op_func_nodes,
+        &var_scope_,
+        execution_config_.create_local_scope,
+        execution_config_.used_for_jit);
     is_build_ = true;
     SetFeedVarsInplaceSkip(feed_names);
     // convert vec func_list to graph
diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h
index 86f3768c54..eca8ad1b2f 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.h
+++ b/paddle/fluid/framework/new_executor/interpretercore.h
@@ -23,6 +23,7 @@
 #include "paddle/fluid/framework/new_executor/event_manager.h"
 #include "paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.h"
 #include "paddle/fluid/framework/new_executor/interpreter/dependency_builder.h"
+#include "paddle/fluid/framework/new_executor/interpreter/execution_config.h"
 #include "paddle/fluid/framework/new_executor/interpretercore_util.h"
 #include "paddle/fluid/framework/new_executor/new_executor_defs.h"
 #include "paddle/fluid/framework/new_executor/profiler.h"
@@ -67,7 +68,8 @@ class InterpreterCore {
   void reset_scope(Scope* new_scope);
 
  private:
-  bool BuildInplaceCheckVarIsOnlyInput(size_t var_index);
+  bool BuildInplaceCheckVarIsOnlyInput(
+      const std::vector<std::vector<size_t>>& input_var2op, size_t var_index);
 
   std::shared_ptr<interpreter::AsyncWorkQueue> GetWorkQueue();
 
@@ -112,9 +114,9 @@ class InterpreterCore {
   platform::Place place_;
   const BlockDesc& block_;  // not owned
 
-  std::set<std::string> skip_gc_vars_;
 
   interpreter::DependencyBuilder dependency_builder_;
+  interpreter::ExecutionConfig execution_config_;
 
   // NOTE(zhiqiu): when add fetch ops in GetInterpreterCore, we will
   // copy a new program and block, the copy_program_ here is used to
@@ -134,10 +136,7 @@ class InterpreterCore {
   std::vector<size_t> dependecy_count_;
   std::atomic<size_t> unfinished_op_numer_{0};
 
-  std::vector<std::vector<size_t>> input_var2op_info_;
-
   VariableScope var_scope_;
-  bool create_local_scope_{true};
   Scope* local_scope_{nullptr};  // not owned
 
   StreamAnalyzer stream_analyzer_;
@@ -151,8 +150,6 @@ class InterpreterCore {
 
   std::future<std::unique_ptr<AtomicVectorSizeT>> atomic_deps_;
   std::future<std::unique_ptr<AtomicVectorSizeT>> atomic_var_ref_;
-
-  bool used_for_jit_{false};
 };
 
 std::shared_ptr<InterpreterCore> CreateInterpreterCore(
diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.cc b/paddle/fluid/framework/new_executor/interpretercore_util.cc
index edd5f76987..934d29591d 100644
--- a/paddle/fluid/framework/new_executor/interpretercore_util.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore_util.cc
@@ -11,6 +11,7 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
+
 #include "paddle/fluid/framework/new_executor/interpretercore_util.h"
 
 #include <algorithm>
@@ -18,6 +19,7 @@
 #include "paddle/fluid/framework/details/nan_inf_utils.h"
 #include "paddle/fluid/framework/executor_gc_helper.h"
 #include "paddle/fluid/framework/new_executor/data_transfer.h"
+#include "paddle/fluid/memory/stats.h"
 #include "paddle/fluid/operators/controlflow/conditional_block_op_helper.h"
 #include "paddle/fluid/operators/controlflow/recurrent_op_helper.h"
 #include "paddle/fluid/operators/controlflow/while_op_helper.h"
@@ -34,6 +36,11 @@ PADDLE_DEFINE_EXPORTED_bool(
     false,
     "Enable serial execution for standalone executor, used for debug.");
 
+PADDLE_DEFINE_EXPORTED_bool(
+    new_executor_log_memory_stats,
+    false,
+    "Log memory stats after each op runs, just used for debug.");
+
 DECLARE_bool(use_mkldnn);
 DECLARE_bool(check_nan_inf);
@@ -135,6 +142,21 @@ std::unique_ptr<AtomicVectorSizeT> PrepareAtomicVarRef(
   return var_ref;
 }
 
+void LogDeviceMemoryStats(const platform::Place& place) {
+  if (FLAGS_new_executor_log_memory_stats && platform::is_gpu_place(place)) {
+    VLOG(0) << "memory_allocated: "
+            << static_cast<float>(memory::DeviceMemoryStatCurrentValue(
+                   "Allocated", place.device)) /
+                   1024 / 1024
+            << " MB";
+    VLOG(0) << "max_memory_allocated: "
+            << static_cast<float>(memory::DeviceMemoryStatPeakValue(
+                   "Allocated", place.device)) /
+                   1024 / 1024
+            << " MB";
+  }
+}
+
 bool var_can_be_deleted(const std::string& name, const BlockDesc& block) {
   auto* var_desc = block.FindVar(name);
   if (var_desc == nullptr || var_desc->Persistable()) {
@@ -629,18 +651,14 @@ void build_op_func_list(const platform::Place& place,
 
       // step 5. run kernel
       if (run_phi_kernel) {
-        VLOG(1) << "start run phi kernel. ";
         phi::KernelContext phi_kernel_context;
         op_with_kernel->BuildPhiKernelContext(
             runtime_context, dev_ctx, &phi_kernel_context);
         (*op_func_node.phi_kernel_)(&phi_kernel_context);
-        VLOG(1) << "end run phi kernel. ";
       } else {
-        VLOG(4) << "start run kernel. ";
         // the place of exec_ctx maybe has changed.
         op_func_node.kernel_func_(ExecutionContext(
             *op_with_kernel, *runtime_scope, *dev_ctx, runtime_context));
-        VLOG(4) << "end run kernel. ";
       }
 
       // post-process grad_op.outputs if need cast complex grad into real
@@ -702,6 +720,7 @@ void build_op_func_list(const platform::Place& place,
     // gc---------------------------------------------------------------------------
     auto iter = unused_var_map.find(op);
     if (iter == unused_var_map.end()) {
+      interpreter::LogDeviceMemoryStats(place);
       continue;
     }
 
@@ -722,6 +741,8 @@ void build_op_func_list(const platform::Place& place,
       }
     }
     delete garbages;  // free mem
+
+    interpreter::LogDeviceMemoryStats(place);
   }
 }
 
diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.h b/paddle/fluid/framework/new_executor/interpretercore_util.h
index 8fc0dcb266..ec2bbbbd1f 100644
--- a/paddle/fluid/framework/new_executor/interpretercore_util.h
+++ b/paddle/fluid/framework/new_executor/interpretercore_util.h
@@ -44,7 +44,6 @@ constexpr size_t kPrepareWorkQueueIdx = 2;
 namespace paddle {
 namespace framework {
 namespace interpreter {
-
 class AsyncWorkQueue {
  public:
   AsyncWorkQueue(size_t host_num_threads,
@@ -77,6 +76,8 @@ std::unique_ptr<AtomicVectorSizeT> PrepareAtomicDeps(
 std::unique_ptr<AtomicVectorSizeT> PrepareAtomicVarRef(
     const std::vector<VariableMetaInfo>& vec_meta_info);
 
+void LogDeviceMemoryStats(const platform::Place& place);
+
 void build_variable_scope(const framework::BlockDesc& block,
                           VariableScope* var_scope,
                           bool use_local_scope = true);
-- 
GitLab
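
Review notes. The sketches below are stand-alone C++ models of the mechanisms this patch touches; each compiles on its own. None of them is Paddle source, and any constant, type, or variable name they introduce beyond what the diff shows is illustrative only.

What the new ExecutionConfig buys: four loose InterpreterCore members (skip_gc_vars_, create_local_scope_, used_for_jit_, plus the thread counts) collapse into one struct that is populated once in the constructor and logged in one place. A minimal model of that consolidation, with a plain bool standing in for the gflag and std::ostream for VLOG (field names, including the deivce_num_threads spelling, are copied from the patch; the skip-gc var names are made up):

#include <cstddef>
#include <iostream>
#include <set>
#include <string>

struct ExecutionConfig {
  bool used_for_jit{false};
  bool create_local_scope{true};
  size_t host_num_threads{1};
  size_t deivce_num_threads{1};  // (sic) spelling matches the patch
  size_t prepare_num_threads{1};
  std::set<std::string> skip_gc_vars;

  // One place to dump the whole execution configuration, as the patch's
  // ExecutionConfig::Log(log_level) does via VLOG.
  void Log(std::ostream& os) const {
    os << "ExecutionConfig:\n"
       << "used_for_jit = " << used_for_jit << "\n"
       << "create_local_scope = " << create_local_scope << "\n";
    for (const std::string& var : skip_gc_vars) os << "skip_gc_var: " << var << "\n";
  }
};

int main() {
  ExecutionConfig cfg;
  // Mirrors the constructor logic: JIT mode forces a single shared scope.
  bool used_for_jit = true, flag_use_local_scope = true;
  cfg.used_for_jit = used_for_jit;
  cfg.create_local_scope = !used_for_jit && flag_use_local_scope;
  cfg.skip_gc_vars = {"fetch_vars", "feed_data"};  // illustrative names
  cfg.Log(std::cerr);
}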
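GetThreadPoolConfig itself is mostly untouched (it moves into execution_config.cc and takes phi::Place by const reference), but its contract is worth pinning down: three pool sizes derived from the place and the op count, with an async-prepare thread only for large programs (kMinOpNumForAsyncPrepare = 1000 appears in the hunk context). A toy version under assumed constants; the real per-device branching is richer than this:

#include <cstdio>
#include <tuple>

// Assumed defaults; the real values live in execution_config.cc.
static constexpr int kHostNumThreads = 4;
static constexpr int kDeviceNumThreads = 1;
static constexpr size_t kMinOpNumForAsyncPrepare = 1000;

// Returns (host, device, prepare) thread counts for a program of op_num ops.
std::tuple<int, int, int> GetThreadPoolConfig(bool is_gpu, size_t op_num) {
  int host = kHostNumThreads;
  int device = is_gpu ? kDeviceNumThreads : 0;
  // Async prepare only pays off for big programs; small ones stay synchronous.
  int prepare = op_num >= kMinOpNumForAsyncPrepare ? 1 : 0;
  return std::make_tuple(host, device, prepare);
}

int main() {
  int host, device, prepare;
  std::tie(host, device, prepare) = GetThreadPoolConfig(true, 1500);
  std::printf("host=%d device=%d prepare=%d\n", host, device, prepare);
}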
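On the dependency side, the FLAGS_new_executor_sequential_run read moves out of DependencyBuilder::Build() into the caller (BuildOperatorDependences() passes it as is_sequential_run), keeping the builder flag-free and unit-testable. What AddDependencyForSequentialRun() must produce is roughly the following: extra i -> i+1 edges in program order on top of the real data dependencies (the actual pass is selective; per the flag text it only chains GPU OPs):

#include <cstdio>
#include <map>
#include <set>

int main() {
  int op_num = 4;
  std::map<int, std::set<int>> downstream;  // op -> ops that must run after it
  // Imagine Build() already found a real data dependency, e.g. 0 -> 2:
  downstream[0].insert(2);
  // Sequential run: additionally chain every op to its program-order successor.
  for (int i = 0; i + 1 < op_num; ++i) downstream[i].insert(i + 1);
  for (auto& kv : downstream) {
    for (int d : kv.second) std::printf("%d -> %d\n", kv.first, d);
  }
}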
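The VLOG(8) -> VLOG(10) change sits in BuildOpHappensBefore(), whose job is the reachability closure of that downstream map: op A "happens before" op B iff B is reachable from A. A BFS per source op reproduces exactly the pairs that log line prints; sketch with a hand-rolled graph:

#include <cstdio>
#include <map>
#include <queue>
#include <set>
#include <vector>

int main() {
  int op_num = 4;
  std::map<int, std::set<int>> downstream{{0, {1}}, {1, {3}}, {2, {3}}};
  std::vector<std::vector<bool>> happens_before(
      op_num, std::vector<bool>(op_num, false));
  for (int src = 0; src < op_num; ++src) {
    std::queue<int> q;
    q.push(src);
    while (!q.empty()) {
      int cur = q.front();
      q.pop();
      for (int next : downstream[cur]) {
        if (!happens_before[src][next]) {
          happens_before[src][next] = true;  // the VLOG(10) "happens before" spot
          q.push(next);
        }
      }
    }
  }
  std::printf("0 happens before 3: %d\n", (int)happens_before[0][3]);  // prints 1
}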
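The actual last-live-op bug: the old Convert() collected gc-check candidates from op inputs only, then patched up never-read outputs in a separate pass guarded by input_var2op_info_.at(var_id).size() == 0. A var that is read by an early op and then *written* by a later op slips through both paths: its recorded last-live op is the early reader, so it can be garbage-collected before the later writer runs. Scanning the union of inputs and outputs (the ins_and_outs multimap in the patch) closes that hole. A model of exactly that shape:

#include <cstdio>
#include <map>
#include <set>
#include <vector>

struct Op {
  std::set<int> inputs;
  std::set<int> outputs;
};

// Fixed scheme: every op that touches a var (read or write) is a candidate
// last-live op for it, mirroring gc_check_vars built from ins_and_outs.
// (Paddle additionally skips no_need_buffer inputs and later shrinks each set
// using the happens-before relation; both are omitted here.)
std::map<int, std::set<int>> LastLiveOps(const std::vector<Op>& ops) {
  std::map<int, std::set<int>> last_live;
  for (int idx = 0; idx < static_cast<int>(ops.size()); ++idx) {
    std::set<int> gc_check_vars = ops[idx].inputs;
    gc_check_vars.insert(ops[idx].outputs.begin(), ops[idx].outputs.end());
    for (int var : gc_check_vars) last_live[var].insert(idx);
  }
  return last_live;
}

int main() {
  // op0 writes v0; op1 reads v0; op2 overwrites v0 as an output.
  // Old input-only scan: last_live[v0] = {op1}, and the "no consumers"
  // fallback never fires because v0 has a consumer -- so v0 could be freed
  // before op2 writes it. The union scan keeps op2 in the set.
  std::vector<Op> ops = {{{}, {0}}, {{0}, {1}}, {{}, {0}}};
  for (const auto& kv : LastLiveOps(ops)) {
    std::printf("var %d: ", kv.first);
    for (int op : kv.second) std::printf("op%d ", op);
    std::printf("\n");
  }
}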
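Related cleanup: input_var2op_info_ disappears as a member; BuildInplace() now derives the var -> consumer-ops index locally, right before the only code that needs it, and hands it to BuildInplaceCheckVarIsOnlyInput(). The index is a plain reverse map; sketch with int var ids (kEmptyVarIndex is assumed here to be the zero sentinel that the patch's inequality check implies):

#include <cstdio>
#include <vector>

constexpr int kEmptyVarIndex = 0;  // assumed sentinel value

int main() {
  // ops[i] = input var ids of op i; index 0 is the "empty" hole.
  std::vector<std::vector<int>> ops = {{1, 2}, {2, kEmptyVarIndex}, {1}};
  size_t var_count = 3;
  std::vector<std::vector<size_t>> input_var2op(var_count);
  for (size_t op_id = 0; op_id < ops.size(); ++op_id) {
    for (int var_id : ops[op_id]) {
      if (var_id != kEmptyVarIndex) input_var2op.at(var_id).push_back(op_id);
    }
  }
  // A var is an inplace candidate only if exactly one op consumes it:
  std::printf("var 1 single-consumer: %d\n",
              (int)(input_var2op[1].size() == 1));  // 0: ops 0 and 2 read it
}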
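Finally, the new debug flag: with FLAGS_new_executor_log_memory_stats on, LogDeviceMemoryStats() prints current and peak allocated device memory (bytes scaled to MB) after each op, both during build_op_func_list and in RunInstructionAsync. Modeled here with a plain bool and caller-supplied byte counts instead of the gflag and the memory::DeviceMemoryStat* queries:

#include <cstdint>
#include <cstdio>

bool log_memory_stats = true;  // stand-in for the gflag

// Same bytes-to-MB scaling as the patch (static_cast to float, / 1024 / 1024).
void LogDeviceMemoryStats(int64_t current_bytes, int64_t peak_bytes) {
  if (!log_memory_stats) return;
  std::printf("memory_allocated: %.2f MB\n",
              static_cast<float>(current_bytes) / 1024 / 1024);
  std::printf("max_memory_allocated: %.2f MB\n",
              static_cast<float>(peak_bytes) / 1024 / 1024);
}

int main() { LogDeviceMemoryStats(300 << 20, 512 << 20); }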