From 8b2c906af3b14997dfe09ac215b89e711252a706 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Wed, 10 Nov 2021 15:01:58 +0800 Subject: [PATCH] Simplify constructor of InterpreterCore (#37072) * Simplify constructor of InterpreterCore * fix bool * clean code --- .../framework/new_executor/event_manager.cc | 9 +- .../framework/new_executor/event_manager.h | 11 +- .../framework/new_executor/interpretercore.cc | 102 ++++++++---------- .../framework/new_executor/interpretercore.h | 28 +++-- .../new_executor/new_executor_defs.h | 6 ++ .../fluid/framework/new_executor/profiler.h | 73 +++---------- .../new_executor/standalone_executor.cc | 11 +- .../new_executor/standalone_executor.h | 5 +- paddle/fluid/pybind/pybind.cc | 12 ++- 9 files changed, 100 insertions(+), 157 deletions(-) diff --git a/paddle/fluid/framework/new_executor/event_manager.cc b/paddle/fluid/framework/new_executor/event_manager.cc index 87caff8c57..a45f65d264 100644 --- a/paddle/fluid/framework/new_executor/event_manager.cc +++ b/paddle/fluid/framework/new_executor/event_manager.cc @@ -16,9 +16,8 @@ namespace paddle { namespace framework { - -void EventManager::WaitEvent(const Instruction& instruction, - const platform::Place& place) { +namespace interpreter { +void WaitEvent(const Instruction& instruction, const platform::Place& place) { // If InterpreterCore in on CPUPlace, do nothing. if (platform::is_cpu_place(place)) return; @@ -32,8 +31,7 @@ void EventManager::WaitEvent(const Instruction& instruction, } } -void EventManager::RecordEvent(const Instruction& instruction, - const platform::Place& place) { +void RecordEvent(const Instruction& instruction, const platform::Place& place) { // If InterpreterCore in on CPUPlace, do nothing. if (platform::is_cpu_place(place)) return; @@ -43,5 +41,6 @@ void EventManager::RecordEvent(const Instruction& instruction, } } +} // namespace interpreter } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/event_manager.h b/paddle/fluid/framework/new_executor/event_manager.h index d23c240469..a949ae1440 100644 --- a/paddle/fluid/framework/new_executor/event_manager.h +++ b/paddle/fluid/framework/new_executor/event_manager.h @@ -17,14 +17,11 @@ namespace paddle { namespace framework { +namespace interpreter { +void RecordEvent(const Instruction& instruction, const platform::Place& place); -class EventManager { - public: - void RecordEvent(const Instruction& instruction, - const platform::Place& place); - - void WaitEvent(const Instruction& instruction, const platform::Place& place); -}; +void WaitEvent(const Instruction& instruction, const platform::Place& place); +} // namespace interpreter } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index ea3e7dd411..89810fd303 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -33,9 +33,9 @@ namespace framework { // NOTE(Aurelius84): Need a better strategy to determine it. static constexpr size_t kHostNumThreads = 4; -InterpreterCore::InterpreterCore(const platform::Place& place, BlockDesc* block, - VariableScope* global_scope, - const std::vector& feed_names) +InterpreterCore::InterpreterCore(const platform::Place& place, + const BlockDesc& block, + VariableScope* global_scope) : place_(place), block_(block), global_scope_(global_scope), @@ -45,8 +45,6 @@ InterpreterCore::InterpreterCore(const platform::Place& place, BlockDesc* block, new interpreter::AsyncWorkQueue(kHostNumThreads, &main_thread_blocker_)); gc_.reset(new InterpreterCoreGarbageCollector()); - feed_names_ = feed_names; - exception_notifier_ = main_thread_blocker_.RegisterEvent( kExceptionCaught, [this]() { return exception_holder_.IsCaught(); }); @@ -65,27 +63,12 @@ InterpreterCore::~InterpreterCore() { } paddle::framework::FetchList InterpreterCore::Run( + const std::vector& feed_names, const std::vector& feed_tensors) { - auto FeedInput = [&] { - for (size_t i = 0; i < feed_names_.size(); ++i) { - auto* feed_var = global_scope_->Var(feed_names_[i]); - auto feed_tensor = feed_var->GetMutable(); - feed_tensor->ShareDataWith(feed_tensors[i]); - feed_tensor->set_lod(feed_tensors[i].lod()); - } - }; + bool is_build = is_build_; + Prepare(feed_names, feed_tensors, is_build); - if (is_build_ == false) { - paddle::framework::interpreter::build_variable_scope(*block_, - global_scope_); - FeedInput(); - paddle::framework::interpreter::build_op_func_list( - place_, *block_, &vec_func_list_, global_scope_); - is_build_ = true; - // convert vec func_list to graph - Convert(); - } else { - FeedInput(); + if (is_build) { ExecuteInstructionList(vec_instruction_); } @@ -95,9 +78,9 @@ paddle::framework::FetchList InterpreterCore::Run( } void InterpreterCore::Convert() { + auto& vec_meta_info = global_scope_->MutableVecMetaInfo(); auto var_nums = global_scope_->VarSize(); input_var2op_info_.resize(var_nums); - vec_meta_info_.resize(var_nums); auto op_nums = vec_func_list_.size(); vec_instruction_.reserve(op_nums); @@ -136,7 +119,7 @@ void InterpreterCore::Convert() { gc_check_input_list.erase(last, gc_check_input_list.end()); for (auto var_id : gc_check_input_list) { - vec_meta_info_[var_id].var_ref_count_++; + vec_meta_info[var_id].var_ref_count_++; instr.AddGCCheckVar(var_id); } } @@ -148,7 +131,7 @@ void InterpreterCore::Convert() { if (input_var2op_info_.at(id).size() == 0) { // output var not be used by any kernel vec_instruction_[i].AddGCCheckVar(id); - vec_meta_info_[id].var_ref_count_++; + vec_meta_info[id].var_ref_count_++; } } } @@ -180,7 +163,7 @@ void InterpreterCore::Convert() { } for (size_t i = 0; i < vec_instruction_.size(); ++i) { - BuildAndCacheInstructionCtx(&vec_instruction_[i], *global_scope_, place_); + BuildAndCacheInstructionCtx(&vec_instruction_[i]); } BuildSkipShareLoDInfo(); @@ -248,16 +231,14 @@ void InterpreterCore::BuildInplace() { } } -void InterpreterCore::BuildAndCacheInstructionCtx( - Instruction* instr_node, const VariableScope& var_scope, - const platform::Place& place) { +void InterpreterCore::BuildAndCacheInstructionCtx(Instruction* instr_node) { VariableValueMap ins_map; for (auto& var_name_item : instr_node->Inputs()) { std::vector input_vars; input_vars.reserve(var_name_item.second.size()); for (auto& id : var_name_item.second) { - input_vars.emplace_back(var_scope.Var(id)); + input_vars.emplace_back(global_scope_->Var(id)); } ins_map.emplace(var_name_item.first, std::move(input_vars)); } @@ -268,7 +249,7 @@ void InterpreterCore::BuildAndCacheInstructionCtx( out_vars.reserve(var_name_item.second.size()); for (auto& id : var_name_item.second) { - out_vars.emplace_back(var_scope.Var(id)); + out_vars.emplace_back(global_scope_->Var(id)); } outs_map.emplace(var_name_item.first, std::move(out_vars)); } @@ -359,7 +340,7 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) { void InterpreterCore::ExecuteInstructionList( const std::vector& vec_instr) { async_work_queue_->PrepareAtomicDeps(dependecy_count_); - async_work_queue_->PrepareAtomicVarRef(vec_meta_info_); + async_work_queue_->PrepareAtomicVarRef(global_scope_->VecMetaInfo()); op_run_number_ = 0; exception_holder_.Clear(); @@ -452,7 +433,7 @@ void InterpreterCore::RunInstructionAsync(size_t instr_id) { auto& instr_node = vec_instruction_.at(instr_id); auto* op = instr_node.OpBase(); platform::RecordEvent instruction_event(op->Type()); - event_manager_.WaitEvent(instr_node, place_); + interpreter::WaitEvent(instr_node, place_); try { RunInstruction(instr_node); @@ -479,7 +460,7 @@ void InterpreterCore::RunInstructionAsync(size_t instr_id) { return; } - event_manager_.RecordEvent(instr_node, place_); + interpreter::RecordEvent(instr_node, place_); op_run_number_.fetch_add(1, std::memory_order_relaxed); // GC infomation @@ -508,11 +489,18 @@ void InterpreterCore::CheckGC(const Instruction& instr) { } } -void InterpreterCore::DryRunPrepare( - const std::vector& feed_tensors) { +void InterpreterCore::Prepare( + const std::vector& feed_names, + const std::vector& feed_tensors, bool prepare_feed) { + PADDLE_ENFORCE_EQ(feed_names.size(), feed_tensors.size(), + platform::errors::PreconditionNotMet( + "Required feed_names.size() == feed_tensors.size(), " + "but received %d != %d", + feed_names.size(), feed_tensors.size())); + auto FeedInput = [&] { - for (size_t i = 0; i < feed_names_.size(); ++i) { - auto* feed_var = global_scope_->FindVar(feed_names_[i]); + for (size_t i = 0; i < feed_names.size(); ++i) { + auto* feed_var = global_scope_->FindVar(feed_names[i]); PADDLE_ENFORCE_NOT_NULL(feed_var, platform::errors::NotFound( "feed_var shall not be nullptr.")); @@ -522,35 +510,33 @@ void InterpreterCore::DryRunPrepare( } }; - if (is_build_ == false) { - paddle::framework::interpreter::build_variable_scope(*block_, - global_scope_); + if (!is_build_) { + paddle::framework::interpreter::build_variable_scope(block_, global_scope_); FeedInput(); paddle::framework::interpreter::build_op_func_list( - place_, *block_, &vec_func_list_, global_scope_); + place_, block_, &vec_func_list_, global_scope_); is_build_ = true; // convert vec func_list to graph Convert(); } // NOTE: Because feed_tensor will be GC after // paddle::framework::build_op_func_list, so we should - // call - // FeedInput again. - FeedInput(); + // call FeedInput again. + if (prepare_feed) FeedInput(); } -const CostInfo& InterpreterCore::DryRun( +interpreter::CostInfo InterpreterCore::DryRun( + const std::vector& feed_names, const std::vector& feed_tensors) { - DryRunPrepare(feed_tensors); - // DryRun may be called many times. - dry_run_profiler_.Reset(); - dry_run_profiler_.Start(); - ExecuteInstructionList(vec_instruction_); - platform::DeviceContextPool::Instance().Get(place_)->Wait(); - - dry_run_profiler_.Pause(); - dry_run_profiler_.TotalCUDAAllocatedMemorySize(place_); - return dry_run_profiler_.GetCostInfo(); + Prepare(feed_names, feed_tensors, true); + interpreter::CostInfo cost_info; + { + interpreter::ProfilerGuard(place_, &cost_info); + ExecuteInstructionList(vec_instruction_); + platform::DeviceContextPool::Instance().Get(place_)->Wait(); + } + + return cost_info; } } // namespace framework diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h index 3a6876a912..915ae782e2 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.h +++ b/paddle/fluid/framework/new_executor/interpretercore.h @@ -40,23 +40,23 @@ using AtomicVectorSizeT = std::vector>>; class InterpreterCore { public: - InterpreterCore(const platform::Place& place, BlockDesc* block, - VariableScope* global_scope, - const std::vector& feed_names); + InterpreterCore(const platform::Place& place, const BlockDesc& block, + VariableScope* global_scope); ~InterpreterCore(); paddle::framework::FetchList Run( + const std::vector& feed_names, const std::vector& feed_tensors); - const CostInfo& DryRun(const std::vector& feed_tensors); + interpreter::CostInfo DryRun( + const std::vector& feed_names, + const std::vector& feed_tensors); private: void Convert(); - void BuildAndCacheInstructionCtx(Instruction* instr_node, - const VariableScope& var_scope, - const platform::Place& place); + void BuildAndCacheInstructionCtx(Instruction* instr_node); void BuildInplace(); @@ -66,7 +66,9 @@ class InterpreterCore { void ExecuteInstructionList(const std::vector& vec_instr); - void DryRunPrepare(const std::vector& feed_tensors); + void Prepare(const std::vector& feed_names, + const std::vector& feed_tensors, + bool prepare_feed); void CheckGC(const Instruction& instr); @@ -79,22 +81,17 @@ class InterpreterCore { bool is_build_; const platform::Place& place_; - BlockDesc* block_; // not owned + const BlockDesc& block_; // not owned VariableScope* global_scope_; // not owned std::vector vec_func_list_; std::vector vec_instruction_; // deconstruct before OpFuncNode - InstructionInfo instruction_info_; std::vector dependecy_count_; + std::atomic op_run_number_{0}; std::vector> input_var2op_info_; - std::vector vec_meta_info_; - std::vector feed_names_; - - InterpreterProfiler dry_run_profiler_; StreamAnalyzer stream_analyzer_; - EventManager event_manager_; EventsWaiter main_thread_blocker_; std::unique_ptr async_work_queue_; details::ExceptionHolder exception_holder_; @@ -102,7 +99,6 @@ class InterpreterCore { std::unique_ptr gc_; std::vector gc_event_; - std::atomic op_run_number_{0}; }; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.h b/paddle/fluid/framework/new_executor/new_executor_defs.h index f4832cb283..37fb57072f 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.h +++ b/paddle/fluid/framework/new_executor/new_executor_defs.h @@ -607,6 +607,12 @@ class VariableScope : public ScopeBase { platform::errors::NotFound("%s not in VariableScope.", name)); } + std::vector& MutableVecMetaInfo() { return vec_meta_info_; } + + const std::vector& VecMetaInfo() const { + return vec_meta_info_; + } + private: std::vector var_list_; std::map name2id_; diff --git a/paddle/fluid/framework/new_executor/profiler.h b/paddle/fluid/framework/new_executor/profiler.h index 77783535b6..51c9e3d66a 100644 --- a/paddle/fluid/framework/new_executor/profiler.h +++ b/paddle/fluid/framework/new_executor/profiler.h @@ -20,84 +20,41 @@ namespace paddle { namespace framework { - -static void GetTensors(Variable* var, std::unordered_set* tensor_set) { - if (var->IsType() && var->Get().IsInitialized()) { - tensor_set->insert(var->GetMutable()); - } else if (var->IsType() && - var->Get().value().IsInitialized()) { - tensor_set->insert(var->GetMutable()->mutable_value()); - } else if (var->IsType()) { - auto* tensor_arr = var->GetMutable(); - for (auto& t : *tensor_arr) { - if (t.IsInitialized()) { - tensor_set->insert(&t); - } - } - } -} - -static std::pair GetTensorMemorySize( - const std::vector& var_list) { - std::unordered_set tensor_set; - for (auto* var : var_list) { - GetTensors(var, &tensor_set); - } - size_t host_memory_bytes = 0; - size_t device_memory_bytes = 0; - std::unordered_set allocation_set; - for (auto* tensor : tensor_set) { - auto allocation = tensor->Holder().get(); - if (!allocation_set.count(allocation)) { - allocation_set.insert(allocation); - if (platform::is_cuda_pinned_place(tensor->place()) || - platform::is_cpu_place(tensor->place())) { - VLOG(3) << "found host memory : " << allocation->size(); - host_memory_bytes += allocation->size(); - } else { - VLOG(3) << "found device memory : " << allocation->size(); - device_memory_bytes += allocation->size(); - } - } - } - return {host_memory_bytes, device_memory_bytes}; -} - +namespace interpreter { struct CostInfo { double total_time{0.}; // ms size_t device_memory_bytes{0}; // total allocated memory size }; -class InterpreterProfiler { +class ProfilerGuard { public: - void Start() { timer_.Start(); } - - void Pause() { - timer_.Pause(); - cost_info_.total_time += timer_.ElapsedMS(); + ProfilerGuard(const platform::Place& place, CostInfo* cost_info) + : place_(place), cost_info_(cost_info) { + timer_.Start(); } - void Reset() { - timer_.Reset(); - cost_info_.total_time = 0.; - cost_info_.device_memory_bytes = 0; + ~ProfilerGuard() { + timer_.Pause(); + cost_info_->total_time += timer_.ElapsedMS(); + TotalCUDAAllocatedMemorySize(place_); } + private: void TotalCUDAAllocatedMemorySize(const platform::Place& place) { if (platform::is_gpu_place(place)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto cuda_place = BOOST_GET_CONST(platform::CUDAPlace, place); - cost_info_.device_memory_bytes = + cost_info_->device_memory_bytes = platform::RecordedCudaMallocSize(cuda_place.device); #endif } } - const CostInfo& GetCostInfo() const { return cost_info_; } - - private: + const platform::Place& place_; + CostInfo* cost_info_; platform::Timer timer_; - CostInfo cost_info_; }; + +} // namespace interpreter } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index 6d3b531c50..e3bcbaec7a 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -51,16 +51,15 @@ paddle::framework::FetchList StandaloneExecutor::Run( const std::vector& fetch_names) { auto core = GetInterpreterCore(feed_names, fetch_names); - return core->Run(feed_tensors); + return core->Run(feed_names, feed_tensors); } -const CostInfo& StandaloneExecutor::DryRun( +framework::interpreter::CostInfo StandaloneExecutor::DryRun( const std::vector& feed_names, const std::vector& feed_tensors) { auto core = GetInterpreterCore(feed_names, {}); - auto& cost_info = core->DryRun(feed_tensors); - return cost_info; + return core->DryRun(feed_names, feed_tensors); } void StandaloneExecutor::BuildVariableOuterScope( @@ -102,8 +101,8 @@ std::shared_ptr StandaloneExecutor::GetInterpreterCore( auto* block = new_prog->MutableBlock(0); interpreter::add_fetch(fetch_names, block); - auto core = std::make_shared(place_, block, &global_scope_, - feed_names); + auto core = + std::make_shared(place_, *block, &global_scope_); programs_.emplace(oss.str(), new_prog); interpretercores_.emplace(oss.str(), core); return core; diff --git a/paddle/fluid/framework/new_executor/standalone_executor.h b/paddle/fluid/framework/new_executor/standalone_executor.h index 6d48a85142..1fbdf7b4b0 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.h +++ b/paddle/fluid/framework/new_executor/standalone_executor.h @@ -45,8 +45,9 @@ class StandaloneExecutor : public ExecutorBase { const std::vector& feed_tensors, const std::vector& fetch_names); - const CostInfo& DryRun(const std::vector& feed_names, - const std::vector& feed_tensors); + framework::interpreter::CostInfo DryRun( + const std::vector& feed_names, + const std::vector& feed_tensors); private: void BuildVariableOuterScope(const framework::ProgramDesc& pdesc, diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 050a25c478..a5b0b1cd2a 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -2069,11 +2069,13 @@ All parameter, weight, gradient are variables in Paddle. fetch_vars); }); - py::class_(m, "CostInfo") + py::class_(m, "CostInfo") .def(py::init<>()) - .def("total_time", [](CostInfo &self) { return self.total_time; }) - .def("device_memory_bytes", - [](CostInfo &self) { return self.device_memory_bytes; }); + .def("total_time", + [](interpreter::CostInfo &self) { return self.total_time; }) + .def("device_memory_bytes", [](interpreter::CostInfo &self) { + return self.device_memory_bytes; + }); py::class_(m, "StandaloneExecutor") .def(py::init