From 2fff5a58ba136cd7a16ff76413168ff1ace17c50 Mon Sep 17 00:00:00 2001
From: Aurelius84
Date: Sat, 18 Sep 2021 10:43:04 +0800
Subject: [PATCH] Clean ParseMemInfo and Fix unittest failed under
 multi-thread (#35840)

* Clean ParaseMemInfo and fix unittest with multi-thread

* fix declare
---
 .../framework/new_executor/interpretercore.cc | 19 +++++----------
 .../framework/new_executor/interpretercore.h  |  5 ++--
 .../fluid/framework/new_executor/profiler.h   | 24 +++----------------
 paddle/fluid/pybind/pybind.cc                 |  6 +----
 .../interpreter/test_standalone_executor.py   |  7 ------
 5 files changed, 12 insertions(+), 49 deletions(-)

diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc
index b8bb6d21ebc..2da9c275c3d 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore.cc
@@ -324,16 +324,15 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
 }
 
 void InterpreterCore::ExecuteInstructionList(
-    const std::vector<Instruction>& vec_instr, bool is_dry_run) {
+    const std::vector<Instruction>& vec_instr) {
   auto atomic_deps = async_work_queue_.PrepareAtomicDeps(dependecy_count_);
   auto atomic_var_ref = async_work_queue_.PrepareAtomicVarRef(vec_meta_info_);
   std::atomic<size_t> op_run_number{0};
 
   for (size_t i = 0; i < dependecy_count_.size(); ++i) {
     if (dependecy_count_[i] == 0) {
-      async_work_queue_.AddTask(vec_instr[i].type_, [&, i, is_dry_run]() {
-        RunInstructionAsync(i, &atomic_deps, &atomic_var_ref, &op_run_number,
-                            is_dry_run);
+      async_work_queue_.AddTask(vec_instr[i].type_, [&, i]() {
+        RunInstructionAsync(i, &atomic_deps, &atomic_var_ref, &op_run_number);
       });
     }
   }
@@ -350,8 +349,7 @@ void InterpreterCore::ExecuteInstructionList(
 void InterpreterCore::RunInstructionAsync(size_t instr_id,
                                           AtomicVectorSizeT* atomic_deps,
                                           AtomicVectorSizeT* atomic_var_ref,
-                                          std::atomic<size_t>* op_run_number,
-                                          bool is_dry_run) {
+                                          std::atomic<size_t>* op_run_number) {
   auto& instr_node = vec_instruction_[instr_id];
   event_manager_.WaitEvent(instr_node, place_);
 
@@ -360,10 +358,6 @@ void InterpreterCore::RunInstructionAsync(size_t instr_id,
   event_manager_.RecordEvent(instr_node, place_);
   op_run_number->fetch_add(1, std::memory_order_relaxed);
 
-  if (is_dry_run) {
-    dry_run_profiler_.ParseMemoryInfo(global_scope_->var_list);
-  }
-
   auto& next_instr = instr_node.next_instruction_.all_next_ops_;
 
   for (auto next_i : next_instr) {
@@ -372,8 +366,7 @@ void InterpreterCore::RunInstructionAsync(size_t instr_id,
         atomic_deps->at(next_i)->fetch_sub(1, std::memory_order_relaxed) == 1;
     if (is_ready) {
       async_work_queue_.AddTask(vec_instruction_[next_i].type_, [=]() {
-        RunInstructionAsync(next_i, atomic_deps, atomic_var_ref, op_run_number,
-                            is_dry_run);
+        RunInstructionAsync(next_i, atomic_deps, atomic_var_ref, op_run_number);
       });
     }
   }
@@ -433,7 +426,7 @@ const CostInfo& InterpreterCore::DryRun(
   // DryRun may be called many times.
   dry_run_profiler_.Reset();
   dry_run_profiler_.Start();
-  ExecuteInstructionList(vec_instruction_, /*is_dry_run=*/true);
+  ExecuteInstructionList(vec_instruction_);
   platform::DeviceContextPool::Instance().Get(place_)->Wait();
 
   dry_run_profiler_.Pause();
diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h
index 371f5bba2af..d7fb6b7fd91 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.h
+++ b/paddle/fluid/framework/new_executor/interpretercore.h
@@ -59,8 +59,7 @@ class InterpreterCore {
 
   void RunInstruction(const Instruction& instr_node);
 
-  void ExecuteInstructionList(const std::vector<Instruction>& vec_instr,
-                              bool is_dry_run = false);
+  void ExecuteInstructionList(const std::vector<Instruction>& vec_instr);
 
   void DryRunPrepare(const std::vector<framework::Tensor>& feed_tensors);
 
@@ -70,7 +69,7 @@ class InterpreterCore {
   void RunInstructionAsync(size_t instr_id,
                            AtomicVectorSizeT* working_dependecy_count,
                            AtomicVectorSizeT* working_var_ref,
-                           std::atomic<size_t>* op_run_number, bool is_dry_run);
+                           std::atomic<size_t>* op_run_number);
   void AddFetch(const std::vector<std::string>& fetch_names);
   void BuildSkipShareLoDInfo();
 
diff --git a/paddle/fluid/framework/new_executor/profiler.h b/paddle/fluid/framework/new_executor/profiler.h
index 5d8ec05b7f2..77783535b64 100644
--- a/paddle/fluid/framework/new_executor/profiler.h
+++ b/paddle/fluid/framework/new_executor/profiler.h
@@ -64,10 +64,8 @@ static std::pair<size_t, size_t> GetTensorMemorySize(
 }
 
 struct CostInfo {
-  double total_time{0.};                // ms
-  size_t host_memory_bytes{0};          // bytes
-  size_t device_memory_bytes{0};        // bytes
-  size_t device_total_memory_bytes{0};  // total allocated memory size
+  double total_time{0.};          // ms
+  size_t device_memory_bytes{0};  // total allocated memory size
 };
 
 class InterpreterProfiler {
@@ -82,30 +80,14 @@ class InterpreterProfiler {
   void Reset() {
     timer_.Reset();
     cost_info_.total_time = 0.;
-    cost_info_.host_memory_bytes = 0;
     cost_info_.device_memory_bytes = 0;
-    cost_info_.device_total_memory_bytes = 0;
-  }
-
-  void ParseMemoryInfo(const std::vector<Variable*>& vars) {
-    timer_.Start();
-    auto memory_info = GetTensorMemorySize(vars);
-    VLOG(3) << "host memory size: " << memory_info.first;
-    cost_info_.host_memory_bytes =
-        std::max(cost_info_.host_memory_bytes, memory_info.first);
-
-    VLOG(3) << "device memory size: " << memory_info.second;
-    cost_info_.device_memory_bytes =
-        std::max(cost_info_.device_memory_bytes, memory_info.second);
-    timer_.Pause();
-    cost_info_.total_time -= timer_.ElapsedMS();
   }
 
   void TotalCUDAAllocatedMemorySize(const platform::Place& place) {
     if (platform::is_gpu_place(place)) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       auto cuda_place = BOOST_GET_CONST(platform::CUDAPlace, place);
-      cost_info_.device_total_memory_bytes =
+      cost_info_.device_memory_bytes =
           platform::RecordedCudaMallocSize(cuda_place.device);
 #endif
     }
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 5174306d722..e404f27a10d 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -1979,12 +1979,8 @@ All parameter, weight, gradient are variables in Paddle.
   py::class_<CostInfo>(m, "CostInfo")
       .def(py::init<>())
       .def("total_time", [](CostInfo &self) { return self.total_time; })
-      .def("host_memory_bytes",
-           [](CostInfo &self) { return self.host_memory_bytes; })
       .def("device_memory_bytes",
-           [](CostInfo &self) { return self.device_memory_bytes; })
-      .def("device_total_memory_bytes",
-           [](CostInfo &self) { return self.device_total_memory_bytes; });
+           [](CostInfo &self) { return self.device_memory_bytes; });
 
   py::class_<StandaloneExecutor>(m, "StandaloneExecutor")
       .def(py::init
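
---
Editor note (not part of the patch): after this change the Python-visible
CostInfo keeps only two accessors, total_time() and device_memory_bytes();
host_memory_bytes() and device_total_memory_bytes() are removed, and
device_memory_bytes() now reports the total recorded CUDA allocation size
gathered by InterpreterProfiler::TotalCUDAAllocatedMemorySize instead of a
per-op maximum sampled during the dry run. Below is a minimal consumer
sketch of the surviving surface; the `exe.dry_run(feed)` entry point is an
assumption and is not confirmed by this patch (the test_standalone_executor.py
hunk is truncated above and not reproduced here):

    # Hypothetical usage of the trimmed CostInfo bindings (Python side).
    def report_cost(cost_info):
        # The two bindings kept by this patch:
        print("total time (ms):", cost_info.total_time())
        print("device memory (bytes):", cost_info.device_memory_bytes())
        # Removed by this patch; calling them now raises AttributeError:
        # cost_info.host_memory_bytes()
        # cost_info.device_total_memory_bytes()

    # cost_info = exe.dry_run(feed)  # assumed dry-run entry point
    # report_cost(cost_info)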