Unverified commit 2fff5a58, authored by Aurelius84 and committed by GitHub

Clean ParseMemInfo and fix unittest failure under multi-threading (#35840)

* Clean ParseMemInfo and fix unittest under multi-threading

* Fix declaration
Parent e4c2a854
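Background on the multi-thread problem (an inference from the code removed below, not a statement in the commit message): ParseMemoryInfo was invoked from RunInstructionAsync, which the async work queue runs on several worker threads, so its unsynchronized read-modify-write updates of the shared CostInfo fields and the shared timer could race. The following minimal, self-contained sketch illustrates that kind of lost-update race; CostInfoSketch and the thread counts are hypothetical and are not PaddlePaddle code.

```cpp
// Minimal sketch of a lost-update race on a shared "max memory" statistic.
// CostInfoSketch is a hypothetical stand-in for the profiler's struct.
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <thread>
#include <vector>

struct CostInfoSketch {
  std::size_t host_memory_bytes{0};  // plain field written by many threads
};

int main() {
  CostInfoSketch info;
  std::vector<std::thread> workers;
  for (int t = 1; t <= 8; ++t) {
    workers.emplace_back([&info, t] {
      const std::size_t sample = static_cast<std::size_t>(t) * 1024;
      // Unsynchronized max-update: two threads may read the same old value,
      // so one write can be lost; concurrent access here is a data race.
      info.host_memory_bytes = std::max(info.host_memory_bytes, sample);
    });
  }
  for (auto& w : workers) w.join();
  // May print less than the true maximum (8 * 1024) on some runs.
  std::cout << info.host_memory_bytes << std::endl;
  return 0;
}
```

The commit sidesteps the issue by deleting the per-instruction bookkeeping entirely; the profiler now reports only the allocator's recorded total via TotalCUDAAllocatedMemorySize (platform::RecordedCudaMallocSize).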
@@ -324,16 +324,15 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
 }
 
 void InterpreterCore::ExecuteInstructionList(
-    const std::vector<Instruction>& vec_instr, bool is_dry_run) {
+    const std::vector<Instruction>& vec_instr) {
   auto atomic_deps = async_work_queue_.PrepareAtomicDeps(dependecy_count_);
   auto atomic_var_ref = async_work_queue_.PrepareAtomicVarRef(vec_meta_info_);
   std::atomic<size_t> op_run_number{0};
 
   for (size_t i = 0; i < dependecy_count_.size(); ++i) {
     if (dependecy_count_[i] == 0) {
-      async_work_queue_.AddTask(vec_instr[i].type_, [&, i, is_dry_run]() {
-        RunInstructionAsync(i, &atomic_deps, &atomic_var_ref, &op_run_number,
-                            is_dry_run);
+      async_work_queue_.AddTask(vec_instr[i].type_, [&, i]() {
+        RunInstructionAsync(i, &atomic_deps, &atomic_var_ref, &op_run_number);
       });
     }
   }
@@ -350,8 +349,7 @@ void InterpreterCore::ExecuteInstructionList(
 void InterpreterCore::RunInstructionAsync(size_t instr_id,
                                           AtomicVectorSizeT* atomic_deps,
                                           AtomicVectorSizeT* atomic_var_ref,
-                                          std::atomic<size_t>* op_run_number,
-                                          bool is_dry_run) {
+                                          std::atomic<size_t>* op_run_number) {
   auto& instr_node = vec_instruction_[instr_id];
   event_manager_.WaitEvent(instr_node, place_);
@@ -360,10 +358,6 @@ void InterpreterCore::RunInstructionAsync(size_t instr_id,
   event_manager_.RecordEvent(instr_node, place_);
   op_run_number->fetch_add(1, std::memory_order_relaxed);
 
-  if (is_dry_run) {
-    dry_run_profiler_.ParseMemoryInfo(global_scope_->var_list);
-  }
-
   auto& next_instr = instr_node.next_instruction_.all_next_ops_;
 
   for (auto next_i : next_instr) {
@@ -372,8 +366,7 @@ void InterpreterCore::RunInstructionAsync(size_t instr_id,
         atomic_deps->at(next_i)->fetch_sub(1, std::memory_order_relaxed) == 1;
     if (is_ready) {
       async_work_queue_.AddTask(vec_instruction_[next_i].type_, [=]() {
-        RunInstructionAsync(next_i, atomic_deps, atomic_var_ref, op_run_number,
-                            is_dry_run);
+        RunInstructionAsync(next_i, atomic_deps, atomic_var_ref, op_run_number);
       });
     }
   }
@@ -433,7 +426,7 @@ const CostInfo& InterpreterCore::DryRun(
   // DryRun may be called many times.
   dry_run_profiler_.Reset();
   dry_run_profiler_.Start();
-  ExecuteInstructionList(vec_instruction_, /*is_dry_run=*/true);
+  ExecuteInstructionList(vec_instruction_);
   platform::DeviceContextPool::Instance().Get(place_)->Wait();
  dry_run_profiler_.Pause();
...
@@ -59,8 +59,7 @@ class InterpreterCore {
   void RunInstruction(const Instruction& instr_node);
 
-  void ExecuteInstructionList(const std::vector<Instruction>& vec_instr,
-                              bool is_dry_run = false);
+  void ExecuteInstructionList(const std::vector<Instruction>& vec_instr);
 
   void DryRunPrepare(const std::vector<framework::Tensor>& feed_tensors);
@@ -70,7 +69,7 @@ class InterpreterCore {
   void RunInstructionAsync(size_t instr_id,
                            AtomicVectorSizeT* working_dependecy_count,
                            AtomicVectorSizeT* working_var_ref,
-                           std::atomic<size_t>* op_run_number, bool is_dry_run);
+                           std::atomic<size_t>* op_run_number);
   void AddFetch(const std::vector<std::string>& fetch_names);
   void BuildSkipShareLoDInfo();
...
@@ -64,10 +64,8 @@ static std::pair<size_t, size_t> GetTensorMemorySize(
 }
 
 struct CostInfo {
   double total_time{0.};          // ms
-  size_t host_memory_bytes{0};    // bytes
-  size_t device_memory_bytes{0};  // bytes
-  size_t device_total_memory_bytes{0};  // total allocated memory size
+  size_t device_memory_bytes{0};  // total allocated memory size
 };
 
 class InterpreterProfiler {
@@ -82,30 +80,14 @@ class InterpreterProfiler {
   void Reset() {
     timer_.Reset();
     cost_info_.total_time = 0.;
-    cost_info_.host_memory_bytes = 0;
     cost_info_.device_memory_bytes = 0;
-    cost_info_.device_total_memory_bytes = 0;
-  }
-
-  void ParseMemoryInfo(const std::vector<Variable*>& vars) {
-    timer_.Start();
-    auto memory_info = GetTensorMemorySize(vars);
-    VLOG(3) << "host memory size: " << memory_info.first;
-    cost_info_.host_memory_bytes =
-        std::max(cost_info_.host_memory_bytes, memory_info.first);
-
-    VLOG(3) << "device memory size: " << memory_info.second;
-    cost_info_.device_memory_bytes =
-        std::max(cost_info_.device_memory_bytes, memory_info.second);
-    timer_.Pause();
-    cost_info_.total_time -= timer_.ElapsedMS();
   }
 
   void TotalCUDAAllocatedMemorySize(const platform::Place& place) {
     if (platform::is_gpu_place(place)) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       auto cuda_place = BOOST_GET_CONST(platform::CUDAPlace, place);
-      cost_info_.device_total_memory_bytes =
+      cost_info_.device_memory_bytes =
           platform::RecordedCudaMallocSize(cuda_place.device);
 #endif
     }
...
@@ -1979,12 +1979,8 @@ All parameter, weight, gradient are variables in Paddle.
   py::class_<framework::CostInfo>(m, "CostInfo")
       .def(py::init<>())
       .def("total_time", [](CostInfo &self) { return self.total_time; })
-      .def("host_memory_bytes",
-           [](CostInfo &self) { return self.host_memory_bytes; })
       .def("device_memory_bytes",
-           [](CostInfo &self) { return self.device_memory_bytes; })
-      .def("device_total_memory_bytes",
-           [](CostInfo &self) { return self.device_total_memory_bytes; });
+           [](CostInfo &self) { return self.device_memory_bytes; });
 
   py::class_<framework::StandaloneExecutor>(m, "StandaloneExecutor")
       .def(py::init<const platform::Place &, const ProgramDesc &,
...
@@ -79,18 +79,11 @@ class LinearTestCase(unittest.TestCase):
         IS_WINDOWS = sys.platform.startswith('win')
         if core.is_compiled_with_cuda():
-            # input `a` is on CPU, 16 bytes
-            self.assertEqual(cost_info.host_memory_bytes(), 16)
             # # w,bias,b, out, memory block is at least 256 bytes on Linux
             gt = 16 * 4 if IS_WINDOWS else 256 * 4
             self.assertGreater(cost_info.device_memory_bytes(), gt)
-            self.assertGreaterEqual(cost_info.device_total_memory_bytes(),
-                                    cost_info.device_memory_bytes())
         else:
-            # x(16 bytes), w(16 bytes), bias(8 bytes), b(16 bytes), out(16 bytes)
-            self.assertGreaterEqual(cost_info.host_memory_bytes(), 72)
             self.assertEqual(cost_info.device_memory_bytes(), 0)
-            self.assertGreaterEqual(cost_info.device_total_memory_bytes(), 0)
 
 
 def build_program():
...