Unverified commit 2fff5a58, authored by Aurelius84, committed by GitHub

Clean ParseMemInfo and fix unittest failure under multi-threading (#35840)

* Clean ParseMemInfo and fix the unittest under multi-threading

* Fix declaration
Parent e4c2a854
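Background for the diff below: the removed `ParseMemoryInfo` was invoked from inside `RunInstructionAsync`, i.e. from tasks running on `async_work_queue_`, so several worker threads could update the shared profiler state (`cost_info_`, `timer_`) concurrently. The stand-alone sketch below is only a hypothetical illustration of that kind of unsynchronized read-modify-write; `UnsafeProfiler` and its members are invented names, not Paddle APIs.

```cpp
// Minimal sketch (not Paddle code): several workers race on a shared
// "max bytes" counter the way concurrent ParseMemoryInfo calls raced on
// the profiler's CostInfo.
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <thread>
#include <vector>

struct UnsafeProfiler {          // hypothetical stand-in for the old profiler
  size_t host_memory_bytes{0};   // shared, unsynchronized state
  void ParseMemoryInfo(size_t measured) {
    // Read-modify-write without a lock or atomic: updates can be lost and
    // ThreadSanitizer reports a data race.
    host_memory_bytes = std::max(host_memory_bytes, measured);
  }
};

int main() {
  UnsafeProfiler profiler;
  std::vector<std::thread> workers;
  for (int t = 0; t < 4; ++t) {
    workers.emplace_back([&profiler, t] {
      for (size_t i = 0; i < 100000; ++i) {
        profiler.ParseMemoryInfo(i + t);  // concurrent unsynchronized updates
      }
    });
  }
  for (auto& w : workers) w.join();
  std::cout << profiler.host_memory_bytes << std::endl;
  return 0;
}
```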
@@ -324,16 +324,15 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
 }
 void InterpreterCore::ExecuteInstructionList(
-    const std::vector<Instruction>& vec_instr, bool is_dry_run) {
+    const std::vector<Instruction>& vec_instr) {
   auto atomic_deps = async_work_queue_.PrepareAtomicDeps(dependecy_count_);
   auto atomic_var_ref = async_work_queue_.PrepareAtomicVarRef(vec_meta_info_);
   std::atomic<size_t> op_run_number{0};
   for (size_t i = 0; i < dependecy_count_.size(); ++i) {
     if (dependecy_count_[i] == 0) {
-      async_work_queue_.AddTask(vec_instr[i].type_, [&, i, is_dry_run]() {
-        RunInstructionAsync(i, &atomic_deps, &atomic_var_ref, &op_run_number,
-                            is_dry_run);
+      async_work_queue_.AddTask(vec_instr[i].type_, [&, i]() {
+        RunInstructionAsync(i, &atomic_deps, &atomic_var_ref, &op_run_number);
       });
     }
   }
@@ -350,8 +349,7 @@ void InterpreterCore::ExecuteInstructionList(
 void InterpreterCore::RunInstructionAsync(size_t instr_id,
                                           AtomicVectorSizeT* atomic_deps,
                                           AtomicVectorSizeT* atomic_var_ref,
-                                          std::atomic<size_t>* op_run_number,
-                                          bool is_dry_run) {
+                                          std::atomic<size_t>* op_run_number) {
   auto& instr_node = vec_instruction_[instr_id];
   event_manager_.WaitEvent(instr_node, place_);
@@ -360,10 +358,6 @@ void InterpreterCore::RunInstructionAsync(size_t instr_id,
   event_manager_.RecordEvent(instr_node, place_);
   op_run_number->fetch_add(1, std::memory_order_relaxed);
-  if (is_dry_run) {
-    dry_run_profiler_.ParseMemoryInfo(global_scope_->var_list);
-  }
   auto& next_instr = instr_node.next_instruction_.all_next_ops_;
   for (auto next_i : next_instr) {
@@ -372,8 +366,7 @@ void InterpreterCore::RunInstructionAsync(size_t instr_id,
         atomic_deps->at(next_i)->fetch_sub(1, std::memory_order_relaxed) == 1;
     if (is_ready) {
       async_work_queue_.AddTask(vec_instruction_[next_i].type_, [=]() {
-        RunInstructionAsync(next_i, atomic_deps, atomic_var_ref, op_run_number,
-                            is_dry_run);
+        RunInstructionAsync(next_i, atomic_deps, atomic_var_ref, op_run_number);
       });
     }
   }
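For readers unfamiliar with the dependency-driven scheduling that the hunks above modify: each op keeps an atomic count of unfinished producers, a finished op decrements the counters of its consumers, and a consumer whose counter reaches zero is enqueued. The sketch below reproduces only that counting pattern with invented names (`run_async`, `next_ops`) and runs everything on one thread instead of Paddle's `async_work_queue_`; it is not the Paddle implementation.

```cpp
// Minimal sketch (assumed names, not Paddle's API): dependency-count driven
// scheduling. An op becomes ready when its atomic counter drops to zero.
#include <atomic>
#include <cstddef>
#include <functional>
#include <iostream>
#include <vector>

int main() {
  // Op 0 and op 1 have no dependencies; op 2 depends on both of them.
  std::vector<std::vector<size_t>> next_ops = {{2}, {2}, {}};
  std::vector<std::atomic<size_t>> deps(3);
  deps[0] = 0;
  deps[1] = 0;
  deps[2] = 2;

  std::function<void(size_t)> run_async = [&](size_t id) {
    std::cout << "run op " << id << std::endl;  // stand-in for RunInstruction
    for (size_t next : next_ops[id]) {
      // fetch_sub returns the previous value, so "== 1" means this op released
      // the last outstanding dependency and may schedule the next op.
      if (deps[next].fetch_sub(1, std::memory_order_relaxed) == 1) {
        run_async(next);  // a real executor would AddTask to a work queue
      }
    }
  };

  for (size_t i = 0; i < deps.size(); ++i) {
    if (deps[i] == 0) run_async(i);  // kick off the source ops
  }
  return 0;
}
```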
@@ -433,7 +426,7 @@ const CostInfo& InterpreterCore::DryRun(
   // DryRun may be called many times.
   dry_run_profiler_.Reset();
   dry_run_profiler_.Start();
-  ExecuteInstructionList(vec_instruction_, /*is_dry_run=*/true);
+  ExecuteInstructionList(vec_instruction_);
   platform::DeviceContextPool::Instance().Get(place_)->Wait();
   dry_run_profiler_.Pause();
@@ -59,8 +59,7 @@ class InterpreterCore {
   void RunInstruction(const Instruction& instr_node);
-  void ExecuteInstructionList(const std::vector<Instruction>& vec_instr,
-                              bool is_dry_run = false);
+  void ExecuteInstructionList(const std::vector<Instruction>& vec_instr);
   void DryRunPrepare(const std::vector<framework::Tensor>& feed_tensors);
@@ -70,7 +69,7 @@ class InterpreterCore {
   void RunInstructionAsync(size_t instr_id,
                            AtomicVectorSizeT* working_dependecy_count,
                            AtomicVectorSizeT* working_var_ref,
-                           std::atomic<size_t>* op_run_number, bool is_dry_run);
+                           std::atomic<size_t>* op_run_number);
   void AddFetch(const std::vector<std::string>& fetch_names);
   void BuildSkipShareLoDInfo();
@@ -64,10 +64,8 @@ static std::pair<size_t, size_t> GetTensorMemorySize(
 }
 struct CostInfo {
-  double total_time{0.};                // ms
-  size_t host_memory_bytes{0};          // bytes
-  size_t device_memory_bytes{0};        // bytes
-  size_t device_total_memory_bytes{0};  // total allocated memory size
+  double total_time{0.};          // ms
+  size_t device_memory_bytes{0};  // total allocated memory size
 };
 class InterpreterProfiler {
@@ -82,30 +80,14 @@ class InterpreterProfiler {
   void Reset() {
     timer_.Reset();
     cost_info_.total_time = 0.;
-    cost_info_.host_memory_bytes = 0;
     cost_info_.device_memory_bytes = 0;
-    cost_info_.device_total_memory_bytes = 0;
   }
-  void ParseMemoryInfo(const std::vector<Variable*>& vars) {
-    timer_.Start();
-    auto memory_info = GetTensorMemorySize(vars);
-    VLOG(3) << "host memory size: " << memory_info.first;
-    cost_info_.host_memory_bytes =
-        std::max(cost_info_.host_memory_bytes, memory_info.first);
-    VLOG(3) << "device memory size: " << memory_info.second;
-    cost_info_.device_memory_bytes =
-        std::max(cost_info_.device_memory_bytes, memory_info.second);
-    timer_.Pause();
-    cost_info_.total_time -= timer_.ElapsedMS();
-  }
   void TotalCUDAAllocatedMemorySize(const platform::Place& place) {
     if (platform::is_gpu_place(place)) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       auto cuda_place = BOOST_GET_CONST(platform::CUDAPlace, place);
-      cost_info_.device_total_memory_bytes =
+      cost_info_.device_memory_bytes =
           platform::RecordedCudaMallocSize(cuda_place.device);
 #endif
     }
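With `ParseMemoryInfo` gone, the profiler's remaining job is timing on the calling thread plus a single read of the allocator's recorded CUDA allocation size after the run (`TotalCUDAAllocatedMemorySize`). The sketch below shows that general shape with invented names (`RecordedAllocBytes`, `Profile`); it is a simplified stand-in under those assumptions, not the Paddle code.

```cpp
// Minimal sketch (assumed names): profile by timing on the caller thread and
// reading a single allocator-side counter after the run, so worker threads
// never touch the profiler's state.
#include <atomic>
#include <chrono>
#include <cstddef>
#include <iostream>

// Hypothetical allocator-side counter, analogous to RecordedCudaMallocSize.
std::atomic<size_t> g_recorded_alloc_bytes{0};
size_t RecordedAllocBytes() { return g_recorded_alloc_bytes.load(); }

struct CostInfo {
  double total_time{0.};          // ms
  size_t device_memory_bytes{0};  // total allocated memory size
};

template <typename RunFn>
CostInfo Profile(RunFn&& run) {
  CostInfo info;
  auto start = std::chrono::steady_clock::now();
  run();  // workers may allocate concurrently; they never write into `info`
  auto end = std::chrono::steady_clock::now();
  info.total_time =
      std::chrono::duration<double, std::milli>(end - start).count();
  info.device_memory_bytes = RecordedAllocBytes();  // queried once, at the end
  return info;
}

int main() {
  CostInfo info = Profile([] { g_recorded_alloc_bytes += 1024; });
  std::cout << info.total_time << " ms, " << info.device_memory_bytes
            << " bytes" << std::endl;
  return 0;
}
```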
@@ -1979,12 +1979,8 @@ All parameter, weight, gradient are variables in Paddle.
   py::class_<framework::CostInfo>(m, "CostInfo")
       .def(py::init<>())
       .def("total_time", [](CostInfo &self) { return self.total_time; })
-      .def("host_memory_bytes",
-           [](CostInfo &self) { return self.host_memory_bytes; })
       .def("device_memory_bytes",
-           [](CostInfo &self) { return self.device_memory_bytes; })
-      .def("device_total_memory_bytes",
-           [](CostInfo &self) { return self.device_total_memory_bytes; });
+           [](CostInfo &self) { return self.device_memory_bytes; });
   py::class_<framework::StandaloneExecutor>(m, "StandaloneExecutor")
       .def(py::init<const platform::Place &, const ProgramDesc &,
@@ -79,18 +79,11 @@ class LinearTestCase(unittest.TestCase):
         IS_WINDOWS = sys.platform.startswith('win')
         if core.is_compiled_with_cuda():
-            # input `a` is on CPU, 16 bytes
-            self.assertEqual(cost_info.host_memory_bytes(), 16)
             # # w,bias,b, out, memory block is at least 256 bytes on Linux
             gt = 16 * 4 if IS_WINDOWS else 256 * 4
             self.assertGreater(cost_info.device_memory_bytes(), gt)
-            self.assertGreaterEqual(cost_info.device_total_memory_bytes(),
-                                    cost_info.device_memory_bytes())
         else:
-            # x(16 bytes), w(16 bytes), bias(8 bytes), b(16 bytes), out(16 bytes)
-            self.assertGreaterEqual(cost_info.host_memory_bytes(), 72)
             self.assertEqual(cost_info.device_memory_bytes(), 0)
-            self.assertGreaterEqual(cost_info.device_total_memory_bytes(), 0)
 def build_program():