From b65adf7f65915da7c299bfa23b5a7f0d758c1e42 Mon Sep 17 00:00:00 2001
From: Qiao Longfei
Date: Tue, 26 Mar 2019 16:49:52 +0800
Subject: [PATCH] add communicator_send_wait_times

---
 .../operators/distributed/communicator.cc | 33 ++++++++++++++-----
 .../operators/distributed/communicator.h  |  2 +-
 python/paddle/fluid/__init__.py           |  1 +
 3 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc
index eba18c67771..9b14d7f0d8c 100644
--- a/paddle/fluid/operators/distributed/communicator.cc
+++ b/paddle/fluid/operators/distributed/communicator.cc
@@ -32,6 +32,9 @@ DEFINE_int32(communicator_send_queue_size, 20,
 DEFINE_int32(communicator_max_send_grad_num_before_recv, 20,
              "max grad num to send before recv parameters");
 DEFINE_int32(communicator_thread_pool_size, 5, "thread num to do send or recv");
+DEFINE_int32(communicator_send_wait_times, 5,
+             "times that send thread will wait if merge num does not reach "
+             "max_merge_var_num");
 DEFINE_int32(communicator_max_merge_var_num, 20,
              "max var num to merge and send");
 DEFINE_bool(communicator_fake_rpc, false,
@@ -101,20 +104,32 @@ void Communicator::SendThread() {
           VLOG(3) << var_name << " merge and send";
           std::vector<std::shared_ptr<Variable>> vars;
           size_t merged_var_num = 0;
-          while (var_queue->Size() > 0 &&
-                 merged_var_num < FLAGS_communicator_max_merge_var_num) {
-            vars.push_back(var_queue->Pop());
-            // only count the send number of the first var
-            if (var_name == send_varname_to_queue_.begin()->first) {
-              grad_num_.fetch_add(1, std::memory_order_relaxed);
+          size_t wait_times = 0;
+          while (merged_var_num < FLAGS_communicator_max_merge_var_num) {
+            if (var_queue->Size() == 0) {
+              VLOG(3) << "wait_times -> " << wait_times;
+              if (wait_times >= FLAGS_communicator_send_wait_times) {
+                break;
+              }
+              std::this_thread::sleep_for(std::chrono::milliseconds(10));
+              wait_times++;
+              continue;
+            } else {
+              wait_times = 0;
+
+              vars.push_back(var_queue->Pop());
+              // only count the send number of the first var
+              if (var_name == send_varname_to_queue_.begin()->first) {
+                grad_num_.fetch_add(1, std::memory_order_relaxed);
+              }
+              merged_var_num++;
             }
-            merged_var_num++;
           }
           auto before_merge = GetCurrentUS();
           MergeVars(var_name, vars, send_scope_.get());
           auto after_merge = GetCurrentUS();
-          VLOG(3) << "merge " << var_name << " use time "
-                  << after_merge - before_merge;
+          VLOG(3) << "merge " << merged_var_num << " " << var_name
+                  << " use time " << after_merge - before_merge;
           auto send_functor = distributed::ParameterSend<float>();
           auto &ctx = send_varname_to_ctx_.at(var_name);
           if (!FLAGS_communicator_fake_rpc) {
diff --git a/paddle/fluid/operators/distributed/communicator.h b/paddle/fluid/operators/distributed/communicator.h
index 859c0a7f51c..dab449383b4 100644
--- a/paddle/fluid/operators/distributed/communicator.h
+++ b/paddle/fluid/operators/distributed/communicator.h
@@ -107,7 +107,7 @@ inline void MergeVars(const std::string& var_name,
   auto* out_var = scope->Var(var_name);
   if (var0->IsType<framework::LoDTensor>()) {
     auto dims = var0->Get<framework::LoDTensor>().dims();
-    VLOG(3) << "merge " << var_name << " LoDTensor " << dims;
+    VLOG(3) << "merge " << var_name << " LoDTensor dims " << dims;
     // init output tensor
     auto* out_t = out_var->GetMutable<framework::LoDTensor>();
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 97ac7fd97b3..6c7f338dabf 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -159,6 +159,7 @@ def __bootstrap__():
         read_env_flags.append('communicator_thread_pool_size')
         read_env_flags.append('communicator_max_merge_var_num')
         read_env_flags.append('communicator_fake_rpc')
+        read_env_flags.append('communicator_send_wait_times')
     if core.is_compiled_with_brpc():
         read_env_flags.append('max_body_size')  #set brpc max body size
--
GitLab
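
Usage note: because the patch registers 'communicator_send_wait_times' in
read_env_flags, the flag should be settable through the FLAGS_<name>
environment-variable convention that __bootstrap__() applies to the other
communicator flags. A minimal sketch under that assumption; the values below
are illustrative, not tuned recommendations:

    # Sketch: set communicator flags via environment variables before
    # importing paddle.fluid, which consumes FLAGS_* at import time in
    # __bootstrap__(). The specific numbers are illustrative assumptions.
    import os

    # let the send thread retry up to 10 times when its queue runs dry
    os.environ['FLAGS_communicator_send_wait_times'] = '10'
    # merge at most 20 gradients into one send
    os.environ['FLAGS_communicator_max_merge_var_num'] = '20'

    import paddle.fluid as fluid  # flags are read here

Each empty-queue round in SendThread() sleeps 10 ms, so a wait limit of 10
bounds the extra time spent waiting for more gradients at roughly 100 ms
before the thread merges and sends whatever it has already collected.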