add communicator_send_wait_times

b65adf7f · Qiao Longfei · 63acbe7a · b65adf7f · b65adf7f · b65adf7f
3 changed file
--- a/paddle/fluid/operators/distributed/communicator.cc
+++ b/paddle/fluid/operators/distributed/communicator.cc
@@ -32,6 +32,9 @@ DEFINE_int32(communicator_send_queue_size, 20,
 DEFINE_int32(communicator_max_send_grad_num_before_recv, 20,
             "max grad num to send before recv parameters");
 DEFINE_int32(communicator_thread_pool_size, 5, "thread num to do send or recv");
+DEFINE_int32(communicator_send_wait_times, 5,
+             "times that send thread will wait if merge num does not reach "
+             "max_merge_var_num");
 DEFINE_int32(communicator_max_merge_var_num, 20,
             "max var num to merge and send");
 DEFINE_bool(communicator_fake_rpc, false,
@@ -101,8 +104,19 @@ void Communicator::SendThread() {
          VLOG(3) << var_name << " merge and send";
          std::vector<std::shared_ptr<Variable>> vars;
          size_t merged_var_num = 0;
-          while (var_queue->Size() > 0 &&
+          size_t wait_times = 0;
-                 merged_var_num < FLAGS_communicator_max_merge_var_num) {
+          while (merged_var_num < FLAGS_communicator_max_merge_var_num) {
+            if (var_queue->Size() == 0) {
+              VLOG(3) << "wait_times -> " << wait_times;
+              if (wait_times >= FLAGS_communicator_send_wait_times) {
+                break;
+              }
+              std::this_thread::sleep_for(std::chrono::milliseconds(10));
+              wait_times++;
+              continue;
+            } else {
+              wait_times = 0;
              vars.push_back(var_queue->Pop());
              // only count the send number of the first var
              if (var_name == send_varname_to_queue_.begin()->first) {
@@ -110,11 +124,12 @@ void Communicator::SendThread() {
              }
              merged_var_num++;
            }
+          }
          auto before_merge = GetCurrentUS();
          MergeVars(var_name, vars, send_scope_.get());
          auto after_merge = GetCurrentUS();
-          VLOG(3) << "merge " << var_name << " use time "
+          VLOG(3) << "merge " << merged_var_num << " " << var_name
-                  << after_merge - before_merge;
+                  << " use time " << after_merge - before_merge;
          auto send_functor = distributed::ParameterSend<float>();
          auto &ctx = send_varname_to_ctx_.at(var_name);
          if (!FLAGS_communicator_fake_rpc) {

--- a/paddle/fluid/operators/distributed/communicator.h
+++ b/paddle/fluid/operators/distributed/communicator.h
@@ -107,7 +107,7 @@ inline void MergeVars(const std::string& var_name,
  auto* out_var = scope->Var(var_name);
  if (var0->IsType<framework::LoDTensor>()) {
    auto dims = var0->Get<framework::LoDTensor>().dims();
-    VLOG(3) << "merge " << var_name << " LoDTensor " << dims;
+    VLOG(3) << "merge " << var_name << " LoDTensor dims " << dims;
    // init output tensor
    auto* out_t = out_var->GetMutable<framework::LoDTensor>();

--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -159,6 +159,7 @@ def __bootstrap__():
        read_env_flags.append('communicator_thread_pool_size')
        read_env_flags.append('communicator_max_merge_var_num')
        read_env_flags.append('communicator_fake_rpc')
+        read_env_flags.append('communicator_send_wait_times')
        if core.is_compiled_with_brpc():
            read_env_flags.append('max_body_size')
            #set brpc max body size