From b65adf7f65915da7c299bfa23b5a7f0d758c1e42 Mon Sep 17 00:00:00 2001
From: Qiao Longfei
Date: Tue, 26 Mar 2019 16:49:52 +0800
Subject: [PATCH] add communicator_send_wait_times

---
 .../operators/distributed/communicator.cc | 33 ++++++++++++++-----
 .../operators/distributed/communicator.h  |  2 +-
 python/paddle/fluid/__init__.py           |  1 +
 3 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc
index eba18c67771..9b14d7f0d8c 100644
--- a/paddle/fluid/operators/distributed/communicator.cc
+++ b/paddle/fluid/operators/distributed/communicator.cc
@@ -32,6 +32,9 @@ DEFINE_int32(communicator_send_queue_size, 20,
 DEFINE_int32(communicator_max_send_grad_num_before_recv, 20,
              "max grad num to send before recv parameters");
 DEFINE_int32(communicator_thread_pool_size, 5, "thread num to do send or recv");
+DEFINE_int32(communicator_send_wait_times, 5,
+             "times that send thread will wait if merge num does not reach "
+             "max_merge_var_num");
 DEFINE_int32(communicator_max_merge_var_num, 20,
              "max var num to merge and send");
 DEFINE_bool(communicator_fake_rpc, false,
@@ -101,20 +104,32 @@ void Communicator::SendThread() {
           VLOG(3) << var_name << " merge and send";
           std::vector<std::shared_ptr<Variable>> vars;
           size_t merged_var_num = 0;
-          while (var_queue->Size() > 0 &&
-                 merged_var_num < FLAGS_communicator_max_merge_var_num) {
-            vars.push_back(var_queue->Pop());
-            // only count the send number of the first var
-            if (var_name == send_varname_to_queue_.begin()->first) {
-              grad_num_.fetch_add(1, std::memory_order_relaxed);
+          size_t wait_times = 0;
+          while (merged_var_num < FLAGS_communicator_max_merge_var_num) {
+            if (var_queue->Size() == 0) {
+              VLOG(3) << "wait_times -> " << wait_times;
+              if (wait_times >= FLAGS_communicator_send_wait_times) {
+                break;
+              }
+              std::this_thread::sleep_for(std::chrono::milliseconds(10));
+              wait_times++;
+              continue;
+            } else {
+              wait_times = 0;
+
+              vars.push_back(var_queue->Pop());
+              // only count the send number of the first var
+              if (var_name == send_varname_to_queue_.begin()->first) {
+                grad_num_.fetch_add(1, std::memory_order_relaxed);
+              }
+              merged_var_num++;
             }
-            merged_var_num++;
           }
           auto before_merge = GetCurrentUS();
           MergeVars(var_name, vars, send_scope_.get());
           auto after_merge = GetCurrentUS();
-          VLOG(3) << "merge " << var_name << " use time "
-                  << after_merge - before_merge;
+          VLOG(3) << "merge " << merged_var_num << " " << var_name
+                  << " use time " << after_merge - before_merge;
           auto send_functor = distributed::ParameterSend<float>();
           auto &ctx = send_varname_to_ctx_.at(var_name);
           if (!FLAGS_communicator_fake_rpc) {
diff --git a/paddle/fluid/operators/distributed/communicator.h b/paddle/fluid/operators/distributed/communicator.h
index 859c0a7f51c..dab449383b4 100644
--- a/paddle/fluid/operators/distributed/communicator.h
+++ b/paddle/fluid/operators/distributed/communicator.h
@@ -107,7 +107,7 @@ inline void MergeVars(const std::string& var_name,
   auto* out_var = scope->Var(var_name);
   if (var0->IsType<framework::LoDTensor>()) {
     auto dims = var0->Get<framework::LoDTensor>().dims();
-    VLOG(3) << "merge " << var_name << " LoDTensor " << dims;
+    VLOG(3) << "merge " << var_name << " LoDTensor dims " << dims;
     // init output tensor
     auto* out_t = out_var->GetMutable<framework::LoDTensor>();
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 97ac7fd97b3..6c7f338dabf 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -159,6 +159,7 @@ def __bootstrap__():
         read_env_flags.append('communicator_thread_pool_size')
         read_env_flags.append('communicator_max_merge_var_num')
         read_env_flags.append('communicator_fake_rpc')
+        read_env_flags.append('communicator_send_wait_times')
     if core.is_compiled_with_brpc():
         read_env_flags.append('max_body_size')  #set brpc max body size
--
GitLab
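
Usage note: because the patch registers 'communicator_send_wait_times' in
read_env_flags, the flag should be settable through the FLAGS_<name>
environment-variable convention that __bootstrap__() applies to the other
communicator flags. A minimal sketch under that assumption; the values below
are illustrative, not tuned recommendations:

    # Sketch: set communicator flags via environment variables before
    # importing paddle.fluid, which consumes FLAGS_* at import time in
    # __bootstrap__(). The specific numbers are illustrative assumptions.
    import os

    # let the send thread retry up to 10 times when its queue runs dry
    os.environ['FLAGS_communicator_send_wait_times'] = '10'
    # merge at most 20 gradients into one send
    os.environ['FLAGS_communicator_max_merge_var_num'] = '20'

    import paddle.fluid as fluid  # flags are read here

Each empty-queue round in SendThread() sleeps 10 ms, so a wait limit of 10
bounds the extra time spent waiting for more gradients at roughly 100 ms
before the thread merges and sends whatever it has already collected.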