add retry function to try to solve grpc error code 14 (#19661)

* rpc retry for async send/get/prefetch * test=develop, change retry vlog level to 3 * test=develop, set default rpc_retry_times to 3

add retry function to try to solve grpc error code 14 (#19661)
* rpc retry for async send/get/prefetch * test=develop, change retry vlog level to 3 * test=develop, set default rpc_retry_times to 3
1bc285a5 · 123malin · GitHub · 5eb381a3 · 1bc285a5 · 1bc285a5
5 changed files
--- a/paddle/fluid/operators/distributed/grpc/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc/grpc_client.cc
@@ -73,8 +73,12 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep,
  const std::string var_name_val = var_name;
  const framework::Scope* p_scope = &scope;
  const auto ch = GetChannel(ep_val);
-  SendProcessor* s = new SendProcessor(ch);
  const std::string method = kSendRPC;
+
+  int retry_times_ = 0;
+
+  while (true) {
+    SendProcessor* s = new SendProcessor(ch);
    VarHandlePtr h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope));
    s->Prepare(h, time_out);

@@ -92,7 +96,8 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep,
      platform::RecordRPCEvent record_event(method);

      auto call = s->stub_g_.PrepareUnaryCall(
-        s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, &cq_);
+          s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req,
+          &cq_);
      call->StartCall();
      call->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));

@@ -102,7 +107,19 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep,
    });
    req_count_++;

+    if (FLAGS_rpc_retry_times > 0 && retry_times_ < FLAGS_rpc_retry_times) {
+      h->Wait();
+      if (h->should_retry) {
+        VLOG(3) << "rpc call failed, retry times " << retry_times_;
+        retry_times_++;
+        std::random_device rd;
+        std::this_thread::sleep_for(std::chrono::milliseconds(rd() % 5));
+        continue;
+      }
+    }
+
    return h;
+  }
 }

 void ProcGetResponse(const VarHandle& var_h,
@@ -169,13 +186,17 @@ VarHandlePtr GRPCClient::_AsyncGetVar(
  const std::string table_name_val = table_name;
  const framework::Scope* p_scope = &scope;
  const auto ch = GetChannel(ep_val);
+
+  int retry_times_ = 0;
+
+  while (true) {
    GetProcessor* s = new GetProcessor(ch);

    VarHandlePtr h(new VarHandle(ep, method, out_varname_val, p_ctx, p_scope));
    s->Prepare(h, time_out);

-  framework::AsyncIO([var_name_val, out_varname_val, table_name_val, s, method,
-                      p_ctx, h, rpc_path, this] {
+    framework::AsyncIO([var_name_val, out_varname_val, table_name_val, s,
+                        method, p_ctx, h, rpc_path, this] {
      // prepare input
      sendrecv::VariableMessage req;
      req.set_varname(var_name_val);
@@ -201,10 +222,21 @@ VarHandlePtr GRPCClient::_AsyncGetVar(
        h->Wait();
      }
    });
-
    req_count_++;

+    if (FLAGS_rpc_retry_times > 0 && retry_times_ < FLAGS_rpc_retry_times) {
+      h->Wait();
+      if (h->should_retry) {
+        VLOG(3) << "rpc call failed, retry times " << retry_times_;
+        retry_times_++;
+        std::random_device rd;
+        std::this_thread::sleep_for(std::chrono::milliseconds(rd() % 5));
+        continue;
+      }
+    }
+
    return h;
+  }
 }

 VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep,
@@ -221,20 +253,22 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep,
  const std::string table_name_val = table_name;
  const framework::Scope* p_scope = &scope;
  const auto ch = GetChannel(ep_val);
-  GetProcessor* s = new GetProcessor(ch);

  const std::string method = kPrefetchRPC;
+  int retry_times_ = 0;

+  while (true) {
+    GetProcessor* s = new GetProcessor(ch);
    VarHandlePtr h(new VarHandle(ep, method, out_var_name_val, p_ctx, p_scope));
    s->Prepare(h, time_out);

-  framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx,
-                      s, method, h, table_name_val, this] {
+    framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope,
+                        p_ctx, s, method, h, table_name_val, this] {
      auto* var = p_scope->FindVar(in_var_name_val);

      ::grpc::ByteBuffer req;
-    SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req, out_var_name_val,
-                          0, table_name_val);
+      SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req,
+                            out_var_name_val, 0, table_name_val);

      VLOG(3) << s->GetVarHandlePtr()->String() << " begin";

@@ -253,9 +287,21 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep,
        h->Wait();
      }
    });
-
    req_count_++;
+
+    if (FLAGS_rpc_retry_times > 0 && retry_times_ < FLAGS_rpc_retry_times) {
+      h->Wait();
+      if (h->should_retry) {
+        VLOG(3) << "rpc call failed, retry times " << retry_times_;
+        retry_times_++;
+        std::random_device rd;
+        std::this_thread::sleep_for(std::chrono::milliseconds(rd() % 5));
+        continue;
+      }
+    }
+
    return h;
+  }
 }

 VarHandlePtr GRPCClient::AsyncSendBatchBarrier(const std::string& ep,
@@ -420,6 +466,14 @@ void GRPCClient::Proceed() {
        ok_ = false;
      }
      c->Finish(false);
+    } else if (c->status_.error_code() == grpc::StatusCode::UNAVAILABLE) {
+      VLOG(3) << c->GetVarHandlePtr()->String()
+              << " meets grpc error, error_code:" << c->status_.error_code()
+              << " error_message:" << c->status_.error_message()
+              << " error_details:" << c->status_.error_details()
+              << " should retry!";
+      c->GetVarHandlePtr()->should_retry = true;
+      c->Finish(false);
    } else {
      LOG(FATAL) << c->GetVarHandlePtr()->String()
                 << " meets grpc error, error_code:" << c->status_.error_code()

--- a/paddle/fluid/operators/distributed/request_handler.h
+++ b/paddle/fluid/operators/distributed/request_handler.h
@@ -85,6 +85,8 @@ class VarHandle {
  virtual ~VarHandle() {}

 public:
+  bool should_retry = false;
+
  bool Wait() {
    int ret = kDefaultState;
    {

--- a/paddle/fluid/operators/distributed/rpc_client.cc
+++ b/paddle/fluid/operators/distributed/rpc_client.cc
@@ -17,6 +17,7 @@

 // default to 3min to avoid temprary network failures.
 DEFINE_int32(rpc_deadline, 180000, "deadline timeouts for rpc");
+DEFINE_int32(rpc_retry_times, 3, "retry times for rpc");

 namespace paddle {
 namespace operators {

--- a/paddle/fluid/operators/distributed/rpc_client.h
+++ b/paddle/fluid/operators/distributed/rpc_client.h
@@ -25,6 +25,7 @@
 #include "paddle/fluid/operators/distributed/request_handler.h"

 DECLARE_int32(rpc_deadline);
+DECLARE_int32(rpc_retry_times);

 namespace paddle {
 namespace operators {

--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -177,6 +177,7 @@ def __bootstrap__():
    if core.is_compiled_with_dist():
        #env for rpc
        read_env_flags.append('rpc_deadline')
+        read_env_flags.append('rpc_retry_times')
        read_env_flags.append('rpc_server_profile_path')
        read_env_flags.append('enable_rpc_profiler')
        read_env_flags.append('rpc_send_thread_num')