From b7b7011def413b2be5feb38f86219c6a14232273 Mon Sep 17 00:00:00 2001
From: wangyanfei01
Date: Tue, 6 Dec 2016 19:18:36 +0800
Subject: [PATCH] try to connect again if refused error found

---
Review notes (ignored by git-am):
 * connect() returns 0 or -1 and reports the failure reason through errno.
   The retry loop therefore has to test errno for ECONNREFUSED; comparing
   the return value with ECONNREFUSED never matches, so a refused
   connection would abort in PCHECK on the first attempt.
 * NOTE(review): the header names on the #include lines of the first hunk
   are missing from this copy of the patch; restore them before applying.
 * NOTE(review): POSIX leaves the socket state unspecified after a failed
   connect(); confirm that retrying on the same fd is acceptable on the
   target platforms.

 paddle/pserver/LightNetwork.cpp | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/paddle/pserver/LightNetwork.cpp b/paddle/pserver/LightNetwork.cpp
index 1830170a16..9b288604aa 100644
--- a/paddle/pserver/LightNetwork.cpp
+++ b/paddle/pserver/LightNetwork.cpp
@@ -18,6 +18,7 @@ limitations under the License. */
 #include
 #include
 #include
+#include
 #include
 #include
 
@@ -49,6 +50,10 @@ P_DEFINE_int32(sock_recv_buf_size,
                1024 * 1024 * 40,
                "restrict sock recv buff size");
 
+P_DEFINE_int32(connrefused_retries_second,
+               10,
+               "retry connrefused_retries_second if ECONNREFUSED occurs");
+
 namespace paddle {
 
 /**
@@ -382,8 +387,22 @@ void SocketClient::TcpClient(const std::string &serverAddr, int serverPort) {
   setOption(sockfd);
 
   /// Now connect to the server
-  PCHECK(connect(sockfd, (sockaddr *)&serv_addr, sizeof(serv_addr)) >= 0)
-      << "ERROR connecting to " << serverAddr;
+  int retry_second = 0;
+  int error = 0;
+  do {
+    /// connect() returns -1 on failure and reports the reason via errno
+    const int ret = connect(sockfd, (sockaddr *)&serv_addr, sizeof(serv_addr));
+    error = (ret < 0) ? errno : 0;
+    if (error == ECONNREFUSED) {
+      LOG(WARNING) << "connection refused by pserver, try again!";
+      if (retry_second++ >= FLAGS_connrefused_retries_second) {
+        LOG(FATAL) << "connection refused by pserver, maybe pserver failed!";
+      }
+      std::this_thread::sleep_for(std::chrono::seconds(1));
+    } else {
+      PCHECK(ret >= 0) << "ERROR connecting to " << serverAddr;
+    }
+  } while (error == ECONNREFUSED);
 
   channel_.reset(new SocketChannel(sockfd, serverAddr));
   tcpRdma_ = F_TCP;
-- 
GitLab