diff --git a/paddle/pserver/LightNetwork.cpp b/paddle/pserver/LightNetwork.cpp index 1830170a163fa47114c75a2a88a731ea31060142..9b288604aa9c6c2232bb1556740e4b06c9e24454 100644 --- a/paddle/pserver/LightNetwork.cpp +++ b/paddle/pserver/LightNetwork.cpp @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include +#include #include #include @@ -49,6 +50,10 @@ P_DEFINE_int32(sock_recv_buf_size, 1024 * 1024 * 40, "restrict sock recv buff size"); +P_DEFINE_int32(connrefused_retries_second, + 10, + "retry connrefused_retries_second if ECONNREFUSED occurs"); + namespace paddle { /** @@ -382,8 +387,20 @@ void SocketClient::TcpClient(const std::string &serverAddr, int serverPort) { setOption(sockfd); /// Now connect to the server - PCHECK(connect(sockfd, (sockaddr *)&serv_addr, sizeof(serv_addr)) >= 0) - << "ERROR connecting to " << serverAddr; + int retry_second = 0; + int error = 0; + do { + error = connect(sockfd, (sockaddr *)&serv_addr, sizeof(serv_addr)); + if (error == ECONNREFUSED) { + LOG(WARNING) << "connection refused by pserver, try again!"; + if (retry_second++ >= FLAGS_connrefused_retries_second) { + LOG(FATAL) << "connection refused by pserver, maybe pserver failed!"; + } + std::this_thread::sleep_for(std::chrono::seconds(1)); + } else { + PCHECK(error >= 0) << "ERROR connecting to " << serverAddr; + } + } while (error == ECONNREFUSED); channel_.reset(new SocketChannel(sockfd, serverAddr)); tcpRdma_ = F_TCP;