From c42e656179ec9c557848d97c2af4fa78375d4cfc Mon Sep 17 00:00:00 2001
From: Chen Weihang
Date: Thu, 5 Nov 2020 14:18:10 +0800
Subject: [PATCH] Add retry for dygraph parallel socket bind (#28404)

* add retry for dygraph parallel socket bind

* change to loop always

* fix writing error
---
 paddle/fluid/imperative/nccl_context.cc | 32 +++++++++++++++++--------
 1 file changed, 22 insertions(+), 10 deletions(-)

diff --git a/paddle/fluid/imperative/nccl_context.cc b/paddle/fluid/imperative/nccl_context.cc
index 9ffec11354d..abee311d08c 100644
--- a/paddle/fluid/imperative/nccl_context.cc
+++ b/paddle/fluid/imperative/nccl_context.cc
@@ -48,9 +48,21 @@ void NCCLParallelContext::RecvNCCLID(const std::string &ep,
   address.sin_addr.s_addr = INADDR_ANY;
   address.sin_port = htons(port);
 
-  if (bind(server_fd, (struct sockaddr *)&address, sizeof(address)) < 0) {
-    PADDLE_THROW(
-        platform::errors::Unavailable("Bind on endpoint %s failed.", ep));
+  int try_times = 0;
+  while (true) {
+    if (bind(server_fd, (struct sockaddr *)&address, sizeof(address)) < 0) {
+      LOG(WARNING) << "Socket bind worker " << ep
+                   << (try_times < 5 ? " failed, try again after 3 seconds."
+                                     : " failed, try again after 3 seconds. "
+                                       "Bind on endpoint %s failed. "
+                                       "Please confirm whether the "
+                                       "communication port or GPU card is "
+                                       "occupied.");
+      std::this_thread::sleep_for(std::chrono::seconds(3));
+      ++try_times;
+      continue;
+    }
+    break;
   }
 
   VLOG(3) << "listening on: " << ep;
@@ -119,13 +131,13 @@ void NCCLParallelContext::SendNCCLID(const std::string &ep,
   int try_times = 0;
   while (true) {
     if (connect(sock, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) < 0) {
-      VLOG(0) << "worker: " << ep
-              << (try_times < 5 ? " is not ready, will retry after 3 seconds..."
-                                : " is not ready. Maybe that some process "
-                                  "is occupied the GPUs of this node now, "
-                                  "and you should kill those process manually. "
-                                  "Will retry after 3 seconds...");
-
+      LOG(WARNING)
+          << "Socket connect worker " << ep
+          << (try_times < 5
+                  ? " failed, try again after 3 seconds."
+                  : " failed, try again after 3 seconds. Maybe that "
+                    "some process is occupied the GPUs of this node "
+                    "now, and you should kill those process manually.");
       std::this_thread::sleep_for(std::chrono::seconds(3));
       ++try_times;
       continue;
-- 
GitLab