From cca26f5c42fe7862442000969ab571c9e46ae8ad Mon Sep 17 00:00:00 2001 From: chengduo <30176695+chengduoZH@users.noreply.github.com> Date: Wed, 25 Sep 2019 08:09:53 +0800 Subject: [PATCH] polish multi process warning info (#19961) test=develop --- paddle/fluid/imperative/nccl_context.cc | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/imperative/nccl_context.cc b/paddle/fluid/imperative/nccl_context.cc index d9630bd66d..ab612b2f15 100644 --- a/paddle/fluid/imperative/nccl_context.cc +++ b/paddle/fluid/imperative/nccl_context.cc @@ -82,11 +82,18 @@ void NCCLParallelContext::SendNCCLID(const std::string &ep, if (inet_pton(AF_INET, host.c_str(), &serv_addr.sin_addr) <= 0) PADDLE_THROW("invalied address: %s", ep); + int try_times = 0; while (true) { if (connect(sock, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) < 0) { VLOG(0) << "worker: " << ep - << " is not ready, will retry after 3 seconds..."; + << (try_times < 5 ? " is not ready, will retry after 3 seconds..." + : " is not ready. Maybe that some process " + "is occupied the GPUs of this node now, " + "and you should kill those process manually. " + "Will retry after 3 seconds..."); + std::this_thread::sleep_for(std::chrono::seconds(3)); + ++try_times; continue; } VLOG(3) << "sending the ncclUniqueId to " << ep; -- GitLab