未验证 提交 cca26f5c 编写于 作者: C chengduo 提交者: GitHub

Polish multi-process warning info (#19961)

test=develop
上级 2efdf0ef
......@@ -82,11 +82,18 @@ void NCCLParallelContext::SendNCCLID(const std::string &ep,
if (inet_pton(AF_INET, host.c_str(), &serv_addr.sin_addr) <= 0)
PADDLE_THROW("invalied address: %s", ep);
int try_times = 0;
while (true) {
if (connect(sock, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) < 0) {
VLOG(0) << "worker: " << ep
<< " is not ready, will retry after 3 seconds...";
<< (try_times < 5 ? " is not ready, will retry after 3 seconds..."
: " is not ready. Maybe that some process "
"is occupied the GPUs of this node now, "
"and you should kill those process manually. "
"Will retry after 3 seconds...");
std::this_thread::sleep_for(std::chrono::seconds(3));
++try_times;
continue;
}
VLOG(3) << "sending the ncclUniqueId to " << ep;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册