未验证 提交 c42e6561 编写于 作者: C Chen Weihang 提交者: GitHub

Add retry for dygraph parallel socket bind (#28404)

* add retry for dygraph parallel socket bind

* change to always loop

* fix writing error
上级 c41fd033
...@@ -48,9 +48,21 @@ void NCCLParallelContext::RecvNCCLID(const std::string &ep, ...@@ -48,9 +48,21 @@ void NCCLParallelContext::RecvNCCLID(const std::string &ep,
address.sin_addr.s_addr = INADDR_ANY; address.sin_addr.s_addr = INADDR_ANY;
address.sin_port = htons(port); address.sin_port = htons(port);
if (bind(server_fd, (struct sockaddr *)&address, sizeof(address)) < 0) { int try_times = 0;
PADDLE_THROW( while (true) {
platform::errors::Unavailable("Bind on endpoint %s failed.", ep)); if (bind(server_fd, (struct sockaddr *)&address, sizeof(address)) < 0) {
LOG(WARNING) << "Socket bind worker " << ep
<< (try_times < 5 ? " failed, try again after 3 seconds."
: " failed, try again after 3 seconds. "
"Bind on endpoint %s failed. "
"Please confirm whether the "
"communication port or GPU card is "
"occupied.");
std::this_thread::sleep_for(std::chrono::seconds(3));
++try_times;
continue;
}
break;
} }
VLOG(3) << "listening on: " << ep; VLOG(3) << "listening on: " << ep;
...@@ -119,13 +131,13 @@ void NCCLParallelContext::SendNCCLID(const std::string &ep, ...@@ -119,13 +131,13 @@ void NCCLParallelContext::SendNCCLID(const std::string &ep,
int try_times = 0; int try_times = 0;
while (true) { while (true) {
if (connect(sock, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) < 0) { if (connect(sock, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) < 0) {
VLOG(0) << "worker: " << ep LOG(WARNING)
<< (try_times < 5 ? " is not ready, will retry after 3 seconds..." << "Socket connect worker " << ep
: " is not ready. Maybe that some process " << (try_times < 5
"is occupied the GPUs of this node now, " ? " failed, try again after 3 seconds."
"and you should kill those process manually. " : " failed, try again after 3 seconds. Maybe that "
"Will retry after 3 seconds..."); "some process is occupied the GPUs of this node "
"now, and you should kill those process manually.");
std::this_thread::sleep_for(std::chrono::seconds(3)); std::this_thread::sleep_for(std::chrono::seconds(3));
++try_times; ++try_times;
continue; continue;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册