未验证 提交 c42e6561 编写于 作者: C Chen Weihang 提交者: GitHub

Add retry for dygraph parallel socket bind (#28404)

* add retry for dygraph parallel socket bind

* change to always loop (retry until bind succeeds)

* fix writing error
上级 c41fd033
......@@ -48,9 +48,21 @@ void NCCLParallelContext::RecvNCCLID(const std::string &ep,
address.sin_addr.s_addr = INADDR_ANY;
address.sin_port = htons(port);
if (bind(server_fd, (struct sockaddr *)&address, sizeof(address)) < 0) {
PADDLE_THROW(
platform::errors::Unavailable("Bind on endpoint %s failed.", ep));
int try_times = 0;
while (true) {
if (bind(server_fd, (struct sockaddr *)&address, sizeof(address)) < 0) {
LOG(WARNING) << "Socket bind worker " << ep
<< (try_times < 5 ? " failed, try again after 3 seconds."
: " failed, try again after 3 seconds. "
"Bind on endpoint %s failed. "
"Please confirm whether the "
"communication port or GPU card is "
"occupied.");
std::this_thread::sleep_for(std::chrono::seconds(3));
++try_times;
continue;
}
break;
}
VLOG(3) << "listening on: " << ep;
......@@ -119,13 +131,13 @@ void NCCLParallelContext::SendNCCLID(const std::string &ep,
int try_times = 0;
while (true) {
if (connect(sock, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) < 0) {
VLOG(0) << "worker: " << ep
<< (try_times < 5 ? " is not ready, will retry after 3 seconds..."
: " is not ready. Maybe that some process "
"is occupied the GPUs of this node now, "
"and you should kill those process manually. "
"Will retry after 3 seconds...");
LOG(WARNING)
<< "Socket connect worker " << ep
<< (try_times < 5
? " failed, try again after 3 seconds."
: " failed, try again after 3 seconds. Maybe that "
"some process is occupied the GPUs of this node "
"now, and you should kill those process manually.");
std::this_thread::sleep_for(std::chrono::seconds(3));
++try_times;
continue;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册