未验证 提交 f29fb396 编写于 作者: D danleifeng 提交者: GitHub

dygraph: NCCL init now supports host domain names (#28107)

* NCCL init supports both hostnames and IP addresses; test=develop
上级 5cd97a1c
...@@ -100,7 +100,19 @@ void NCCLParallelContext::SendNCCLID(const std::string &ep, ...@@ -100,7 +100,19 @@ void NCCLParallelContext::SendNCCLID(const std::string &ep,
serv_addr.sin_family = AF_INET; serv_addr.sin_family = AF_INET;
serv_addr.sin_port = htons(port); serv_addr.sin_port = htons(port);
if (inet_pton(AF_INET, host.c_str(), &serv_addr.sin_addr) <= 0) { char *ip = NULL;
struct hostent *hp;
if ((hp = gethostbyname(host.c_str())) == NULL) {
PADDLE_THROW(platform::errors::InvalidArgument(
"Fail to get host by name %s.", host));
}
int i = 0;
while (hp->h_addr_list[i] != NULL) {
ip = inet_ntoa(*(struct in_addr *)hp->h_addr_list[i]);
VLOG(3) << "gethostbyname host:" << host << " ->ip: " << ip;
break;
}
if (inet_pton(AF_INET, ip, &serv_addr.sin_addr) <= 0) {
PADDLE_THROW(platform::errors::Unavailable("Open address %s failed.", ep)); PADDLE_THROW(platform::errors::Unavailable("Open address %s failed.", ep));
} }
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
// network header files // network header files
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
#include <arpa/inet.h> #include <arpa/inet.h>
#include <netdb.h>
#include <netinet/in.h> #include <netinet/in.h>
#include <stdlib.h> #include <stdlib.h>
#include <sys/socket.h> #include <sys/socket.h>
......
...@@ -20,7 +20,7 @@ namespace imperative = paddle::imperative; ...@@ -20,7 +20,7 @@ namespace imperative = paddle::imperative;
namespace platform = paddle::platform; namespace platform = paddle::platform;
imperative::ParallelStrategy GetStrategy(int local_rank) { imperative::ParallelStrategy GetStrategy(int local_rank) {
std::vector<std::string> eps = {"127.0.0.1:9866", "127.0.0.1:9867"}; std::vector<std::string> eps = {"127.0.0.1:9866", "localhost:9867"};
imperative::ParallelStrategy strategy; imperative::ParallelStrategy strategy;
strategy.trainer_endpoints_ = eps; strategy.trainer_endpoints_ = eps;
strategy.current_endpoint_ = eps[local_rank]; strategy.current_endpoint_ = eps[local_rank];
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册