未验证 提交 e92f0388 编写于 作者: B Baibaifan 提交者: GitHub

add retry for gethostbyname (#34855)

上级 2164ad61
......@@ -34,6 +34,8 @@ limitations under the License. */
#include "paddle/fluid/platform/hccl_helper.h"
#endif
DECLARE_int32(get_host_by_name_time);
namespace paddle {
namespace operators {
......@@ -226,7 +228,15 @@ static int ConnectAddr(const std::string& ep, const char* head) {
char* ip = NULL;
struct hostent* hp = NULL;
hp = gethostbyname(host.c_str());
// Retry gethostbyname every 2 seconds, for up to get_host_by_name_time seconds in total.
for (int i = 0; 2 * i < FLAGS_get_host_by_name_time; i++) {
hp = gethostbyname(host.c_str());
if (hp != NULL) {
break;
}
std::this_thread::sleep_for(std::chrono::seconds(2));
LOG(WARNING) << "gethostbyname " << host.c_str() << " error!";
}
PADDLE_ENFORCE_NOT_NULL(hp, platform::errors::InvalidArgument(
"Fail to get host by name %s.", host));
......
......@@ -606,3 +606,17 @@ DEFINE_bool(check_kernel_launch, false,
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
DEFINE_bool(conv2d_disable_cudnn, false, "Disable cudnn in conv2d");
#endif
/**
 * Distributed related FLAG
 * Name: FLAGS_get_host_by_name_time
 * Since Version: 2.2.0
 * Value Range: int32, default=120
 * Example: FLAGS_get_host_by_name_time=120 allows host-name resolution to be
 *          retried for up to roughly 120 seconds.
 * Note: Upper bound, in seconds, on the total time spent retrying
 *       gethostbyname() in ConnectAddr. A failed lookup is retried every
 *       2 seconds until it succeeds or this time budget is used up.
 *       The flag is only defined in CUDA, XPU, Ascend CL or HIP builds.
 */
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_XPU) || \
defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_HIP)
DEFINE_int32(get_host_by_name_time, 120,
"The maximum time for get host by name time");
#endif
......@@ -37,6 +37,8 @@ limitations under the License. */
#include "paddle/fluid/platform/collective_helper.h"
#endif
DECLARE_int32(get_host_by_name_time);
namespace paddle {
namespace platform {
......@@ -236,7 +238,16 @@ static int ConnectAddr(const std::string& ep, const CommHead head) {
char* ip = NULL;
struct hostent* hp = NULL;
hp = gethostbyname(host.c_str());
// Retry gethostbyname every 2 seconds, for up to get_host_by_name_time seconds in total.
for (int i = 0; 2 * i < FLAGS_get_host_by_name_time; i++) {
hp = gethostbyname(host.c_str());
if (hp != NULL) {
break;
}
std::this_thread::sleep_for(std::chrono::seconds(2));
LOG(WARNING) << "gethostbyname " << host.c_str() << " error!";
}
PADDLE_ENFORCE_NOT_NULL(hp, platform::errors::InvalidArgument(
"Fail to get host by name %s.", host));
......
......@@ -236,6 +236,7 @@ def __bootstrap__():
'local_exe_sub_scope_limit',
'gpu_memory_limit_mb',
'conv2d_disable_cudnn',
'get_host_by_name_time',
]
if core.is_compiled_with_npu():
......@@ -246,6 +247,7 @@ def __bootstrap__():
'reallocate_gpu_memory_in_mb',
'gpu_memory_limit_mb',
'npu_config_path',
'get_host_by_name_time',
]
core.init_gflags(["--tryfromenv=" + ",".join(read_env_flags)])
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册