diff --git a/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc b/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc
index 15940a76f71105a57865c0c8e00b404d087e9485..e1a0c7fd29506b9b3eacb0496941fd042aee1f84 100644
--- a/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc
+++ b/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc
@@ -34,6 +34,8 @@ limitations under the License. */
 #include "paddle/fluid/platform/hccl_helper.h"
 #endif
 
+DECLARE_int32(get_host_by_name_time);
+
 namespace paddle {
 namespace operators {
 
@@ -226,7 +228,22 @@ static int ConnectAddr(const std::string& ep, const char* head) {
 
   char* ip = NULL;
   struct hostent* hp = NULL;
-  hp = gethostbyname(host.c_str());
+  // Name resolution may fail transiently, so retry every kRetryIntervalSec
+  // seconds until about FLAGS_get_host_by_name_time seconds have been spent
+  // waiting. The first lookup is unconditional, so a non-positive flag value
+  // still tries once, and there is no sleep after the final failed lookup.
+  const int kRetryIntervalSec = 2;
+  for (int waited = 0;; waited += kRetryIntervalSec) {
+    hp = gethostbyname(host.c_str());
+    if (hp != NULL) {
+      break;
+    }
+    LOG(WARNING) << "gethostbyname " << host.c_str() << " error!";
+    if (FLAGS_get_host_by_name_time - waited <= kRetryIntervalSec) {
+      break;
+    }
+    std::this_thread::sleep_for(std::chrono::seconds(kRetryIntervalSec));
+  }
   PADDLE_ENFORCE_NOT_NULL(hp, platform::errors::InvalidArgument(
                                   "Fail to get host by name %s.", host));
 
diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc
index ae4a7b8b67263e90f174b022ad1d7cb153a39e64..33d9c6efef852d0298a27eca6dfacdd0df18f159 100644
--- a/paddle/fluid/platform/flags.cc
+++ b/paddle/fluid/platform/flags.cc
@@ -606,3 +606,18 @@ DEFINE_bool(check_kernel_launch, false,
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 DEFINE_bool(conv2d_disable_cudnn, false, "Disable cudnn in conv2d");
 #endif
+
+/**
+ * Distributed related FLAG
+ * Name: FLAGS_get_host_by_name_time
+ * Since Version: 2.2.0
+ * Value Range: int32, default=120
+ * Example:
+ * Note: Maximum time, in seconds, that ConnectAddr keeps retrying
+ *       gethostbyname (one attempt every 2 seconds) before giving up.
+ */
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_XPU) || \
+    defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_HIP)
+DEFINE_int32(get_host_by_name_time, 120,
+             "The maximum time for get host by name time");
+#endif
diff --git a/paddle/fluid/platform/gen_comm_id_helper.cc b/paddle/fluid/platform/gen_comm_id_helper.cc
index 73bc2c41a0bc9c568f0c90e68adb471406d92a03..e9fe2a38c6c43cea391516187b6bcbaccd471c38 100644
--- a/paddle/fluid/platform/gen_comm_id_helper.cc
+++ b/paddle/fluid/platform/gen_comm_id_helper.cc
@@ -37,6 +37,8 @@ limitations under the License. */
 #include "paddle/fluid/platform/collective_helper.h"
 #endif
 
+DECLARE_int32(get_host_by_name_time);
+
 namespace paddle {
 namespace platform {
 
@@ -236,7 +238,23 @@ static int ConnectAddr(const std::string& ep, const CommHead head) {
 
   char* ip = NULL;
   struct hostent* hp = NULL;
-  hp = gethostbyname(host.c_str());
+
+  // Name resolution may fail transiently, so retry every kRetryIntervalSec
+  // seconds until about FLAGS_get_host_by_name_time seconds have been spent
+  // waiting. The first lookup is unconditional, so a non-positive flag value
+  // still tries once, and there is no sleep after the final failed lookup.
+  const int kRetryIntervalSec = 2;
+  for (int waited = 0;; waited += kRetryIntervalSec) {
+    hp = gethostbyname(host.c_str());
+    if (hp != NULL) {
+      break;
+    }
+    LOG(WARNING) << "gethostbyname " << host.c_str() << " error!";
+    if (FLAGS_get_host_by_name_time - waited <= kRetryIntervalSec) {
+      break;
+    }
+    std::this_thread::sleep_for(std::chrono::seconds(kRetryIntervalSec));
+  }
   PADDLE_ENFORCE_NOT_NULL(hp, platform::errors::InvalidArgument(
                                   "Fail to get host by name %s.", host));
 
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index fcb2641710facb95d9ae71152a8d02844afb629a..5d1274a1f05324f0d3187a4802c564aad8ed2314 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -236,6 +236,7 @@ def __bootstrap__():
             'local_exe_sub_scope_limit',
             'gpu_memory_limit_mb',
             'conv2d_disable_cudnn',
+            'get_host_by_name_time',
         ]
 
     if core.is_compiled_with_npu():
@@ -246,6 +247,7 @@ def __bootstrap__():
             'reallocate_gpu_memory_in_mb',
             'gpu_memory_limit_mb',
             'npu_config_path',
+            'get_host_by_name_time',
         ]
 
     core.init_gflags(["--tryfromenv=" + ",".join(read_env_flags)])