add retry for gethostbyname (#34855)

e92f0388 · Baibaifan · GitHub · 2164ad61 · e92f0388 · e92f0388
4 changed files
--- a/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc
+++ b/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc
@@ -34,6 +34,8 @@ limitations under the License. */
 #include "paddle/fluid/platform/hccl_helper.h"
 #endif

+DECLARE_int32(get_host_by_name_time);
+
 namespace paddle {
 namespace operators {

@@ -226,7 +228,15 @@ static int ConnectAddr(const std::string& ep, const char* head) {

  char* ip = NULL;
  struct hostent* hp = NULL;
-  hp = gethostbyname(host.c_str());
+  // Retry gethostbyname every 2 seconds, up to get_host_by_name_time seconds.
+  for (int i = 0; 2 * i < FLAGS_get_host_by_name_time; i++) {
+    hp = gethostbyname(host.c_str());
+    if (hp != NULL) {
+      break;
+    }
+    std::this_thread::sleep_for(std::chrono::seconds(2));
+    LOG(WARNING) << "gethostbyname " << host.c_str() << " error!";
+  }
  PADDLE_ENFORCE_NOT_NULL(hp, platform::errors::InvalidArgument(
                                  "Fail to get host by name %s.", host));


--- a/paddle/fluid/platform/flags.cc
+++ b/paddle/fluid/platform/flags.cc
@@ -606,3 +606,17 @@ DEFINE_bool(check_kernel_launch, false,
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 DEFINE_bool(conv2d_disable_cudnn, false, "Disable cudnn in conv2d");
 #endif
+
+/**
+ * Distributed related FLAG
+ * Name: FLAGS_get_host_by_name_time
+ * Since Version: 2.2.0
+ * Value Range: int32, default=120
+ * Example:
+ * Note: Maximum time, in seconds, to keep retrying gethostbyname.
+ */
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_XPU) || \
+    defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_HIP)
+DEFINE_int32(get_host_by_name_time, 120,
+             "The maximum time for get host by name time");
+#endif
--- a/paddle/fluid/platform/gen_comm_id_helper.cc
+++ b/paddle/fluid/platform/gen_comm_id_helper.cc
@@ -37,6 +37,8 @@ limitations under the License. */
 #include "paddle/fluid/platform/collective_helper.h"
 #endif

+DECLARE_int32(get_host_by_name_time);
+
 namespace paddle {
 namespace platform {

@@ -236,7 +238,16 @@ static int ConnectAddr(const std::string& ep, const CommHead head) {

  char* ip = NULL;
  struct hostent* hp = NULL;
-  hp = gethostbyname(host.c_str());
+
+  // Retry gethostbyname every 2 seconds, up to get_host_by_name_time seconds.
+  for (int i = 0; 2 * i < FLAGS_get_host_by_name_time; i++) {
+    hp = gethostbyname(host.c_str());
+    if (hp != NULL) {
+      break;
+    }
+    std::this_thread::sleep_for(std::chrono::seconds(2));
+    LOG(WARNING) << "gethostbyname " << host.c_str() << " error!";
+  }
  PADDLE_ENFORCE_NOT_NULL(hp, platform::errors::InvalidArgument(
                                  "Fail to get host by name %s.", host));


--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -236,6 +236,7 @@ def __bootstrap__():
            'local_exe_sub_scope_limit',
            'gpu_memory_limit_mb',
            'conv2d_disable_cudnn',
+            'get_host_by_name_time',
        ]

    if core.is_compiled_with_npu():
@@ -246,6 +247,7 @@ def __bootstrap__():
            'reallocate_gpu_memory_in_mb',
            'gpu_memory_limit_mb',
            'npu_config_path',
+            'get_host_by_name_time',
        ]

    core.init_gflags(["--tryfromenv=" + ",".join(read_env_flags)])