Unverified commit acc11c2a, authored by Huihuang Zheng, committed via GitHub

Retry CUDA Initialization to Fix Random Failure, test=develop (#28323)

This PR is a follow-up to #28213. In that PR we tried to decrease GPU usage, but the CI still failed randomly. This PR therefore adds retry logic to the initialization of NCCL and cuSOLVER: if initialization fails, the call is retried instead of failing immediately, which avoids the random failure.
Parent 5262b025
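The change applies one idea throughout: when a GPU-related initialization call fails, evaluate the call again a few times before treating the failure as fatal. A minimal standalone sketch of that idea is below, using a hypothetical `RetryStatusCall` helper for illustration only; the PR itself implements the pattern as the `PADDLE_RETRY_CUDA_SUCCESS` macro shown in the diff.

```cpp
// Sketch only: retry a status-returning call up to max_attempts times.
// RetryStatusCall is a hypothetical helper, not part of Paddle.
#include <functional>

template <typename Status>
bool RetryStatusCall(const std::function<Status()>& call, Status success,
                     int max_attempts = 5) {
  Status status = call();
  int attempts = 1;
  while (status != success && attempts < max_attempts) {
    status = call();  // transient failures may clear on a later attempt
    ++attempts;
  }
  return status == success;
}
```

In the PR the same behavior is written as a macro so the original call expression (`COND`) can be re-evaluated verbatim and the existing CUDA status and error-reporting machinery can be reused.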
......@@ -214,8 +214,8 @@ class CUDAContext {
             << "Please recompile or reinstall Paddle with compatible CUDNN "
                "version.";
       }
-      PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreate(&cudnn_handle_));
-      PADDLE_ENFORCE_CUDA_SUCCESS(
+      PADDLE_RETRY_CUDA_SUCCESS(dynload::cudnnCreate(&cudnn_handle_));
+      PADDLE_RETRY_CUDA_SUCCESS(
           dynload::cudnnSetStream(cudnn_handle_, RawStream()));
     } else {
       cudnn_handle_ = nullptr;
......@@ -223,9 +223,8 @@ class CUDAContext {
   }
   void InitCuSolverContext() {
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        dynload::cusolverDnCreate(&cusolver_dn_handle_));
-    PADDLE_ENFORCE_CUDA_SUCCESS(
+    PADDLE_RETRY_CUDA_SUCCESS(dynload::cusolverDnCreate(&cusolver_dn_handle_));
+    PADDLE_RETRY_CUDA_SUCCESS(
         dynload::cusolverDnSetStream(cusolver_dn_handle_, RawStream()));
   }
......
......@@ -904,6 +904,25 @@ DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess);
     }                                                                   \
   } while (0)
+
+#define PADDLE_RETRY_CUDA_SUCCESS(COND)                                 \
+  do {                                                                  \
+    auto __cond__ = (COND);                                             \
+    int retry_count = 1;                                                \
+    using __CUDA_STATUS_TYPE__ = decltype(__cond__);                    \
+    constexpr auto __success_type__ =                                   \
+        ::paddle::platform::details::CudaStatusType<                    \
+            __CUDA_STATUS_TYPE__>::kSuccess;                            \
+    while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \
+      __cond__ = (COND);                                                \
+      ++retry_count;                                                    \
+    }                                                                   \
+    if (UNLIKELY(__cond__ != __success_type__)) {                       \
+      auto __summary__ = ::paddle::platform::errors::External(          \
+          ::paddle::platform::build_nvidia_error_msg(__cond__));        \
+      __THROW_ERROR_INTERNAL__(__summary__);                            \
+    }                                                                   \
+  } while (0)
 #undef DEFINE_CUDA_STATUS_TYPE
 #endif  // PADDLE_WITH_CUDA
......
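To make the new macro's behavior concrete, here is roughly what a call site such as `PADDLE_RETRY_CUDA_SUCCESS(dynload::cusolverDnCreate(&cusolver_dn_handle_))` boils down to, written out by hand as a sketch. The real macro derives the success constant through `CudaStatusType<...>::kSuccess`, builds the message with `build_nvidia_error_msg`, and throws through Paddle's enforce machinery rather than `std::runtime_error`; the function name below is invented for illustration.

```cpp
// Hand-expanded sketch of the retry behavior for a cuSOLVER handle;
// assumes plain CUDA toolkit headers, not Paddle's dynload wrappers.
#include <cusolverDn.h>
#include <stdexcept>

void CreateCusolverHandleWithRetry(cusolverDnHandle_t* handle) {
  cusolverStatus_t status = cusolverDnCreate(handle);
  int retry_count = 1;
  // Up to 5 attempts in total, matching "retry_count < 5" in the macro.
  while (status != CUSOLVER_STATUS_SUCCESS && retry_count < 5) {
    status = cusolverDnCreate(handle);
    ++retry_count;
  }
  if (status != CUSOLVER_STATUS_SUCCESS) {
    throw std::runtime_error("cusolverDnCreate still failing after 5 attempts");
  }
}
```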
......@@ -114,7 +114,7 @@ struct NCCLContextMap {
     // if num_trainers == 1, should create a new nccl id for local comms.
     if (num_trainers == 1 && nccl_id == nullptr) {
       std::lock_guard<std::mutex> guard(NCCLGroupGuard::NCCLMutex());
-      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclCommInitAll(
+      PADDLE_RETRY_CUDA_SUCCESS(platform::dynload::ncclCommInitAll(
           comms.get(), static_cast<int>(order_.size()), order_.data()));
     } else {
       PADDLE_ENFORCE_NOT_NULL(nccl_id, platform::errors::InvalidArgument(
......@@ -132,8 +132,8 @@ struct NCCLContextMap {
       }
       VLOG(1) << "init nccl rank:" << rank << ", nranks:" << nranks
               << ", gpu_id:" << gpu_id << ", dev_id:" << order_[i];
-      PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(gpu_id));
-      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclCommInitRank(
+      PADDLE_RETRY_CUDA_SUCCESS(cudaSetDevice(gpu_id));
+      PADDLE_RETRY_CUDA_SUCCESS(platform::dynload::ncclCommInitRank(
           comms.get() + i, nranks, *nccl_id, rank));
     }
   }
......
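One macro covers the cuDNN, cuSOLVER, CUDA runtime, and NCCL call sites in this PR because each status type is registered together with its success value (for example `DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess)` in the hunk above), and the macro looks that value up via `decltype` on the call result. Below is a simplified sketch of that trait idea; it is not Paddle's actual `CudaStatusType` machinery, and the `StatusTrait` name is invented for illustration.

```cpp
// Sketch of a status-type trait: map each status enum to its success value
// so one generic retry/enforce macro can compare against it via decltype.
#include <cuda_runtime.h>
#include <nccl.h>

template <typename Status>
struct StatusTrait;  // primary template intentionally left undefined

template <>
struct StatusTrait<cudaError_t> {
  static constexpr cudaError_t kSuccess = cudaSuccess;
};

template <>
struct StatusTrait<ncclResult_t> {
  static constexpr ncclResult_t kSuccess = ncclSuccess;
};
```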
......@@ -36,7 +36,7 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase):
         opt = fluid.optimizer.SGD(learning_rate=0.001)
         opt.minimize(loss)
-        batch_size = 16
+        batch_size = 32
         image = np.random.normal(size=(batch_size, 784)).astype('float32')
         label = np.random.randint(0, 10, (batch_size, 1), dtype="int64")
......