diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index e1438a1eefa62b238241c3185daf69d3418f2dc9..e8b1d587121dc7ed31dc3362c5061ec51a8dafde 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -214,8 +214,8 @@ class CUDAContext {
             << "Please recompile or reinstall Paddle with compatible CUDNN "
                "version.";
       }
-      PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreate(&cudnn_handle_));
-      PADDLE_ENFORCE_CUDA_SUCCESS(
+      PADDLE_RETRY_CUDA_SUCCESS(dynload::cudnnCreate(&cudnn_handle_));
+      PADDLE_RETRY_CUDA_SUCCESS(
           dynload::cudnnSetStream(cudnn_handle_, RawStream()));
     } else {
       cudnn_handle_ = nullptr;
@@ -223,9 +223,8 @@ class CUDAContext {
   }
 
   void InitCuSolverContext() {
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        dynload::cusolverDnCreate(&cusolver_dn_handle_));
-    PADDLE_ENFORCE_CUDA_SUCCESS(
+    PADDLE_RETRY_CUDA_SUCCESS(dynload::cusolverDnCreate(&cusolver_dn_handle_));
+    PADDLE_RETRY_CUDA_SUCCESS(
         dynload::cusolverDnSetStream(cusolver_dn_handle_, RawStream()));
   }
 
diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h
index 6a27249817027ad2d1b30c5a8a182fe411eac0b2..fc57d3a4d08ac2b910b62cf78fc41c1864dccf7d 100644
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -904,6 +904,25 @@ DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess);
     }                                                                    \
   } while (0)
 
+#define PADDLE_RETRY_CUDA_SUCCESS(COND)                                  \
+  do {                                                                   \
+    auto __cond__ = (COND);                                              \
+    int retry_count = 1;                                                 \
+    using __CUDA_STATUS_TYPE__ = decltype(__cond__);                     \
+    constexpr auto __success_type__ =                                    \
+        ::paddle::platform::details::CudaStatusType<                     \
+            __CUDA_STATUS_TYPE__>::kSuccess;                             \
+    while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) {  \
+      __cond__ = (COND);                                                 \
+      ++retry_count;                                                     \
+    }                                                                    \
+    if (UNLIKELY(__cond__ != __success_type__)) {                        \
+      auto __summary__ = ::paddle::platform::errors::External(           \
+          ::paddle::platform::build_nvidia_error_msg(__cond__));         \
+      __THROW_ERROR_INTERNAL__(__summary__);                             \
+    }                                                                    \
+  } while (0)
+
 #undef DEFINE_CUDA_STATUS_TYPE
 #endif  // PADDLE_WITH_CUDA
 
diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h
index 22550de5b3fadd4688f430f7641e35a7864ca6b4..c2f4d6ff2fffbdb60dffa0ff4e48790d78c56369 100644
--- a/paddle/fluid/platform/nccl_helper.h
+++ b/paddle/fluid/platform/nccl_helper.h
@@ -114,7 +114,7 @@ struct NCCLContextMap {
     // if num_trainers == 1, should create a new nccl id for local comms.
     if (num_trainers == 1 && nccl_id == nullptr) {
       std::lock_guard<std::mutex> guard(NCCLGroupGuard::NCCLMutex());
-      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclCommInitAll(
+      PADDLE_RETRY_CUDA_SUCCESS(platform::dynload::ncclCommInitAll(
           comms.get(), static_cast<int>(order_.size()), order_.data()));
     } else {
       PADDLE_ENFORCE_NOT_NULL(nccl_id, platform::errors::InvalidArgument(
@@ -132,8 +132,8 @@ struct NCCLContextMap {
       }
       VLOG(1) << "init nccl rank:" << rank << ", nranks:" << nranks
               << ", gpu_id:" << gpu_id << ", dev_id:" << order_[i];
-      PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(gpu_id));
-      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclCommInitRank(
+      PADDLE_RETRY_CUDA_SUCCESS(cudaSetDevice(gpu_id));
+      PADDLE_RETRY_CUDA_SUCCESS(platform::dynload::ncclCommInitRank(
           comms.get() + i, nranks, *nccl_id, rank));
     }
   }
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
index 76d93259a647eb3e60ca8bdccca5117a143362e4..fd47dc37e7694de3f088428d2fe677d65c8a784c 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
@@ -36,7 +36,7 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase):
         opt = fluid.optimizer.SGD(learning_rate=0.001)
         opt.minimize(loss)
 
-        batch_size = 16
+        batch_size = 32
         image = np.random.normal(size=(batch_size, 784)).astype('float32')
         label = np.random.randint(0, 10,
                                   (batch_size, 1), dtype="int64")