Unverified commit acc11c2a, authored by H Huihuang Zheng, committed by GitHub

Retry CUDA Initialization to Fix Random Failure, test=develop (#28323)

This PR is a follow-up to #28213. In that PR we tried to decrease GPU usage, but the CI still failed randomly. So I added retry logic to the initialization of NCCL, cuDNN, and cuSOLVER: if initialization fails, we retry a few times to avoid the random failure.
Parent 5262b025
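The change below swaps `PADDLE_ENFORCE_CUDA_SUCCESS` for a new `PADDLE_RETRY_CUDA_SUCCESS` macro at the flaky initialization call sites. As a minimal, standalone sketch of that retry-then-fail idea (the names `RetryInit` and `init_fn` are hypothetical and for illustration only, not part of Paddle):

```cpp
#include <functional>
#include <iostream>
#include <stdexcept>

// Hypothetical illustration of the pattern: invoke a flaky initialization
// routine up to `max_attempts` times and fail only if every attempt fails.
bool RetryInit(const std::function<bool()>& init_fn, int max_attempts = 5) {
  for (int attempt = 1; attempt <= max_attempts; ++attempt) {
    if (init_fn()) return true;  // initialization succeeded, stop retrying
    std::cerr << "init attempt " << attempt << " failed, retrying\n";
  }
  throw std::runtime_error("initialization failed after all retries");
}

int main() {
  int failures_left = 2;
  // Fails twice, then succeeds, mimicking a transient CUDA/NCCL init failure.
  RetryInit([&]() { return failures_left-- <= 0; });
  std::cout << "initialized\n";
}
```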
@@ -214,8 +214,8 @@ class CUDAContext {
           << "Please recompile or reinstall Paddle with compatible CUDNN "
              "version.";
     }
-    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreate(&cudnn_handle_));
-    PADDLE_ENFORCE_CUDA_SUCCESS(
+    PADDLE_RETRY_CUDA_SUCCESS(dynload::cudnnCreate(&cudnn_handle_));
+    PADDLE_RETRY_CUDA_SUCCESS(
         dynload::cudnnSetStream(cudnn_handle_, RawStream()));
   } else {
     cudnn_handle_ = nullptr;
@@ -223,9 +223,8 @@ class CUDAContext {
   }
   void InitCuSolverContext() {
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        dynload::cusolverDnCreate(&cusolver_dn_handle_));
-    PADDLE_ENFORCE_CUDA_SUCCESS(
+    PADDLE_RETRY_CUDA_SUCCESS(dynload::cusolverDnCreate(&cusolver_dn_handle_));
+    PADDLE_RETRY_CUDA_SUCCESS(
         dynload::cusolverDnSetStream(cusolver_dn_handle_, RawStream()));
   }
......
@@ -904,6 +904,25 @@ DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess);
     }                                                                    \
   } while (0)
+
+#define PADDLE_RETRY_CUDA_SUCCESS(COND)                                  \
+  do {                                                                   \
+    auto __cond__ = (COND);                                              \
+    int retry_count = 1;                                                 \
+    using __CUDA_STATUS_TYPE__ = decltype(__cond__);                     \
+    constexpr auto __success_type__ =                                    \
+        ::paddle::platform::details::CudaStatusType<                     \
+            __CUDA_STATUS_TYPE__>::kSuccess;                             \
+    while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) {  \
+      __cond__ = (COND);                                                 \
+      ++retry_count;                                                     \
+    }                                                                    \
+    if (UNLIKELY(__cond__ != __success_type__)) {                        \
+      auto __summary__ = ::paddle::platform::errors::External(           \
+          ::paddle::platform::build_nvidia_error_msg(__cond__));         \
+      __THROW_ERROR_INTERNAL__(__summary__);                             \
+    }                                                                    \
+  } while (0)
 #undef DEFINE_CUDA_STATUS_TYPE
 #endif  // PADDLE_WITH_CUDA
......
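To make the control flow of `PADDLE_RETRY_CUDA_SUCCESS` concrete: the condition expression is re-evaluated on each retry, at most five evaluations are made in total, and an error is raised only if the last evaluation still fails. The stub below mimics that flow with a plain `int` status (0 meaning success); it is an illustrative sketch under that assumption, not the Paddle implementation, and it omits `UNLIKELY`, `CudaStatusType`, and `__THROW_ERROR_INTERNAL__`.

```cpp
#include <iostream>
#include <stdexcept>

// Illustrative stand-in for the retry macro: keep re-evaluating COND while it
// fails, capped at 5 evaluations in total, then throw if it still fails.
#define RETRY_SUCCESS_SKETCH(COND)                                  \
  do {                                                              \
    auto __cond__ = (COND);                                         \
    int retry_count = 1;                                            \
    while (__cond__ != 0 && retry_count < 5) {                      \
      __cond__ = (COND); /* re-invokes the underlying call */       \
      ++retry_count;                                                \
    }                                                               \
    if (__cond__ != 0) {                                            \
      throw std::runtime_error("still failing after all retries");  \
    }                                                               \
  } while (0)

// Fails twice, then succeeds; the macro above absorbs the transient failures.
int FlakyInit() {
  static int remaining_failures = 2;
  return remaining_failures-- > 0 ? 1 : 0;
}

int main() {
  RETRY_SUCCESS_SKETCH(FlakyInit());
  std::cout << "FlakyInit eventually succeeded\n";
}
```

Note that `COND` is the full call expression, so each retry re-invokes the underlying CUDA/NCCL/cuSOLVER initialization rather than re-checking a stale status value.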
@@ -114,7 +114,7 @@ struct NCCLContextMap {
       // if num_trainers == 1, should create a new nccl id for local comms.
       if (num_trainers == 1 && nccl_id == nullptr) {
         std::lock_guard<std::mutex> guard(NCCLGroupGuard::NCCLMutex());
-        PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclCommInitAll(
+        PADDLE_RETRY_CUDA_SUCCESS(platform::dynload::ncclCommInitAll(
             comms.get(), static_cast<int>(order_.size()), order_.data()));
       } else {
         PADDLE_ENFORCE_NOT_NULL(nccl_id, platform::errors::InvalidArgument(
@@ -132,8 +132,8 @@ struct NCCLContextMap {
         }
         VLOG(1) << "init nccl rank:" << rank << ", nranks:" << nranks
                 << ", gpu_id:" << gpu_id << ", dev_id:" << order_[i];
-        PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(gpu_id));
-        PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclCommInitRank(
+        PADDLE_RETRY_CUDA_SUCCESS(cudaSetDevice(gpu_id));
+        PADDLE_RETRY_CUDA_SUCCESS(platform::dynload::ncclCommInitRank(
             comms.get() + i, nranks, *nccl_id, rank));
       }
     }
......
@@ -36,7 +36,7 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase):
         opt = fluid.optimizer.SGD(learning_rate=0.001)
         opt.minimize(loss)
-        batch_size = 16
+        batch_size = 32
         image = np.random.normal(size=(batch_size, 784)).astype('float32')
         label = np.random.randint(0, 10, (batch_size, 1), dtype="int64")
......