Unverified commit acc11c2a, authored by Huihuang Zheng, committed via GitHub

Retry CUDA Initialization to Fix Random Failure, test=develop (#28323)

This PR is a follow-up to #28213. In that PR we tried to decrease GPU usage, but the CI still failed randomly. This PR therefore adds retry logic to the initialization of NCCL and cuSOLVER: if initialization fails, the call is retried instead of failing immediately, which avoids the random failure.
Parent 5262b025
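The change applies one idea throughout: when a GPU-related initialization call fails, evaluate the call again a few times before treating the failure as fatal. A minimal standalone sketch of that idea is below, using a hypothetical `RetryStatusCall` helper for illustration only; the PR itself implements the pattern as the `PADDLE_RETRY_CUDA_SUCCESS` macro shown in the diff.

```cpp
// Sketch only: retry a status-returning call up to max_attempts times.
// RetryStatusCall is a hypothetical helper, not part of Paddle.
#include <functional>

template <typename Status>
bool RetryStatusCall(const std::function<Status()>& call, Status success,
                     int max_attempts = 5) {
  Status status = call();
  int attempts = 1;
  while (status != success && attempts < max_attempts) {
    status = call();  // transient failures may clear on a later attempt
    ++attempts;
  }
  return status == success;
}
```

In the PR the same behavior is written as a macro so the original call expression (`COND`) can be re-evaluated verbatim and the existing CUDA status and error-reporting machinery can be reused.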
......@@ -214,8 +214,8 @@ class CUDAContext {
             << "Please recompile or reinstall Paddle with compatible CUDNN "
                "version.";
       }
-      PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreate(&cudnn_handle_));
-      PADDLE_ENFORCE_CUDA_SUCCESS(
+      PADDLE_RETRY_CUDA_SUCCESS(dynload::cudnnCreate(&cudnn_handle_));
+      PADDLE_RETRY_CUDA_SUCCESS(
           dynload::cudnnSetStream(cudnn_handle_, RawStream()));
     } else {
       cudnn_handle_ = nullptr;
......@@ -223,9 +223,8 @@ class CUDAContext {
   }
   void InitCuSolverContext() {
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        dynload::cusolverDnCreate(&cusolver_dn_handle_));
-    PADDLE_ENFORCE_CUDA_SUCCESS(
+    PADDLE_RETRY_CUDA_SUCCESS(dynload::cusolverDnCreate(&cusolver_dn_handle_));
+    PADDLE_RETRY_CUDA_SUCCESS(
         dynload::cusolverDnSetStream(cusolver_dn_handle_, RawStream()));
   }
......
......@@ -904,6 +904,25 @@ DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess);
     }                                                                   \
   } while (0)
+
+#define PADDLE_RETRY_CUDA_SUCCESS(COND)                                 \
+  do {                                                                  \
+    auto __cond__ = (COND);                                             \
+    int retry_count = 1;                                                \
+    using __CUDA_STATUS_TYPE__ = decltype(__cond__);                    \
+    constexpr auto __success_type__ =                                   \
+        ::paddle::platform::details::CudaStatusType<                    \
+            __CUDA_STATUS_TYPE__>::kSuccess;                            \
+    while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \
+      __cond__ = (COND);                                                \
+      ++retry_count;                                                    \
+    }                                                                   \
+    if (UNLIKELY(__cond__ != __success_type__)) {                       \
+      auto __summary__ = ::paddle::platform::errors::External(          \
+          ::paddle::platform::build_nvidia_error_msg(__cond__));        \
+      __THROW_ERROR_INTERNAL__(__summary__);                            \
+    }                                                                   \
+  } while (0)
 #undef DEFINE_CUDA_STATUS_TYPE
 #endif  // PADDLE_WITH_CUDA
......
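To make the new macro's behavior concrete, here is roughly what a call site such as `PADDLE_RETRY_CUDA_SUCCESS(dynload::cusolverDnCreate(&cusolver_dn_handle_))` boils down to, written out by hand as a sketch. The real macro derives the success constant through `CudaStatusType<...>::kSuccess`, builds the message with `build_nvidia_error_msg`, and throws through Paddle's enforce machinery rather than `std::runtime_error`; the function name below is invented for illustration.

```cpp
// Hand-expanded sketch of the retry behavior for a cuSOLVER handle;
// assumes plain CUDA toolkit headers, not Paddle's dynload wrappers.
#include <cusolverDn.h>
#include <stdexcept>

void CreateCusolverHandleWithRetry(cusolverDnHandle_t* handle) {
  cusolverStatus_t status = cusolverDnCreate(handle);
  int retry_count = 1;
  // Up to 5 attempts in total, matching "retry_count < 5" in the macro.
  while (status != CUSOLVER_STATUS_SUCCESS && retry_count < 5) {
    status = cusolverDnCreate(handle);
    ++retry_count;
  }
  if (status != CUSOLVER_STATUS_SUCCESS) {
    throw std::runtime_error("cusolverDnCreate still failing after 5 attempts");
  }
}
```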
......@@ -114,7 +114,7 @@ struct NCCLContextMap {
     // if num_trainers == 1, should create a new nccl id for local comms.
     if (num_trainers == 1 && nccl_id == nullptr) {
       std::lock_guard<std::mutex> guard(NCCLGroupGuard::NCCLMutex());
-      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclCommInitAll(
+      PADDLE_RETRY_CUDA_SUCCESS(platform::dynload::ncclCommInitAll(
           comms.get(), static_cast<int>(order_.size()), order_.data()));
     } else {
       PADDLE_ENFORCE_NOT_NULL(nccl_id, platform::errors::InvalidArgument(
......@@ -132,8 +132,8 @@ struct NCCLContextMap {
       }
       VLOG(1) << "init nccl rank:" << rank << ", nranks:" << nranks
               << ", gpu_id:" << gpu_id << ", dev_id:" << order_[i];
-      PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(gpu_id));
-      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclCommInitRank(
+      PADDLE_RETRY_CUDA_SUCCESS(cudaSetDevice(gpu_id));
+      PADDLE_RETRY_CUDA_SUCCESS(platform::dynload::ncclCommInitRank(
           comms.get() + i, nranks, *nccl_id, rank));
     }
   }
......
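One macro covers the cuDNN, cuSOLVER, CUDA runtime, and NCCL call sites in this PR because each status type is registered together with its success value (for example `DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess)` in the hunk above), and the macro looks that value up via `decltype` on the call result. Below is a simplified sketch of that trait idea; it is not Paddle's actual `CudaStatusType` machinery, and the `StatusTrait` name is invented for illustration.

```cpp
// Sketch of a status-type trait: map each status enum to its success value
// so one generic retry/enforce macro can compare against it via decltype.
#include <cuda_runtime.h>
#include <nccl.h>

template <typename Status>
struct StatusTrait;  // primary template intentionally left undefined

template <>
struct StatusTrait<cudaError_t> {
  static constexpr cudaError_t kSuccess = cudaSuccess;
};

template <>
struct StatusTrait<ncclResult_t> {
  static constexpr ncclResult_t kSuccess = ncclSuccess;
};
```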
......@@ -36,7 +36,7 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase):
         opt = fluid.optimizer.SGD(learning_rate=0.001)
         opt.minimize(loss)
-        batch_size = 16
+        batch_size = 32
         image = np.random.normal(size=(batch_size, 784)).astype('float32')
         label = np.random.randint(0, 10, (batch_size, 1), dtype="int64")
......