From 52a7b0c4e893d1444ac6067ea7f6b0e31f484cb5 Mon Sep 17 00:00:00 2001
From: Leo Chen
Date: Wed, 18 Aug 2021 10:54:49 +0800
Subject: [PATCH] [NPU] add retry on HcclGetRootInfo to fix "bind fail" (#34977)

* add retry for HcclGetRootInfo

* refine code

* reduce retry interval
---
 .../operators/collective/c_gen_hccl_id_op.cc | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc
index 9ab7d90efaa..63a783720e0 100644
--- a/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc
+++ b/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc
@@ -32,9 +32,24 @@ namespace operators {
 #ifdef PADDLE_WITH_ASCEND_CL
 
 static void GenHCCLID(std::vector<HcclRootInfo>* hccl_ids) {
+  constexpr int timeout = 2 * 60 + 10;  // 2MSL+10s
+  constexpr int retry_time = 1;
   for (size_t i = 0; i < hccl_ids->size(); ++i) {
-    PADDLE_ENFORCE_NPU_SUCCESS(
-        platform::dynload::HcclGetRootInfo(&(*hccl_ids)[i]));
+    bool failed = true;
+    for (auto retry_times = 0; retry_times * retry_time < timeout;
+         ++retry_times) {
+      auto err = platform::dynload::HcclGetRootInfo(&(*hccl_ids)[i]);
+      if (err == 0) {
+        failed = false;
+        break;
+      }
+      std::this_thread::sleep_for(std::chrono::seconds(retry_time));
+      LOG(WARNING) << "HcclGetRootInfo failed, err is: " << err << ", retry "
+                   << retry_times << " times";
+    }
+    if (failed) {
+      PADDLE_THROW(platform::errors::External("HcclGetRootInfo failed!"));
+    }
   }
 }
 
-- 
GitLab