From 6b4b9fea0e0e5fd8f2428cb8289f03d42460dad8 Mon Sep 17 00:00:00 2001
From: Baibaifan <39549453+Baibaifan@users.noreply.github.com>
Date: Mon, 16 Aug 2021 19:50:23 +0800
Subject: [PATCH] hccl init sync (#34918)

---
 paddle/fluid/operators/collective/c_comm_init_hccl_op.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc b/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc
index 3df05955259..7dec645b5b3 100644
--- a/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc
+++ b/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc
@@ -87,6 +87,8 @@ class CCommInitOpAscend : public framework::OperatorBase {
     }
     PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast(
         buff, size, HCCL_DATA_TYPE_FP32, 0, comm->comm(), stream));
+    // Synchronize stream to find hccl error in time.
+    PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeStream(stream));
     VLOG(3) << "Build connection successful.";
 #else
     PADDLE_THROW(platform::errors::PreconditionNotMet(
-- 
GitLab