From d76ebd7853e3f092e313059360ba451851eea624 Mon Sep 17 00:00:00 2001
From: "yi.wu"
Date: Wed, 13 Jun 2018 11:23:56 +0800
Subject: [PATCH] fix nccl dist train bug

---
 paddle/fluid/operators/gen_nccl_id_op.cc | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/paddle/fluid/operators/gen_nccl_id_op.cc b/paddle/fluid/operators/gen_nccl_id_op.cc
index 111e58844c8..f824eee4e7d 100644
--- a/paddle/fluid/operators/gen_nccl_id_op.cc
+++ b/paddle/fluid/operators/gen_nccl_id_op.cc
@@ -67,6 +67,10 @@ class GenNCCLIdOp : public framework::OperatorBase {
       client->AsyncSendVar(ep, dev_ctx, *scope, NCCL_ID_VARNAME);
     }
     client->Wait();
+    for (auto& ep : endpoint_list) {
+      client->AsyncSendBatchBarrier(ep);
+    }
+    client->Wait();
     VLOG(3) << "sending completed...";
   }
 
-- 
GitLab