提交 d76ebd78 编写于 作者: Y yi.wu

Fix NCCL distributed training bug

上级 88fa9c2e
@@ -67,6 +67,10 @@ class GenNCCLIdOp : public framework::OperatorBase {
       client->AsyncSendVar(ep, dev_ctx, *scope, NCCL_ID_VARNAME);
     }
     client->Wait();
+    for (auto& ep : endpoint_list) {
+      client->AsyncSendBatchBarrier(ep);
+    }
+    client->Wait();
     VLOG(3) << "sending completed...";
   }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册