Unverified commit 87197f8c authored by liuyuhui, committed by GitHub

[kunlun]fix sync in multi kunlun xpu dygraph training. (#30943)

Parent 99bf6228
......@@ -626,6 +626,18 @@ void Reducer::MarkGroupReady(size_t group_index) {
// group.dense_tensors ---> group.dense_contents_
group.ConcatTensors(*parallel_ctx_->GetDeviceContext(run_order));
// NOTE(liuyuhui): ConcatTensors uses the communication stream, but BKCL only
// supports the default stream for communication, so synchronization is not
// guaranteed here and a WaitComm is required.
// TODO(liuyuhui): Once BKCL supports events, this should be changed to
// non-blocking communication.
#ifdef PADDLE_WITH_XPU_BKCL
if (platform::is_xpu_place(group.dense_tensors_[0].place())) {
parallel_ctx_->WaitComm(run_order);
}
#endif
// Start allreduce
parallel_ctx_->AllReduceByStream(
group.dense_contents_, &(group.dense_contents_), run_order, false);
......
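In effect, the added WaitComm blocks until the concat issued on the communication stream has finished, so the subsequent BKCL allreduce reads a fully written buffer. The snippet below is a minimal, hypothetical sketch of the multi-card dygraph setup (Paddle 2.x style API, not code from this commit) whose backward pass drives the Reducer path patched above.

```python
# Minimal sketch, assuming a Paddle build with Kunlun XPU support and the
# Paddle 2.x dygraph API; names here are illustrative, not part of this PR.
import paddle
import paddle.nn as nn

def main():
    paddle.set_device("xpu")                    # run on a Kunlun XPU card
    paddle.distributed.init_parallel_env()      # sets up the BKCL communicator
    model = paddle.DataParallel(nn.Linear(8, 2))  # gradients flow through the Reducer
    opt = paddle.optimizer.SGD(learning_rate=0.01,
                               parameters=model.parameters())
    x = paddle.randn([4, 8])
    loss = model(x).mean()
    loss.backward()        # MarkGroupReady -> WaitComm -> AllReduceByStream
    opt.step()
    opt.clear_grad()

if __name__ == "__main__":
    main()
```

Such a script would typically be launched across several XPU cards via paddle.distributed.launch; the tightened `delta` in the tests below is consistent with the multi-card result now tracking the single-card baseline much more closely.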
......@@ -55,7 +55,7 @@ class TestParallelDygraphMnistXPU(TestDistBase):
if fluid.core.is_compiled_with_xpu():
self.check_with_place(
"parallel_dygraph_mnist.py",
-                delta=1e-1,
+                delta=1e-4,
check_error_log=True,
log_name=flag_name)
......@@ -94,7 +94,7 @@ class TestFleetDygraphMnistXPU(TestDistBase):
if fluid.core.is_compiled_with_xpu():
self.check_with_place(
"parallel_dygraph_mnist.py",
-                delta=1e-1,
+                delta=1e-4,
check_error_log=True,
log_name=flag_name)
......