Unverified commit 87197f8c authored by liuyuhui, committed by GitHub

[kunlun]fix sync in multi kunlun xpu dygraph training. (#30943)

Parent 99bf6228
......@@ -626,6 +626,18 @@ void Reducer::MarkGroupReady(size_t group_index) {
// group.dense_tensors ---> group.dense_contents_
group.ConcatTensors(*parallel_ctx_->GetDeviceContext(run_order));
// NOTE(liuyuhui): ConcatTensors uses the communication stream, but BKCL only
// supports the default stream for communication, so synchronization is not
// guaranteed here and a WaitComm is required.
// TODO(liuyuhui): Once BKCL supports events, this should be changed to
// non-blocking communication.
#ifdef PADDLE_WITH_XPU_BKCL
if (platform::is_xpu_place(group.dense_tensors_[0].place())) {
parallel_ctx_->WaitComm(run_order);
}
#endif
// Start allreduce
parallel_ctx_->AllReduceByStream(
group.dense_contents_, &(group.dense_contents_), run_order, false);
......
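In effect, the added WaitComm blocks until the concat issued on the communication stream has finished, so the subsequent BKCL allreduce reads a fully written buffer. The snippet below is a minimal, hypothetical sketch of the multi-card dygraph setup (Paddle 2.x style API, not code from this commit) whose backward pass drives the Reducer path patched above.

```python
# Minimal sketch, assuming a Paddle build with Kunlun XPU support and the
# Paddle 2.x dygraph API; names here are illustrative, not part of this PR.
import paddle
import paddle.nn as nn

def main():
    paddle.set_device("xpu")                    # run on a Kunlun XPU card
    paddle.distributed.init_parallel_env()      # sets up the BKCL communicator
    model = paddle.DataParallel(nn.Linear(8, 2))  # gradients flow through the Reducer
    opt = paddle.optimizer.SGD(learning_rate=0.01,
                               parameters=model.parameters())
    x = paddle.randn([4, 8])
    loss = model(x).mean()
    loss.backward()        # MarkGroupReady -> WaitComm -> AllReduceByStream
    opt.step()
    opt.clear_grad()

if __name__ == "__main__":
    main()
```

Such a script would typically be launched across several XPU cards via paddle.distributed.launch; the tightened `delta` in the tests below is consistent with the multi-card result now tracking the single-card baseline much more closely.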
......@@ -55,7 +55,7 @@ class TestParallelDygraphMnistXPU(TestDistBase):
if fluid.core.is_compiled_with_xpu():
self.check_with_place(
"parallel_dygraph_mnist.py",
-                delta=1e-1,
+                delta=1e-4,
check_error_log=True,
log_name=flag_name)
......@@ -94,7 +94,7 @@ class TestFleetDygraphMnistXPU(TestDistBase):
if fluid.core.is_compiled_with_xpu():
self.check_with_place(
"parallel_dygraph_mnist.py",
-                delta=1e-1,
+                delta=1e-4,
check_error_log=True,
log_name=flag_name)
......