diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc
index 9f296cbd5e1dc5fad3a981c3346e57c78cdb5511..8f55645b880975d43bf94db1b5784a705e0c5f65 100644
--- a/paddle/fluid/imperative/reducer.cc
+++ b/paddle/fluid/imperative/reducer.cc
@@ -626,6 +626,18 @@ void Reducer::MarkGroupReady(size_t group_index) {
     // group.dense_tensors ---> group.dense_contents_
     group.ConcatTensors(*parallel_ctx_->GetDeviceContext(run_order));
 
+// NOTE(liuyuhui): ConcatTensors uses the communication stream, but BKCL
+// only supports the default stream for communication,
+// so there are synchronization problems and a WaitComm needs to be
+// added here.
+// TODO(liuyuhui): Once BKCL supports events, this should be changed to
+// non-blocking communication.
+#ifdef PADDLE_WITH_XPU_BKCL
+    if (platform::is_xpu_place(group.dense_tensors_[0].place())) {
+      parallel_ctx_->WaitComm(run_order);
+    }
+#endif
+
     // Start allreduce
     parallel_ctx_->AllReduceByStream(
         group.dense_contents_, &(group.dense_contents_), run_order, false);
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py
index faba479b32fdf722802f186463ed186b9eaacbbc..f21468f50c5f8f87c83cd6d9e20afa5993ecf064 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py
@@ -55,7 +55,7 @@ class TestParallelDygraphMnistXPU(TestDistBase):
         if fluid.core.is_compiled_with_xpu():
             self.check_with_place(
                 "parallel_dygraph_mnist.py",
-                delta=1e-1,
+                delta=1e-4,
                 check_error_log=True,
                 log_name=flag_name)
 
@@ -94,7 +94,7 @@ class TestFleetDygraphMnistXPU(TestDistBase):
         if fluid.core.is_compiled_with_xpu():
             self.check_with_place(
                 "parallel_dygraph_mnist.py",
-                delta=1e-1,
+                delta=1e-4,
                 check_error_log=True,
                 log_name=flag_name)
 