Unverified commit aafa9820 authored by james, committed by GitHub

correct sync behavior for XPU distributed training (#47882)

* correct sync behavior for XPU distributed training

XPU supports an event mechanism similar to CUDA events, so it would be preferable to
use an event to synchronize the compute/comm streams for performance. However, this
mechanism has never been fully tested, and inconsistent loss values / ending epochs
have been reported. Therefore, this PR replaces event-based sync with stream waiting
as a temporary solution (a sketch of both approaches follows the change list below).

* remove compile warning
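
For context, here is a minimal sketch (not part of this commit) contrasting the two synchronization strategies the message describes. It uses only the runtime calls that appear in the diff below; the header path, the XPUStream/XPUEvent handle names, and the helper functions are illustrative assumptions rather than Paddle's actual declarations.

```cpp
// Minimal sketch, assuming an XPU runtime header that provides
// xpu_event_record / xpu_stream_wait_event / xpu_wait and stream/event
// handle types; names mirror the calls shown in this diff.
#include "xpu/runtime.h"

// Event-based sync (the approach being backed out): record an event on the
// producer (compute) stream and let the consumer (comm) stream wait on it
// on-device, so the host never blocks.
void SyncWithEvent(XPUStream compute_stream, XPUStream comm_stream, XPUEvent event) {
  xpu_event_record(event, compute_stream);    // mark the current position in the compute stream
  xpu_stream_wait_event(comm_stream, event);  // comm stream waits for that position on-device
}

// Stream-wait sync (the temporary workaround this PR switches to): block the
// host until the compute stream has drained. Always correct, but it serializes
// the CPU with the device and gives up compute/comm overlap.
void SyncWithStreamWait(XPUStream compute_stream) {
  xpu_wait(compute_stream);  // host blocks until all queued work finishes
}
```

The event route keeps compute and communication overlapped, which is why the commit message calls it preferable for performance; the stream-wait route trades that overlap for correctness until the event path is validated.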
Parent 1fb4d90b
......@@ -77,23 +77,11 @@ class XPUEventManager {
device_index_));
platform::XPUDeviceGuard guard(device_index_);
PADDLE_ENFORCE_XPU_SUCCESS(xpu_event_record(event_, ctx.stream()));
// TODO(zhangxiaoci) temporary solution: xpu::event seems buggy
PADDLE_ENFORCE_XPU_SUCCESS(xpu_wait(ctx.stream()));
}
void Block(const XPUContext& ctx) const {
if (is_created_) {
auto device_index = ctx.GetPlace().device;
PADDLE_ENFORCE_EQ(device_index,
device_index_,
platform::errors::PreconditionNotMet(
"XPUContext's device %d does not match"
"Event's device %d",
device_index,
device_index_));
platform::XPUDeviceGuard guard(device_index_);
PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_wait_event(ctx.stream(), event_));
}
}
void Block(const XPUContext& ctx) const {}
private:
bool is_created_{false};
......
......@@ -57,8 +57,14 @@ bool ProcessGroupBKCL::BKCLTask::Wait(std::chrono::milliseconds timeout) {
if (barrier_) {
// If we use the work to do barrier, we should block cpu
// TODO(zhangxiaoci) There is no such function that can sync entire device
// for xpu (for now), so all we can do is sync whatever stream that we know
// and hope for the best. Note that for correctness the communication stream
// needs to be in sync mode.
platform::XPUDeviceGuard guard(place_.GetDeviceId());
xpu_wait();
calc_ctx->Wait();
}
return true;
}
......
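
The comment in the hunk above explains the barrier fallback: since the XPU runtime exposes no whole-device synchronize, the barrier blocks the host on every stream this rank knows about. A rough sketch of that pattern follows; the helper name and the calc_ctx parameter type are illustrative assumptions, not the PR's actual code.

```cpp
// Sketch of the best-effort "sync entire device" used on the barrier path.
void BestEffortDeviceSync(int device_id, phi::XPUContext* calc_ctx) {
  platform::XPUDeviceGuard guard(device_id);  // make sure we wait on the right card
  xpu_wait();        // drain the default stream of the current device
  calc_ctx->Wait();  // drain the computation stream owned by the device context
  // The BKCL communication stream is expected to run in sync mode, so it needs
  // no separate wait here (see the comment in the diff above).
}
```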
......@@ -64,7 +64,7 @@ struct XPUContext::Impl {
// manually destroy XPUStream here until xpu::api integrates this work
// into Context dtor
xpu_wait(context_->xpu_stream);
PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_destroy(context_->xpu_stream));
xpu_stream_destroy(context_->xpu_stream);
context_->xpu_stream = nullptr;
xpu::destroy_context(context_);
context_ = nullptr;
......
......@@ -13,8 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
namespace phi {
......@@ -67,14 +65,7 @@ class ConcatFunctor<XPUContext, T> {
reinterpret_cast<XPUType*>(output->data<T>()),
xdims_list,
axis);
PADDLE_ENFORCE_EQ(
r,
XPU_SUCCESS,
paddle::platform::errors::External(
"XPU API return wrong value[%d %s], please check whether "
"Baidu Kunlun Card is properly installed.",
r,
XPUAPIErrorMsg[r]));
PADDLE_ENFORCE_XDNN_SUCCESS(r, "concat");
}
};
......@@ -126,14 +117,7 @@ class SplitFunctor<XPUContext, T> {
xdims_list,
split_list,
axis);
PADDLE_ENFORCE_EQ(
r,
XPU_SUCCESS,
paddle::platform::errors::External(
"XPU API return wrong value[%d %s], please check whether "
"Baidu Kunlun Card is properly installed.",
r,
XPUAPIErrorMsg[r]));
PADDLE_ENFORCE_XDNN_SUCCESS(r, "split");
}
};
......