[Kunlun] bug fix of PR2: Support MultiDevicePass and BKCL in parallel executor (#29926)

3d1741b7 · liuyuhui · GitHub · 332da133 · 3d1741b7 · 3d1741b7
3 changed files
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h
+++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h
@@ -39,10 +39,13 @@ class Graph;
 namespace paddle {
 namespace platform {
+#if defined(PADDLE_WITH_NCCL)
 class NCCLContextMap;
 class NCCLCommunicator;
+#elif defined(PADDLE_WITH_XPU_BKCL)
 class BKCLContextMap;
 class BKCLCommunicator;
+#endif
 }
 namespace framework {

--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -968,9 +968,6 @@ void ParallelExecutor::BCastParamsToDevices(
      continue;
    }
    auto &dims = main_tensor.dims();
-    VLOG(1) << "bcast var=" << var;
    if (paddle::platform::is_gpu_place(main_tensor.place())) {
 #if defined(PADDLE_WITH_NCCL)
      std::vector<void *> buffers;
@@ -1013,6 +1010,11 @@ void ParallelExecutor::BCastParamsToDevices(
      std::vector<void *> buffers;
      buffers.reserve(member_->places_.size());
      size_t numel = main_tensor.numel();
+      // TODO(liuyuhui): BKCL only supports parameters of float type, so
+      // parameters of other types must be forcibly cast to float before
+      // broadcasting.
+      // However, broadcast is effectively a type-agnostic operation, so this
+      // does not affect correctness.
      BKCLDataType data_type = BKCL_FLOAT;
      // BKCLDataType data_type = platform::ToBKCLDataType(main_tensor.type());
      for (size_t i = 0; i < member_->places_.size(); ++i) {

--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -123,7 +123,7 @@ class XPUDeviceContext : public DeviceContext {
  void Wait() const override;
 #ifdef PADDLE_WITH_XPU_BKCL
-  /*! \brief  Return nccl context. */
+  /*! \brief  Return bkcl context. */
  BKCLContext_t bkcl_context() const { return bkcl_context_; }
  /*! \brief  Set bkcl context. */