BaiXuePrincess / Paddle (fork of PaddlePaddle / Paddle)
Commit ae867a84 (unverified)
Authored June 22, 2022 by Haohongxiang; committed via GitHub on June 22, 2022.
[Dygraph] Fix bugs of supporting ProcessGroupNCCL on DCU (#43682)
* fix bugs
* update
* update
* update
* code style
* code style check
Parent commit: 292b7254
Showing 3 changed files with 523 additions and 248 deletions.
paddle/fluid/pybind/CMakeLists.txt       +1    −1
paddle/fluid/pybind/distributed_py.cc    +121  −55
paddle/fluid/pybind/eager_method.cc      +401  −192
paddle/fluid/pybind/CMakeLists.txt

@@ -129,7 +129,7 @@ endif()
if(NOT ON_INFER)
  set(PYBIND_DEPS ${PYBIND_DEPS} processgroup eager_reducer)
-  if(WITH_NCCL)
+  if(WITH_NCCL OR WITH_RCCL)
    set(PYBIND_DEPS ${PYBIND_DEPS} processgroup_nccl)
    if(WITH_PSCORE)
      set(PYBIND_DEPS ${PYBIND_DEPS} processgroup_heter)
paddle/fluid/pybind/distributed_py.cc

@@ -31,7 +31,7 @@ limitations under the License. */
#include "paddle/fluid/pybind/eager_utils.h"
#include "paddle/phi/api/all.h"

-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"
#endif
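The functional change in this hunk is the widened guard: presumably a ROCm/DCU build defines PADDLE_WITH_RCCL rather than PADDLE_WITH_NCCL, so an NCCL-only guard would compile the ProcessGroupNCCL include (and the bindings below) out of the module. A minimal sketch of that guard pattern, not Paddle source; the helper macro and program are hypothetical:

// Sketch: why an NCCL-only guard breaks DCU builds.
// Compile with -DPADDLE_WITH_RCCL (or -DPADDLE_WITH_NCCL) to flip the flag.
#include <cstdio>

#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#define HAS_NCCL_STYLE_PROCESS_GROUP 1  // hypothetical flag, for illustration only
#else
#define HAS_NCCL_STYLE_PROCESS_GROUP 0
#endif

int main() {
  std::printf("ProcessGroupNCCL bindings enabled: %d\n",
              HAS_NCCL_STYLE_PROCESS_GROUP);
  return 0;
}

With neither macro defined the sketch prints 0, mirroring how the old guard silently dropped the NCCL-style process group on DCU.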
@@ -61,11 +61,15 @@ std::shared_ptr<distributed::EagerReducer> CreateEagerReducer(
    const std::vector<std::vector<size_t>> &group_indices,
    const std::vector<bool> &is_sparse_gradient,
    std::shared_ptr<distributed::ProcessGroup> process_group,
    const std::vector<size_t> &group_size_limits,
    bool find_unused_parameters) {
  auto params = CastPyArg2VectorOfTensor(py_tensors.ptr(), 0);
  return std::make_shared<distributed::EagerReducer>(
      params, group_indices, is_sparse_gradient, process_group,
      group_size_limits, find_unused_parameters);
}

#if defined(PADDLE_WITH_GLOO)
@@ -111,7 +115,8 @@ void BindDistributed(py::module *m) {
      .def("name", &distributed::ProcessGroup::GetBackendName)
      .def("allreduce",
           [](distributed::ProcessGroup &self, py::handle py_tensor,
              distributed::ReduceOp op) {
             auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0);
             distributed::AllreduceOptions opts;
@@ -121,12 +126,14 @@ void BindDistributed(py::module *m) {
             std::vector<phi::DenseTensor> tensors = {*dense};
             return self.AllReduce(tensors, tensors, opts);
           },
           py::arg("tensor"), py::arg("op") = distributed::ReduceOp::SUM,
           py::call_guard<py::gil_scoped_release>())

      .def("broadcast",
           [](distributed::ProcessGroup &self, py::handle py_tensor,
              int source_rank) {
             auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0);
             distributed::BroadcastOptions opts;
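The collective bindings in this file all follow one pybind11 pattern: a lambda receives the Python tensor as a py::handle, casts it, fills an options struct, and forwards to the ProcessGroup method; the binding is registered with named arguments, defaults (e.g. op = ReduceOp::SUM), and py::call_guard<py::gil_scoped_release>() so the GIL is released around the collective call. A minimal self-contained sketch of that registration pattern; DemoGroup and demo_pg are illustrative stand-ins, not Paddle APIs:

// Sketch of the pybind11 registration pattern used by the bindings above.
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>

#include <vector>

namespace py = pybind11;

struct DemoGroup {
  // Stand-in for a collective call such as AllReduce.
  double allreduce_sum(const std::vector<double> &vals) const {
    double sum = 0.0;
    for (double v : vals) sum += v;  // long-running work runs without the GIL
    return sum;
  }
};

PYBIND11_MODULE(demo_pg, m) {
  py::class_<DemoGroup>(m, "DemoGroup")
      .def(py::init<>())
      .def("allreduce_sum",
           &DemoGroup::allreduce_sum,
           py::arg("vals") = std::vector<double>{},    // keyword arg with default
           py::call_guard<py::gil_scoped_release>());  // drop the GIL during the call
}

From Python the method would then be callable as DemoGroup().allreduce_sum(vals=[1.0, 2.0]), with the GIL released while the C++ body runs.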
@@ -136,7 +143,8 @@ void BindDistributed(py::module *m) {
             std::vector<phi::DenseTensor> tensors = {*dense};
             return self.Broadcast(tensors, tensors, opts);
           },
           py::arg("tensor"), py::arg("source_rank"),
           py::call_guard<py::gil_scoped_release>())

      .def(
@@ -151,7 +159,8 @@ void BindDistributed(py::module *m) {
      .def("send",
           [](distributed::ProcessGroup &self, py::handle py_tensor,
              int dst) {
             auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0);
             auto dense =
@@ -159,12 +168,14 @@ void BindDistributed(py::module *m) {
             std::vector<phi::DenseTensor> tensors = {*dense};
             return self.Send(tensors, dst);
           },
           py::arg("tensor"), py::arg("dst"),
           py::call_guard<py::gil_scoped_release>())

      .def("recv",
           [](distributed::ProcessGroup &self, py::handle py_tensor,
              int src) {
             auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0);
             auto dense =
@@ -172,12 +183,14 @@ void BindDistributed(py::module *m) {
             std::vector<phi::DenseTensor> tensors = {*dense};
             return self.Recv(tensors, src);
           },
           py::arg("tensor"), py::arg("src"),
           py::call_guard<py::gil_scoped_release>())

      .def("all_gather",
           [](distributed::ProcessGroup &self, py::handle py_in_tensor,
              py::handle py_out_tensor) {
             auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0);
             auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0);
@@ -189,12 +202,14 @@ void BindDistributed(py::module *m) {
             std::vector<phi::DenseTensor> out_tensors = {*out_dense};
             return self.AllGather(in_tensors, out_tensors);
           },
           py::arg("in"), py::arg("out"),
           py::call_guard<py::gil_scoped_release>())

      .def("alltoall",
           [](distributed::ProcessGroup &self, py::handle py_in_tensor,
              py::handle py_out_tensor) {
             auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0);
             auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0);
@@ -206,13 +221,16 @@ void BindDistributed(py::module *m) {
             std::vector<phi::DenseTensor> out_tensors = {*out_dense};
             return self.AllToAll(in_tensors, out_tensors);
           },
           py::arg("in"), py::arg("out"),
           py::call_guard<py::gil_scoped_release>())

      .def("reduce",
           [](distributed::ProcessGroup &self, py::handle py_in_tensor,
              int dst, distributed::ReduceOp op) {
             auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0);
             distributed::ReduceOptions opts;
             opts.reduce_op = op;
@@ -222,14 +240,17 @@ void BindDistributed(py::module *m) {
             std::vector<phi::DenseTensor> tensors = {*dense};
             return self.Reduce(tensors, tensors, opts);
           },
           py::arg("tensor"), py::arg("dst"),
           py::arg("op") = distributed::ReduceOp::SUM,
           py::call_guard<py::gil_scoped_release>())

      .def("scatter",
           [](distributed::ProcessGroup &self, py::handle py_in_tensor,
              py::handle py_out_tensor, int src) {
             auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0);
             auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0);
             distributed::ScatterOptions opts;
@@ -242,17 +263,25 @@ void BindDistributed(py::module *m) {
             std::vector<phi::DenseTensor> out_tensors = {*out_dense};
             return self.Scatter(in_tensors, out_tensors, opts);
           },
           py::arg("in"), py::arg("out"), py::arg("src"),
           py::call_guard<py::gil_scoped_release>());

-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)
  py::class_<distributed::ProcessGroupNCCL,
             std::shared_ptr<distributed::ProcessGroupNCCL>>(
      *m, "ProcessGroupNCCL", ProcessGroup)
      .def(py::init<const std::shared_ptr<distributed::Store> &, int, int,
                    const platform::CUDAPlace &, int>(),
           py::arg("store"), py::arg("rank"), py::arg("world_size"),
           py::arg("place"), py::arg("group_id") = 0,
           py::call_guard<py::gil_scoped_release>());
#endif
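The ProcessGroupNCCL registration above uses py::init<...> with a shared_ptr holder, keyword names for every constructor argument, and a defaulted trailing group_id. A minimal sketch of that constructor-binding shape under the same conventions; DemoStore, DemoProcessGroup, and demo_bindings are illustrative, not Paddle types:

// Sketch of the py::init constructor-binding pattern used above.
#include <pybind11/pybind11.h>

#include <memory>
#include <string>
#include <utility>

namespace py = pybind11;

struct DemoStore {
  explicit DemoStore(std::string host) : host(std::move(host)) {}
  std::string host;
};

struct DemoProcessGroup {
  DemoProcessGroup(const std::shared_ptr<DemoStore> &store, int rank,
                   int world_size, int group_id)
      : store(store), rank(rank), world_size(world_size), group_id(group_id) {}

  std::shared_ptr<DemoStore> store;
  int rank, world_size, group_id;
};

PYBIND11_MODULE(demo_bindings, m) {
  py::class_<DemoStore, std::shared_ptr<DemoStore>>(m, "DemoStore")
      .def(py::init<std::string>(), py::arg("host"));

  // Same shape as the ProcessGroupNCCL binding: list the constructor signature
  // in py::init<...>, name each argument, and default the trailing group id.
  py::class_<DemoProcessGroup, std::shared_ptr<DemoProcessGroup>>(
      m, "DemoProcessGroup")
      .def(py::init<const std::shared_ptr<DemoStore> &, int, int, int>(),
           py::arg("store"), py::arg("rank"), py::arg("world_size"),
           py::arg("group_id") = 0,
           py::call_guard<py::gil_scoped_release>());
}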
@@ -261,29 +290,53 @@ void BindDistributed(py::module *m) {
  py::class_<distributed::ProcessGroupHeter,
             std::shared_ptr<distributed::ProcessGroupHeter>>(
      *m, "ProcessGroupHeter", ProcessGroup)
      .def(py::init<const std::shared_ptr<distributed::Store> &, int, int,
#if defined(PADDLE_WITH_ASCEND_CL)
                    const platform::NPUPlace &,
#else
                    const platform::CUDAPlace &,
#endif
                    int, int, int, int, int, bool, std::string, int, int>(),
           py::arg("store"), py::arg("rank"), py::arg("world_size"),
           py::arg("place"), py::arg("gid") = 0, py::arg("local_rank") = 0,
           py::arg("local_size") = 1, py::arg("gloo_rank") = 0,
           py::arg("gloo_size") = 1, py::arg("with_switch") = false,
           py::arg("switch_endpoint") = "", py::arg("src_rank") = "",
           py::arg("dst_rank") = "",
           py::call_guard<py::gil_scoped_release>());
#endif

#if defined(PADDLE_WITH_ASCEND_CL)
  py::class_<distributed::ProcessGroupHCCL,
             std::shared_ptr<distributed::ProcessGroupHCCL>>(
      *m, "ProcessGroupHCCL", ProcessGroup)
      .def(py::init<const std::shared_ptr<distributed::Store> &, int, int,
                    const platform::NPUPlace &, int>(),
           py::arg("store"), py::arg("rank"), py::arg("world_size"),
           py::arg("place"), py::arg("group_id") = 0,
           py::call_guard<py::gil_scoped_release>());
#endif
@@ -291,22 +344,29 @@ void BindDistributed(py::module *m) {
  py::class_<distributed::ProcessGroup::Task,
             std::shared_ptr<distributed::ProcessGroup::Task>>(*m, "task")
      .def("is_completed", &distributed::ProcessGroup::Task::IsCompleted)
      .def("wait", &distributed::ProcessGroup::Task::Wait,
           py::arg("timeout") = kWaitTimeout,
           py::call_guard<py::gil_scoped_release>())
      .def("synchronize", &distributed::ProcessGroup::Task::Synchronize,
           py::call_guard<py::gil_scoped_release>());

#if defined(PADDLE_WITH_GLOO)
  py::class_<ProcessGroupGloo, std::shared_ptr<ProcessGroupGloo>>(
      *m, "ProcessGroupGloo", ProcessGroup)
      .def(py::init<const std::shared_ptr<paddle::distributed::Store> &, int,
                    int, const platform::CPUPlace &, int,
                    std::shared_ptr<GlooOptions> &>(),
           py::call_guard<py::gil_scoped_release>())
      .def(py::init([](const std::shared_ptr<paddle::distributed::Store> &store,
                       int rank, int world_size,
                       const platform::CPUPlace &place, int gid) {
             auto opts = GlooOptions::create();
             char *ifname = getenv(GLOO_SOCKET_IFNAME_ENV.c_str());
             if (ifname && strlen(ifname) > 1) {
@@ -315,11 +375,14 @@ void BindDistributed(py::module *m) {
             } else {
               opts->device = ProcessGroupGloo::createDefaultDevice();
             }
             return std::make_shared<ProcessGroupGloo>(
                 store, rank, world_size, place, gid, opts);
           }),
           py::arg("store"), py::arg("rank"), py::arg("world_size"),
           py::arg("place"), py::arg("group_id") = 0,
           py::call_guard<py::gil_scoped_release>())
      .def_static("create_default_device",
                  &ProcessGroupGloo::createDefaultDevice);
@@ -327,21 +390,23 @@ void BindDistributed(py::module *m) {
  m->def("eager_assign_group_by_size",
         [](py::handle py_tensors, std::vector<bool> is_sparse_gradient,
            std::vector<size_t> group_size_limits,
            std::vector<int64_t> tensor_indices) {
           auto tensors = CastPyArg2VectorOfTensor(py_tensors.ptr(), 0);
           return distributed::Eager_AssignGroupBySize(
               tensors, is_sparse_gradient, group_size_limits, tensor_indices);
         },
         py::arg("tensors"), py::arg("is_sparse_gradient"),
         py::arg("group_size_limits") = std::vector<size_t>{25 * 1024 * 1024},
         py::arg("tensor_indices") = std::vector<int64_t>{},
         py::call_guard<py::gil_scoped_release>());

  py::class_<distributed::EagerReducer,
             std::shared_ptr<distributed::EagerReducer>>(
      *m, "EagerReducer", R"DOC()DOC")
      .def(py::init(&CreateEagerReducer))
      .def("prepare_for_backward",
@@ -349,7 +414,8 @@ void BindDistributed(py::module *m) {
             auto params = CastPyArg2VectorOfTensor(py_tensors.ptr(), 0);
             self.PrepareForBackward(params);
           },
           py::arg("tensors"),
           py::call_guard<py::gil_scoped_release>());
}

}  // end namespace pybind
paddle/fluid/pybind/eager_method.cc

This diff is collapsed (+401 −192; see the summary above).