Optimize sparse allreduce using ncclAllGather (#29819)

f65f1caa · ShenLiang · GitHub · 82aa01c3 · f65f1caa
隐藏空白更改
内联并排

Showing 1 changed file with 15 additions and 0 deletions

paddle/fluid/imperative/all_reduce.cc paddle/fluid/imperative/all_reduce.cc +15 -0

未找到文件。
--- a/paddle/fluid/imperative/all_reduce.cc
+++ b/paddle/fluid/imperative/all_reduce.cc
@@ -119,6 +119,21 @@ static void AllReduce(const framework::SelectedRows &src,
  if (!use_calc_stream) {
    dev_ctx->Wait();
  }
+  if (std::all_of(cpu_rows_num_ptr, cpu_rows_num_ptr + strategy.nranks_,
+                  [&](int64_t row) { return row == cpu_rows_num_ptr[0]; })) {
+    // In sparse communication, when every rank has the same number of rows,
+    // allgather replaces the per-rank broadcasts to speed up the allreduce.
+    auto row_sendcount = cpu_rows_num_ptr[0];
+    VLOG(3) << "allgather replaces broadcast to speed up in sparse allreduce";
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather(
+        src_rows_ptr, dst_rows_ptr, row_sendcount, ncclInt64, comm->comm(),
+        stream));
+    auto value_sendcount = cpu_rows_num_ptr[0] * feature_size;
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather(
+        src_tensor_ptr, dst_tensor_ptr, value_sendcount, nccl_dtype,
+        comm->comm(), stream));
+    return;
+  }
  for (int i = 0; i < strategy.nranks_; ++i) {
    if (cpu_rows_num_ptr[i] > 0) {
      // 2. Broadcast the rows of SelectedRows