From f65f1caad312c271b44f93620e954d780cf24fb8 Mon Sep 17 00:00:00 2001
From: ShenLiang
Date: Tue, 22 Dec 2020 14:41:58 +0800
Subject: [PATCH] opt sparse allreduce using ncclgather (#29819)

---
 paddle/fluid/imperative/all_reduce.cc | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/paddle/fluid/imperative/all_reduce.cc b/paddle/fluid/imperative/all_reduce.cc
index 8cebb35d4ed..57b620ff4b5 100644
--- a/paddle/fluid/imperative/all_reduce.cc
+++ b/paddle/fluid/imperative/all_reduce.cc
@@ -119,6 +119,21 @@ static void AllReduce(const framework::SelectedRows &src,
   if (!use_calc_stream) {
     dev_ctx->Wait();
   }
+  if (std::all_of(cpu_rows_num_ptr, cpu_rows_num_ptr + strategy.nranks_,
+                  [&](int64_t row) { return row == cpu_rows_num_ptr[0]; })) {
+    // During sparse communication, the number of rows on each card is the
+    // same, so allgather is used in place of broadcast to speed up allreduce.
+    auto row_sendcount = cpu_rows_num_ptr[0];
+    VLOG(3) << "allgather replaces broadcast to speed up in sparse allreduce";
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather(
+        src_rows_ptr, dst_rows_ptr, row_sendcount, ncclInt64, comm->comm(),
+        stream));
+    auto value_sendcount = cpu_rows_num_ptr[0] * feature_size;
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather(
+        src_tensor_ptr, dst_tensor_ptr, value_sendcount, nccl_dtype,
+        comm->comm(), stream));
+    return;
+  }
   for (int i = 0; i < strategy.nranks_; ++i) {
     if (cpu_rows_num_ptr[i] > 0) {
       // 2. Broadcast the rows of SelectedRows
--
GitLab
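
Note: for readers unfamiliar with the NCCL call pattern, the fast path added
by this patch reduces to two ncclAllGather calls: one gathering every rank's
row indices, one gathering the corresponding values. The sketch below is
illustrative only, not PaddlePaddle code: the function name, buffer names,
and the rows_per_rank/feature_size parameters are hypothetical, float values
stand in for the patch's nccl_dtype dispatch, and the ncclResult_t return
values are ignored where the patch wraps them in PADDLE_ENFORCE_CUDA_SUCCESS.

  // Minimal sketch of the allgather fast path, assuming float values.
  #include <cstdint>
  #include <cuda_runtime.h>
  #include <nccl.h>

  void SparseAllGather(const int64_t *src_rows, int64_t *dst_rows,
                       const float *src_vals, float *dst_vals,
                       int64_t rows_per_rank, int64_t feature_size,
                       ncclComm_t comm, cudaStream_t stream) {
    // Precondition of the fast path: every rank contributes exactly
    // rows_per_rank rows. The destination buffers must therefore hold
    // nranks * rows_per_rank entries, concatenated in rank order.
    ncclAllGather(src_rows, dst_rows, rows_per_rank, ncclInt64, comm, stream);
    // The values travel the same way: rows_per_rank * feature_size
    // elements per rank, gathered in one collective call.
    ncclAllGather(src_vals, dst_vals, rows_per_rank * feature_size, ncclFloat,
                  comm, stream);
  }

The loop kept below the new early return issues one broadcast per rank; when
all row counts match, a single allgather moves the same data with far fewer
collective launches, which is the speedup the commit title refers to.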