From 078e8c7893c48d22b3db5d687602ee8824c1c4cc Mon Sep 17 00:00:00 2001
From: Haohongxiang <86215757+haohongxiang@users.noreply.github.com>
Date: Sun, 9 Oct 2022 11:07:48 +0800
Subject: [PATCH] [Dygraph] Fix Perf of FusedFeedForward and FusedAttention
 with AllReduce (#46780)

---
 paddle/fluid/operators/fused/fused_attention_op.cu   | 6 ++++--
 paddle/fluid/operators/fused/fused_feedforward_op.cu | 6 ++++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/operators/fused/fused_attention_op.cu b/paddle/fluid/operators/fused/fused_attention_op.cu
index 62ea3f723dc..d03b76adef3 100644
--- a/paddle/fluid/operators/fused/fused_attention_op.cu
+++ b/paddle/fluid/operators/fused/fused_attention_op.cu
@@ -30,7 +30,7 @@ limitations under the License. */
 #include "paddle/phi/kernels/funcs/math_function.h"
 
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-#include "paddle/fluid/distributed/collective/ProcessGroup.h"
+#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/device/gpu/nccl_helper.h"
 #endif
@@ -50,13 +50,15 @@ static void AllReduce(phi::DenseTensor &tensor, // NOLINT
 
   if (map->has(ring_id)) {
     paddle::distributed::ProcessGroup *pg = map->get(ring_id);
+    auto pg_nccl = static_cast<paddle::distributed::ProcessGroupNCCL *>(pg);
+
     std::vector<phi::DenseTensor> in_tensor;
     std::vector<phi::DenseTensor> out_tensor;
     in_tensor.push_back(tensor);
     out_tensor.push_back(tensor);
     paddle::distributed::AllreduceOptions opts;
     opts.reduce_op = distributed::ReduceOp::SUM;
-    auto task = pg->AllReduce(in_tensor, out_tensor, opts);
+    auto task = pg_nccl->AllReduce(in_tensor, out_tensor, opts, true, true);
     task->Wait();
   } else {
     auto dtype = platform::ToNCCLDataType(
diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cu b/paddle/fluid/operators/fused/fused_feedforward_op.cu
index 6084b1f61f8..90af129296c 100644
--- a/paddle/fluid/operators/fused/fused_feedforward_op.cu
+++ b/paddle/fluid/operators/fused/fused_feedforward_op.cu
@@ -23,7 +23,7 @@ limitations under the License. */
 #include "paddle/phi/kernels/funcs/elementwise_functor.h"
 
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-#include "paddle/fluid/distributed/collective/ProcessGroup.h"
+#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/device/gpu/nccl_helper.h"
 #endif
@@ -43,13 +43,15 @@ static void AllReduce(phi::DenseTensor& tensor, // NOLINT
 
   if (map->has(ring_id)) {
     paddle::distributed::ProcessGroup* pg = map->get(ring_id);
+    auto pg_nccl = static_cast<paddle::distributed::ProcessGroupNCCL*>(pg);
+
     std::vector<phi::DenseTensor> in_tensor;
     std::vector<phi::DenseTensor> out_tensor;
     in_tensor.push_back(tensor);
     out_tensor.push_back(tensor);
     paddle::distributed::AllreduceOptions opts;
     opts.reduce_op = distributed::ReduceOp::SUM;
-    auto task = pg->AllReduce(in_tensor, out_tensor, opts);
+    auto task = pg_nccl->AllReduce(in_tensor, out_tensor, opts, true, true);
     task->Wait();
   } else {
     auto dtype = platform::ToNCCLDataType(
--
GitLab
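
Both hunks make the same change: the generic ProcessGroup::AllReduce call is replaced by the ProcessGroupNCCL overload that takes two extra boolean arguments. Below is a minimal sketch of the new call pattern, assuming (the parameter names are not visible in the hunks above) that the two trailing bools are sync_op and use_calc_stream, i.e. the allreduce is issued synchronously on the calculation stream rather than waiting on a separate communication stream.

    // Illustrative sketch of the call pattern introduced by this patch; it reuses
    // the surrounding AllReduce() helper context (map, ring_id, tensor) and is not
    // a standalone program.
    // Assumption: the two trailing bools are sync_op / use_calc_stream.
    paddle::distributed::ProcessGroup *pg = map->get(ring_id);
    auto pg_nccl = static_cast<paddle::distributed::ProcessGroupNCCL *>(pg);

    std::vector<phi::DenseTensor> in_tensor{tensor};
    std::vector<phi::DenseTensor> out_tensor{tensor};
    paddle::distributed::AllreduceOptions opts;
    opts.reduce_op = paddle::distributed::ReduceOp::SUM;

    // Before: auto task = pg->AllReduce(in_tensor, out_tensor, opts);
    auto task = pg_nccl->AllReduce(in_tensor, out_tensor, opts,
                                   /*sync_op=*/true,
                                   /*use_calc_stream=*/true);
    task->Wait();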