From 8c0529fd8395b62126d485daf292e98da6e2f190 Mon Sep 17 00:00:00 2001
From: Haohongxiang <86215757+haohongxiang@users.noreply.github.com>
Date: Sat, 8 Oct 2022 17:01:03 +0800
Subject: [PATCH] [Dygraph] Fix performance of pp+mp by using
 send/recv_calc_stream instead of send/recv (#46116)

---
 .../distributed/collective/ProcessGroup.h     | 10 ++++++
 .../collective/ProcessGroupNCCL.cc            | 35 +++++++++++++++++++
 .../distributed/collective/ProcessGroupNCCL.h |  8 +++++
 .../collective/ProcessGroupStream.cc          | 25 +++++++++++++
 .../collective/ProcessGroupStream.h           | 15 ++++++++
 paddle/fluid/pybind/distributed_py.cc         | 31 ++++++++++++++++
 .../distributed/fleet/layers/mpu/mp_ops.py    | 21 ++++++++++-
 .../pp_utils/p2p_communication.py             | 23 +++++-------
 8 files changed, 153 insertions(+), 15 deletions(-)

diff --git a/paddle/fluid/distributed/collective/ProcessGroup.h b/paddle/fluid/distributed/collective/ProcessGroup.h
index 0ca900a0fa7..09d140c5416 100644
--- a/paddle/fluid/distributed/collective/ProcessGroup.h
+++ b/paddle/fluid/distributed/collective/ProcessGroup.h
@@ -214,6 +214,16 @@ class ProcessGroup {
         "ProcessGroup%s does not support AllGather_Partial", GetBackendName()));
   }
 
+  virtual std::shared_ptr<ProcessGroup::Task> AllGather_Partial(
+      std::vector<phi::DenseTensor>& in_tensors,   // NOLINT
+      std::vector<phi::DenseTensor>& out_tensors,  // NOLINT
+      int offset,
+      int length,
+      bool) {  // NOLINT
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "ProcessGroup%s does not support AllGather_Partial", GetBackendName()));
+  }
+
   virtual std::shared_ptr<ProcessGroup::Task> AllToAll(
       std::vector<phi::DenseTensor>&,    // NOLINT
       std::vector<phi::DenseTensor>&) {  // NOLINT
diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
index 12f60faf800..b406f596401 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
+++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
@@ -1034,6 +1034,41 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllGather_Partial(
       CommType::ALLGATHER);
 }
 
+std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllGather_Partial(
+    std::vector<phi::DenseTensor>& in_tensors,
+    std::vector<phi::DenseTensor>& out_tensors,
+    int offset,
+    int length,
+    bool sync_op,
+    bool use_calc_stream) {
+  PADDLE_ENFORCE_EQ(
+      CheckTensorsInCudaPlace(in_tensors),
+      true,
+      platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
+  PADDLE_ENFORCE_EQ(
+      CheckTensorsInCudaPlace(out_tensors),
+      true,
+      platform::errors::InvalidArgument("All outputs should be in CudaPlace."));
+  return Collective(
+      in_tensors,
+      out_tensors,
+      [&](phi::DenseTensor& input,
+          phi::DenseTensor& output,
+          ncclComm_t comm,
+          const gpuStream_t& stream) {
+        return platform::dynload::ncclAllGather(
+            GetPointerByOffset(input.data(), offset, input.dtype()),
+            output.data(),
+            length,
+            platform::ToNCCLDataType(input.dtype()),
+            comm,
+            stream);
+      },
+      CommType::ALLGATHER,
+      sync_op,
+      use_calc_stream);
+}
+
 std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllToAll(
     std::vector<phi::DenseTensor>& in_tensors,
     std::vector<phi::DenseTensor>& out_tensors) {
diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h
index 24ba7c86b18..6d15f6ebdef 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h
+++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h
@@ -182,6 +182,14 @@ class ProcessGroupNCCL : public ProcessGroupStream {
       int offset,
       int length) override;
 
+  std::shared_ptr<ProcessGroup::Task> AllGather_Partial(
+      std::vector<phi::DenseTensor>& in_tensors,
+      std::vector<phi::DenseTensor>& out_tensors,
+      int offset,
+      int length,
+      bool sync_op,
+      bool use_calc_stream) override;
+
   std::shared_ptr<ProcessGroup::Task> AllToAll(
       std::vector<phi::DenseTensor>& in,
       std::vector<phi::DenseTensor>& out) override;
diff --git a/paddle/fluid/distributed/collective/ProcessGroupStream.cc b/paddle/fluid/distributed/collective/ProcessGroupStream.cc
index 43ca0bbb36d..222fe03b60b 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupStream.cc
+++ b/paddle/fluid/distributed/collective/ProcessGroupStream.cc
@@ -154,5 +154,30 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Recv_Partial(
       "ProcessGroup%s does not support do recv_partial", GetBackendName()));
 }
 
+std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::AllGather_Partial(
+    std::vector<phi::DenseTensor>& in_tensors,
+    std::vector<phi::DenseTensor>& out_tensors,
+    int offset,
+    int length,
+    bool sync_op) {
+  return AllGather_Partial(in_tensors,
+                           out_tensors,
+                           offset,
+                           length,
+                           sync_op,
+                           /*use_calc_stream*/ false);
+}
+
+std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::AllGather_Partial(
+    std::vector<phi::DenseTensor>& in_tensors,
+    std::vector<phi::DenseTensor>& out_tensors,
+    int offset,
+    int length,
+    bool sync_op,
+    bool use_calc_stream) {
+  PADDLE_THROW(platform::errors::InvalidArgument(
+      "ProcessGroup%s does not support do all_gather_partial", GetBackendName()));
+}
+
 }  // namespace distributed
 }  // namespace paddle
diff --git a/paddle/fluid/distributed/collective/ProcessGroupStream.h b/paddle/fluid/distributed/collective/ProcessGroupStream.h
index f8ab562ad07..1162c3e0509 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupStream.h
+++ b/paddle/fluid/distributed/collective/ProcessGroupStream.h
@@ -132,6 +132,21 @@ class ProcessGroupStream : public ProcessGroup {
       int length,
       bool sync_op,
       bool use_calc_stream);
+
+  std::shared_ptr<ProcessGroup::Task> AllGather_Partial(
+      std::vector<phi::DenseTensor>& in_tensors,
+      std::vector<phi::DenseTensor>& out_tensors,
+      int offset,
+      int length,
+      bool sync_op) override;
+
+  virtual std::shared_ptr<ProcessGroup::Task> AllGather_Partial(
+      std::vector<phi::DenseTensor>& in_tensors,   // NOLINT
+      std::vector<phi::DenseTensor>& out_tensors,  // NOLINT
+      int offset,
+      int length,
+      bool sync_op,
+      bool use_calc_stream);
 };
 
 }  // namespace distributed
diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc
index 6a2d099ac17..39986c604c0 100644
--- a/paddle/fluid/pybind/distributed_py.cc
+++ b/paddle/fluid/pybind/distributed_py.cc
@@ -621,6 +621,37 @@ void BindDistributed(py::module *m) {
               py::arg("op"),
               py::call_guard<py::gil_scoped_release>())
 
+          .def(
+              "all_gather_partial_on_calc_stream",
+              [](distributed::ProcessGroupStream &self,
+                 py::handle py_in_tensor,
+                 py::handle py_out_tensor,
+                 int nranks,
+                 int rank_id) {
+                auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0);
+                auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0);
+                auto in_dense = std::dynamic_pointer_cast<phi::DenseTensor>(
+                    in_tensor.impl());
+                auto out_dense = std::dynamic_pointer_cast<phi::DenseTensor>(
+                    out_tensor.impl());
+                std::vector<phi::DenseTensor> in_tensors = {*in_dense};
+                std::vector<phi::DenseTensor> out_tensors = {*out_dense};
+                int numel = (*in_dense).numel();
+                int send_numel = numel / nranks;
+                int offset = send_numel * rank_id;
+                return self.AllGather_Partial(in_tensors,
+                                              out_tensors,
+                                              offset,
+                                              send_numel,
+                                              /*sync_op*/ true,
+                                              /*use_calc_stream*/ true);
+              },
+              py::arg("in"),
+              py::arg("out"),
+              py::arg("num"),
+              py::arg("id"),
+              py::call_guard<py::gil_scoped_release>())
+
           .def(
               "send_on_calc_stream",
               [](distributed::ProcessGroupStream &self,
diff --git a/python/paddle/distributed/fleet/layers/mpu/mp_ops.py b/python/paddle/distributed/fleet/layers/mpu/mp_ops.py
index dc4dc05c7ba..3506851e1db 100644
--- a/python/paddle/distributed/fleet/layers/mpu/mp_ops.py
+++ b/python/paddle/distributed/fleet/layers/mpu/mp_ops.py
@@ -43,7 +43,26 @@ def _c_identity(tensor, group=None):
         return
     ring_id = 0 if group is None else group.id
 
-    if _non_static_mode():
+    if in_dygraph_mode():
+        from paddle.autograd import PyLayer
+
+        class c_identity_eager(PyLayer):
+
+            @staticmethod
+            def forward(ctx, tensor):
+                return _legacy_C_ops.c_identity(tensor, 'use_calc_stream', True,
+                                                'ring_id', group.id,
+                                                'use_model_parallel', True)
+
+            @staticmethod
+            def backward(ctx, dy):
+                op_type = collective._get_reduce_op(ReduceOp.SUM, "_c_identity")
+                group.process_group.allreduce_on_calc_stream(dy, op_type)
+                return dy
+
+        return c_identity_eager.apply(tensor)
+
+    elif _in_legacy_dygraph():
         return _legacy_C_ops.c_identity(tensor, 'use_calc_stream', True,
                                         'ring_id', ring_id,
                                         'use_model_parallel', True)
diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py b/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py
index e2ca6f8d2a0..c1cf0527e1b 100644
--- a/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py
+++ b/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py
@@ -173,7 +173,9 @@ def _partial_send_op(tensor, group, use_calc_stream, ring_id, dst, nranks,
     elif in_dygraph_mode():
         group = paddle.distributed.collective._get_default_group(
         ) if group is None else group
-        return group.process_group.send_partial(tensor, dst, nranks, rank_id)
+        comm_op = group.process_group.send_partial_on_calc_stream \
+            if use_calc_stream else group.process_group.send_partial
+        return comm_op(tensor, dst, nranks, rank_id)
 
 
 def send_partial(tensor,
@@ -212,12 +214,9 @@ def _partial_recv_op(tensor, group, use_calc_stream, ring_id, src, nranks,
     elif in_dygraph_mode():
         group = paddle.distributed.collective._get_default_group(
         ) if group is None else group
-        task = group.process_group.recv_partial(tensor, src, nranks, rank_id)
-        if use_calc_stream:
-            task.wait()
-            return None
-        else:
-            return task
+        comm_op = group.process_group.recv_partial_on_calc_stream \
+            if use_calc_stream else group.process_group.recv_partial
+        return comm_op(tensor, src, nranks, rank_id)
 
 
 def recv_partial(tensor,
@@ -255,13 +254,9 @@ def _partial_allgather_op(tensor, group, use_calc_stream, ring_id, nranks,
     elif in_dygraph_mode():
         group = paddle.distributed.collective._get_default_group(
         ) if group is None else group
-        task = group.process_group.all_gather_partial(tensor, tensor, nranks,
-                                                      rank_id)
-        if use_calc_stream:
-            task.wait()
-            return None
-        else:
-            return task
+        comm_op = group.process_group.all_gather_partial_on_calc_stream \
+            if use_calc_stream else group.process_group.all_gather_partial
+        return comm_op(tensor, tensor, nranks, rank_id)
 
 
 def allgather_partial(tensor,
--
GitLab
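
The Python-side change in p2p_communication.py is mechanical: each _partial_*_op helper now picks between a *_on_calc_stream method (runs synchronously on the calculation stream, nothing to wait on) and the plain method (returns a task the caller must wait on), instead of always calling the plain method and conditionally calling task.wait(). The sketch below reproduces only that dispatch pattern outside Paddle; _FakeTask, _FakeProcessGroup, and their printed messages are illustrative stand-ins and not Paddle APIs.

# Minimal, self-contained sketch (not Paddle code) of the calc-stream dispatch
# introduced in _partial_send_op/_partial_recv_op/_partial_allgather_op above.


class _FakeTask:
    """Stands in for the async task returned by the non-calc-stream ops."""

    def wait(self):
        print("waited on communication task")


class _FakeProcessGroup:
    """Stands in for a process group exposing both variants of send_partial."""

    def send_partial(self, tensor, dst, nranks, rank_id):
        # Asynchronous variant: runs on a separate communication stream and
        # returns a task that the caller must wait on.
        print(f"send_partial -> dst={dst}, shard {rank_id}/{nranks}")
        return _FakeTask()

    def send_partial_on_calc_stream(self, tensor, dst, nranks, rank_id):
        # Calc-stream variant: enqueued on the calculation stream, so there is
        # no task to wait on and no extra stream synchronization.
        print(f"send_partial_on_calc_stream -> dst={dst}, shard {rank_id}/{nranks}")


def _partial_send_op(tensor, group, use_calc_stream, dst, nranks, rank_id):
    # Same selection logic as the patched helper: choose the calc-stream
    # method when use_calc_stream is True, otherwise the async one.
    comm_op = group.send_partial_on_calc_stream \
        if use_calc_stream else group.send_partial
    return comm_op(tensor, dst, nranks, rank_id)


if __name__ == "__main__":
    pg = _FakeProcessGroup()
    # Calc-stream path: nothing is returned, so callers skip task.wait().
    _partial_send_op(None, pg, use_calc_stream=True, dst=1, nranks=2, rank_id=0)
    # Async path: the caller still owns the wait, as before the patch.
    task = _partial_send_op(None, pg, use_calc_stream=False, dst=1, nranks=2,
                            rank_id=0)
    task.wait()

In the pipeline-parallel p2p path this removes the per-call wait on a separately scheduled communication task, which is the behavior change the commit title credits for the pp+mp performance improvement.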