diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt index 85efa52c3196a7b1e0684afa7d870512935012fe..de9059228a61bd06a60093595987a63248b089cd 100644 --- a/paddle/fluid/distributed/collective/CMakeLists.txt +++ b/paddle/fluid/distributed/collective/CMakeLists.txt @@ -3,13 +3,13 @@ cc_library( SRCS ProcessGroup.cc DEPS dense_tensor) cc_library( - processgroup_stream - SRCS ProcessGroupStream.cc + process_group_stream + SRCS process_group_stream.cc DEPS dense_tensor) cc_library( eager_reducer SRCS reducer.cc - DEPS eager_api processgroup processgroup_stream phi_api string_helper) + DEPS eager_api processgroup process_group_stream phi_api string_helper) if(WITH_DISTRIBUTE) cc_library( @@ -20,10 +20,10 @@ endif() if(WITH_NCCL OR WITH_RCCL) cc_library( - processgroup_nccl - SRCS ProcessGroupNCCL.cc NCCLTools.cc Common.cc check.cc + process_group_nccl + SRCS process_group_nccl.cc nccl_tools.cc Common.cc check.cc DEPS processgroup - processgroup_stream + process_group_stream place enforce collective_helper diff --git a/paddle/fluid/distributed/collective/ProcessGroup.h b/paddle/fluid/distributed/collective/ProcessGroup.h index 7abecd36e3d00f5f5e27c627764dbdcea6a3d5ea..3792e8bc835466d97add6aafabc22cc1c38f41df 100644 --- a/paddle/fluid/distributed/collective/ProcessGroup.h +++ b/paddle/fluid/distributed/collective/ProcessGroup.h @@ -97,6 +97,17 @@ class ProcessGroup { GetBackendName())); } + virtual std::shared_ptr AllGather( + phi::DenseTensor* out_tensor, + const phi::DenseTensor& in_tensor, + bool sync_op) { + return AllGather(out_tensor, + in_tensor, + /*offset*/ 0, + /*numel*/ -1, // -1 indicates the whole tensor + sync_op); + } + virtual std::shared_ptr AllGather( phi::DenseTensor* out_tensor, const phi::DenseTensor& in_tensor, @@ -175,6 +186,16 @@ class ProcessGroup { GetBackendName())); } + virtual std::shared_ptr Recv(phi::DenseTensor* tensor, + int src_rank, + bool sync_op) { + return Recv(tensor, + 
src_rank, + /*offset*/ 0, + /*numel*/ -1, // -1 indicates the whole tensor + sync_op); + } + virtual std::shared_ptr Recv(phi::DenseTensor* tensor, int src_rank, int64_t offset, @@ -185,6 +206,15 @@ class ProcessGroup { GetBackendName())); } + virtual std::shared_ptr Send( + const phi::DenseTensor& tensor, int dst_rank, bool sync_op) { + return Send(tensor, + dst_rank, + /*offset*/ 0, + /*numel*/ -1, // -1 indicates the whole tensor + sync_op); + } + virtual std::shared_ptr Send( const phi::DenseTensor& tensor, int dst_rank, diff --git a/paddle/fluid/distributed/collective/ProcessGroupBKCL.h b/paddle/fluid/distributed/collective/ProcessGroupBKCL.h index 79d97609d9274e86349c1e06e8166112ecaca071..822f690cb802f8fd2e864c109756f54b09f3fa33 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupBKCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupBKCL.h @@ -20,7 +20,7 @@ #include #include -#include "paddle/fluid/distributed/collective/ProcessGroupStream.h" +#include "paddle/fluid/distributed/collective/process_group_stream.h" #include "paddle/fluid/distributed/store/store.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/distributed/collective/check.cc b/paddle/fluid/distributed/collective/check.cc index 9a2ca064024f4cfc37723e06fd34b9fe7bc35ac4..151d7f3574919d1c0e7a2a7a7e45a7b748ce4905 100644 --- a/paddle/fluid/distributed/collective/check.cc +++ b/paddle/fluid/distributed/collective/check.cc @@ -14,7 +14,7 @@ #include "paddle/fluid/distributed/collective/check.h" -#include "paddle/fluid/distributed/collective/NCCLTools.h" +#include "paddle/fluid/distributed/collective/nccl_tools.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" #include "paddle/phi/core/dense_tensor.h" diff --git a/paddle/fluid/distributed/collective/NCCLTools.cc b/paddle/fluid/distributed/collective/nccl_tools.cc similarity index 96% rename from 
paddle/fluid/distributed/collective/NCCLTools.cc rename to paddle/fluid/distributed/collective/nccl_tools.cc index 47c0f547ee79eaf8a6cc871640e899a2beb69371..ffb51d706d9ef04c116c27230a05554b21cbd931 100644 --- a/paddle/fluid/distributed/collective/NCCLTools.cc +++ b/paddle/fluid/distributed/collective/nccl_tools.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/distributed/collective/NCCLTools.h" +#include "paddle/fluid/distributed/collective/nccl_tools.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/distributed/collective/NCCLTools.h b/paddle/fluid/distributed/collective/nccl_tools.h similarity index 100% rename from paddle/fluid/distributed/collective/NCCLTools.h rename to paddle/fluid/distributed/collective/nccl_tools.h diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/process_group_nccl.cc similarity index 99% rename from paddle/fluid/distributed/collective/ProcessGroupNCCL.cc rename to paddle/fluid/distributed/collective/process_group_nccl.cc index 13de2625a6eeea292abde64eeccec75edaca983a..0859708f92cf4be5337175421e42e1625f0df15f 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/process_group_nccl.cc @@ -12,11 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h" +#include "paddle/fluid/distributed/collective/process_group_nccl.h" #include "paddle/fluid/distributed/collective/Common.h" -#include "paddle/fluid/distributed/collective/NCCLTools.h" #include "paddle/fluid/distributed/collective/check.h" +#include "paddle/fluid/distributed/collective/nccl_tools.h" #include "paddle/fluid/distributed/collective/utils.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/process_group_nccl.h similarity index 98% rename from paddle/fluid/distributed/collective/ProcessGroupNCCL.h rename to paddle/fluid/distributed/collective/process_group_nccl.h index 3ce77297f56f187472ed58963c3f75b9080f05f1..816a0d2ec90b65d8178fd5808c28ea61d5223c44 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h +++ b/paddle/fluid/distributed/collective/process_group_nccl.h @@ -20,7 +20,7 @@ #include #include -#include "paddle/fluid/distributed/collective/ProcessGroupStream.h" +#include "paddle/fluid/distributed/collective/process_group_stream.h" #include "paddle/fluid/distributed/store/store.h" #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/device_event.h" @@ -29,7 +29,7 @@ #include "paddle/phi/core/device_context.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/distributed/collective/NCCLTools.h" +#include "paddle/fluid/distributed/collective/nccl_tools.h" #endif #ifdef PADDLE_WITH_RCCL diff --git a/paddle/fluid/distributed/collective/ProcessGroupStream.cc b/paddle/fluid/distributed/collective/process_group_stream.cc similarity index 86% rename from paddle/fluid/distributed/collective/ProcessGroupStream.cc rename to paddle/fluid/distributed/collective/process_group_stream.cc index 
e1ee425f3f8888da27c966b6ef81058294e352db..2b69cf51fe630d91cb2d7ef653c637311d52b95b 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupStream.cc +++ b/paddle/fluid/distributed/collective/process_group_stream.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/distributed/collective/ProcessGroupStream.h" +#include "paddle/fluid/distributed/collective/process_group_stream.h" namespace paddle { namespace distributed { @@ -40,6 +40,19 @@ std::shared_ptr ProcessGroupStream::AllGather( /*use_calc_stream*/ false); } +std::shared_ptr ProcessGroupStream::AllGather( + phi::DenseTensor* out_tensor, + const phi::DenseTensor& in_tensor, + bool sync_op, + bool use_calc_stream) { + return AllGather(out_tensor, + in_tensor, + /*offset*/ 0, + /*numel*/ -1, // -1 indicates the whole tensor + sync_op, + use_calc_stream); +} + std::shared_ptr ProcessGroupStream::AllGather( phi::DenseTensor* out_tensor, const phi::DenseTensor& in_tensor, @@ -200,6 +213,19 @@ std::shared_ptr ProcessGroupStream::Recv( /*use_calc_stream*/ false); } +std::shared_ptr ProcessGroupStream::Recv( + phi::DenseTensor* tensor, + int src_rank, + bool sync_op, + bool use_calc_stream) { + return Recv(tensor, + src_rank, + /*offset*/ 0, + /*numel*/ -1, // -1 indicates receiving the whole tensor + sync_op, + use_calc_stream); +} + std::shared_ptr ProcessGroupStream::Recv( phi::DenseTensor* tensor, int src_rank, @@ -225,6 +251,19 @@ std::shared_ptr ProcessGroupStream::Send( /*use_calc_stream*/ false); } +std::shared_ptr ProcessGroupStream::Send( + const phi::DenseTensor& tensor, + int dst_rank, + bool sync_op, + bool use_calc_stream) { + return Send(tensor, + dst_rank, + /*offset*/ 0, + /*numel*/ -1, // -1 indicates sending the whole tensor + sync_op, + use_calc_stream); +} + std::shared_ptr ProcessGroupStream::Send( const phi::DenseTensor& tensor, int dst_rank, diff --git
a/paddle/fluid/distributed/collective/ProcessGroupStream.h b/paddle/fluid/distributed/collective/process_group_stream.h similarity index 90% rename from paddle/fluid/distributed/collective/ProcessGroupStream.h rename to paddle/fluid/distributed/collective/process_group_stream.h index 4ad75be3658b97ed28aa3de91d5ceb4652fe6d1a..d48ff0f24f83bd3b03e9d8965c351af9c8a87f64 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupStream.h +++ b/paddle/fluid/distributed/collective/process_group_stream.h @@ -69,6 +69,12 @@ class ProcessGroupStream : public ProcessGroup { int64_t numel, bool sync_op) override; + virtual std::shared_ptr AllGather( + phi::DenseTensor* out_tensor, + const phi::DenseTensor& in_tensor, + bool sync_op, + bool use_calc_stream); + virtual std::shared_ptr AllGather( phi::DenseTensor* out_tensor, const phi::DenseTensor& in_tensor, @@ -161,6 +167,11 @@ class ProcessGroupStream : public ProcessGroup { int64_t numel, bool sync_op) override; + virtual std::shared_ptr Recv(phi::DenseTensor* tensor, + int src_rank, + bool sync_op, + bool use_calc_stream); + virtual std::shared_ptr Recv(phi::DenseTensor* tensor, int src_rank, int64_t offset, @@ -174,6 +185,11 @@ class ProcessGroupStream : public ProcessGroup { int64_t numel, bool sync_op) override; + std::shared_ptr Send(const phi::DenseTensor& tensor, + int dst_rank, + bool sync_op, + bool use_calc_stream); + virtual std::shared_ptr Send( const phi::DenseTensor& tensor, int dst_rank, diff --git a/paddle/fluid/operators/fused/fused_attention_op.cu b/paddle/fluid/operators/fused/fused_attention_op.cu index 67ee877f72c8a73c9bee4f9c2189dabc141bac1f..559a2afb85f45f050192d582f1bf15e6868550e3 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_attention_op.cu @@ -30,7 +30,7 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/math_function.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h" +#include "paddle/fluid/distributed/collective/process_group_nccl.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cu b/paddle/fluid/operators/fused/fused_feedforward_op.cu index 28a9cb167e093c62a1966800958c04606205cc88..925ec7d2060a48e1aa47020e7fd2d36d1800a314 100644 --- a/paddle/fluid/operators/fused/fused_feedforward_op.cu +++ b/paddle/fluid/operators/fused/fused_feedforward_op.cu @@ -23,7 +23,7 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/elementwise_functor.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h" +#include "paddle/fluid/distributed/collective/process_group_nccl.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 973ef8a4a79992df7db279885dd52dc7af9ddb3b..37e085b82bc08277d0c4bbafe0dc1336cf605bb3 100755 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -157,7 +157,7 @@ endif() if(WITH_PYTHON) set(PYBIND_DEPS ${PYBIND_DEPS} processgroup eager_reducer) if(WITH_NCCL OR WITH_RCCL) - set(PYBIND_DEPS ${PYBIND_DEPS} processgroup_nccl) + set(PYBIND_DEPS ${PYBIND_DEPS} process_group_nccl) endif() if(WITH_XPU_BKCL) set(PYBIND_DEPS ${PYBIND_DEPS} processgroup_bkcl) diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index c5d03ce8853e312540d5d6f5a7f090e5d3852992..e9d59132d0e4605f5f160b53c0d94157a5708bae 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -22,8 +22,8 @@ 
limitations under the License. */ #endif #include "paddle/fluid/distributed/collective/ProcessGroup.h" -#include "paddle/fluid/distributed/collective/ProcessGroupStream.h" #include "paddle/fluid/distributed/collective/Types.h" +#include "paddle/fluid/distributed/collective/process_group_stream.h" #include "paddle/fluid/distributed/collective/reducer.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/tensor.h" @@ -34,7 +34,7 @@ limitations under the License. */ #include "paddle/phi/api/all.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h" +#include "paddle/fluid/distributed/collective/process_group_nccl.h" #endif #if defined(PADDLE_WITH_MPI) @@ -169,9 +169,7 @@ void BindDistributed(py::module *m) { auto p_dense = std::dynamic_pointer_cast(tensor.impl()); auto out_dense = *p_dense; - // numel == -1 indicates sending the whole tensor - return self.Send( - out_dense, dst, /*offset*/ 0, /*numel*/ -1, sync_op); + return self.Send(out_dense, dst, sync_op); }, py::arg("tensor"), py::arg("dst"), @@ -215,9 +213,7 @@ void BindDistributed(py::module *m) { auto p_dense = std::dynamic_pointer_cast(tensor.impl()); auto *in_dense = p_dense.get(); - // numel == -1 indicates receiving the whole tensor - return self.Recv( - in_dense, src, /*offset*/ 0, /*numel*/ -1, sync_op); + return self.Recv(in_dense, src, sync_op); }, py::arg("tensor"), py::arg("src"), @@ -270,11 +266,7 @@ void BindDistributed(py::module *m) { auto in_dense = *p_in_tensor; auto *dev_ctx = self.GetDeviceContext(in_tensor.place()); - auto task = self.AllGather(out_dense, - in_dense, - /*offset*/ 0, - /*numel*/ -1, - sync_op); + auto task = self.AllGather(out_dense, in_dense, sync_op); SplitTensor(*dev_ctx, *out_dense, &out_tensor_list); task->UpdateWaitChain(*dev_ctx); return task; @@ -300,11 +292,7 @@ void BindDistributed(py::module *m) { in_tensor.impl()); auto in_dense = *p_in_tensor; - return 
self.AllGather(out_dense, - in_dense, - /*offset*/ 0, - /*numel*/ -1, - sync_op); + return self.AllGather(out_dense, in_dense, sync_op); }, py::arg("out"), py::arg("in"), @@ -771,8 +759,6 @@ void BindDistributed(py::module *m) { auto *dev_ctx = self.GetDeviceContext(in_tensor.place(), true); auto task = self.AllGather(out_dense, in_dense, - /*offset*/ 0, - /*numel*/ -1, /*sync_op*/ true, /*use_calc_stream*/ true); SplitTensor(*dev_ctx, *out_dense, &out_tensor_list); @@ -799,8 +785,6 @@ void BindDistributed(py::module *m) { return self.AllGather(out_dense, in_dense, - /*offset*/ 0, - /*numel*/ -1, /*sync_op*/ true, /*use_calc_stream*/ true); }, @@ -1127,11 +1111,8 @@ void BindDistributed(py::module *m) { auto p_dense = std::dynamic_pointer_cast(tensor.impl()); auto out_dense = *p_dense; - // numel == -1 indicates sending the whole tensor return self.Send(out_dense, dst, - /*offset*/ 0, - /*numel*/ -1, /*sync_op*/ true, /*use_calc_stream*/ true); }, @@ -1177,11 +1158,8 @@ void BindDistributed(py::module *m) { auto p_dense = std::dynamic_pointer_cast(tensor.impl()); auto *in_dense = p_dense.get(); - // numel == -1 indicates receiving the whole tensor return self.Recv(in_dense, src, - /*offset*/ 0, - /*numel*/ -1, /*sync_op*/ true, /*use_calc_stream*/ true); }, diff --git a/paddle/phi/backends/CMakeLists.txt b/paddle/phi/backends/CMakeLists.txt index c35bd2bc456f3296bbaabae000517759d4db87b1..c9e110ae7b8e40a17ac04d4771c770460f47106a 100644 --- a/paddle/phi/backends/CMakeLists.txt +++ b/paddle/phi/backends/CMakeLists.txt @@ -67,7 +67,7 @@ endif() set(COMM_UTILS_DEPS processgroup) if(WITH_NCCL OR WITH_RCCL) - set(COMM_UTILS_DEPS ${PROCESS_GROUP_UTILS_DEPS} processgroup_nccl) + set(COMM_UTILS_DEPS ${PROCESS_GROUP_UTILS_DEPS} process_group_nccl) endif() if(WITH_CUSTOM_DEVICE) set(COMM_UTILS_DEPS ${PROCESS_GROUP_UTILS_DEPS} processgroup_custom) diff --git a/paddle/phi/backends/processgroup_comm_utils.cc b/paddle/phi/backends/processgroup_comm_utils.cc index 
841b88d752e9a3547164d21171f0e7def326707d..450c17638712228ff86f64d2dc65412894fb4206 100644 --- a/paddle/phi/backends/processgroup_comm_utils.cc +++ b/paddle/phi/backends/processgroup_comm_utils.cc @@ -15,7 +15,7 @@ #include "paddle/fluid/distributed/collective/ProcessGroup.h" #include "paddle/phi/backends/c_comm_lib.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h" +#include "paddle/fluid/distributed/collective/process_group_nccl.h" #endif #if defined(PADDLE_WITH_CUSTOM_DEVICE) #include "paddle/fluid/distributed/collective/ProcessGroupCustom.h" diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 808b18bb02d45f03e82c6df9fabc563c92288282..abe35f284d66407dae03305d77860625066f34cb 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -80,7 +80,7 @@ set(COMMON_KERNEL_DEPS set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} processgroup) if(WITH_NCCL OR WITH_RCCL) - set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} processgroup_nccl) + set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} process_group_nccl) endif() set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} processgroup_comm_utils) if(WITH_CUDNN_FRONTEND) diff --git a/paddle/phi/kernels/gpu/sync_batch_norm_utils.h b/paddle/phi/kernels/gpu/sync_batch_norm_utils.h index cfb2758e62def45b88f634614ed04a63105c3208..f99da25dec9600864796c14c5e8e1d84d22caeae 100644 --- a/paddle/phi/kernels/gpu/sync_batch_norm_utils.h +++ b/paddle/phi/kernels/gpu/sync_batch_norm_utils.h @@ -28,7 +28,7 @@ namespace cub = hipcub; #endif #include "paddle/fluid/distributed/collective/ProcessGroup.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h" +#include "paddle/fluid/distributed/collective/process_group_nccl.h" #endif #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h"