From 1e96575630b3270d22b50cf774b9cb952f09339e Mon Sep 17 00:00:00 2001
From: LiYuRio <63526175+LiYuRio@users.noreply.github.com>
Date: Fri, 12 Aug 2022 21:56:40 +0800
Subject: [PATCH] fix nccl comm in sync_bn (#45100)

---
 .../distributed/collective/CMakeLists.txt     |  5 +++--
 paddle/fluid/distributed/collective/Common.h  |  1 -
 .../collective/ProcessGroupNCCL.cc            | 19 +++++++++++++++----
 .../distributed/collective/ProcessGroupNCCL.h |  2 ++
 paddle/phi/kernels/CMakeLists.txt             |  5 +++++
 .../phi/kernels/gpu/sync_batch_norm_kernel.cu | 15 ++++++++++++++-
 .../phi/kernels/gpu/sync_batch_norm_utils.h   | 19 ++++++++++++++++++-
 7 files changed, 57 insertions(+), 9 deletions(-)

diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt
index 116b3db40d6..b6b5ddc179f 100644
--- a/paddle/fluid/distributed/collective/CMakeLists.txt
+++ b/paddle/fluid/distributed/collective/CMakeLists.txt
@@ -1,7 +1,7 @@
 cc_library(
   processgroup
   SRCS ProcessGroup.cc
-  DEPS phi_api eager_api)
+  DEPS dense_tensor)
 cc_library(
   eager_reducer
   SRCS reducer.cc
@@ -18,7 +18,8 @@ if(WITH_NCCL OR WITH_RCCL)
   cc_library(
     processgroup_nccl
     SRCS ProcessGroupNCCL.cc NCCLTools.cc Common.cc
-    DEPS place enforce collective_helper device_context phi_api eager_api)
+    DEPS processgroup place enforce collective_helper device_context
+         dense_tensor)
   if(WITH_DISTRIBUTE AND WITH_PSCORE)
     cc_library(
       processgroup_heter
diff --git a/paddle/fluid/distributed/collective/Common.h b/paddle/fluid/distributed/collective/Common.h
index c01bd23fe12..4c6c42bd86d 100644
--- a/paddle/fluid/distributed/collective/Common.h
+++ b/paddle/fluid/distributed/collective/Common.h
@@ -15,7 +15,6 @@
 #pragma once
 
 #include "paddle/fluid/platform/place.h"
-#include "paddle/phi/api/include/api.h"
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/core/dense_tensor.h"
 namespace paddle {
diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
index 168548cf9ba..4a52ece7839 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
+++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
@@ -18,7 +18,7 @@
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/device/gpu/nccl_helper.h"
 #include "paddle/fluid/platform/place.h"
-#include "paddle/phi/api/include/api.h"
+#include "paddle/phi/api/lib/utils/allocator.h"
 #include "paddle/phi/common/place.h"
 
 DECLARE_bool(nccl_blocking_wait);
@@ -427,9 +427,10 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Barrier(
   platform::CUDADeviceGuard gpuGuard;
   for (auto& place : places) {
     gpuGuard.SetDeviceIndex(place.GetDeviceId());
-    auto dt = full({1}, 0, phi::DataType::FLOAT32, place);
-    barrierTensors.push_back(
-        *std::dynamic_pointer_cast<phi::DenseTensor>(dt.impl()));
+    phi::DenseTensorMeta meta(phi::DataType::FLOAT32, phi::DDim({1}));
+    auto allocator = std::unique_ptr<phi::Allocator>(
+        new paddle::experimental::DefaultAllocator(place));
+    barrierTensors.emplace_back(allocator.get(), meta);
   }
   auto task = ProcessGroupNCCL::AllReduce(barrierTensors, barrierTensors);
   auto nccl_task = dynamic_cast<ProcessGroupNCCL::NCCLTask*>(task.get());
@@ -894,5 +895,15 @@ void ProcessGroupNCCL::GroupEnd() {
   PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
 }
 
+ncclComm_t ProcessGroupNCCL::NCCLComm(const Place& place) const {
+  std::vector<Place> places = {place};
+  const auto& iter = places_to_ncclcomm_.find(GetKeyFromPlaces(places));
+  PADDLE_ENFORCE_NE(iter,
+                    places_to_ncclcomm_.end(),
+                    platform::errors::InvalidArgument(
+                        "Cannot find nccl comm in process group."));
+  return iter->second[0]->GetNcclComm();
+}
+
 }  // namespace distributed
 }  // namespace paddle
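
The core addition above is the `NCCLComm(place)` accessor, which exposes the raw communicator a `ProcessGroupNCCL` holds for a given place. A minimal consumption sketch, assuming a group that has already created a communicator for `place` through an earlier collective (the `RunRawAllReduce` wrapper and `pg` handle are illustrative, not part of the patch):

    // Sketch only: hand the group's communicator to a raw NCCL call.
    #include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"
    #include "paddle/fluid/platform/dynload/nccl.h"
    #include "paddle/fluid/platform/enforce.h"

    void RunRawAllReduce(paddle::distributed::ProcessGroupNCCL* pg,
                         const phi::Place& place,
                         float* buf,
                         size_t n,
                         cudaStream_t stream) {
      // Throws InvalidArgument if the group has not yet built a communicator
      // for this place (communicators are created lazily by the first
      // collective that runs on it).
      ncclComm_t comm = pg->NCCLComm(place);
      // In-place all-reduce on the caller-provided stream.
      PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::ncclAllReduce(
          buf, buf, n, ncclFloat, ncclSum, comm, stream));
    }

Returning the raw `ncclComm_t` keeps consumers decoupled from the Python-facing phi API, which is also why the `phi_api`/`eager_api` dependencies are dropped from the collective CMake targets above.
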
diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h
index 5adb6867eb8..dfd07c43811 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h
+++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h
@@ -157,6 +157,8 @@ class ProcessGroupNCCL : public ProcessGroup {
 
   static void GroupEnd();
 
+  ncclComm_t NCCLComm(const Place& place) const;
+
  protected:
   virtual std::shared_ptr<ProcessGroupNCCL::NCCLTask> CreateTask(
       std::vector<Place> places,
diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt
index ca98e43a503..d40b6f589c5 100644
--- a/paddle/phi/kernels/CMakeLists.txt
+++ b/paddle/phi/kernels/CMakeLists.txt
@@ -84,6 +84,11 @@ set(COMMON_KERNEL_DEPS
     gpc
     utf8proc)
 
+set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} processgroup)
+if(WITH_NCCL OR WITH_RCCL)
+  set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} processgroup_nccl)
+endif()
+
 copy_if_different(${kernel_declare_file} ${kernel_declare_file_final})
 
 file(GLOB kernel_h "*.h" "selected_rows/*.h" "sparse/*.h" "strings/*.h")
diff --git a/paddle/phi/kernels/gpu/sync_batch_norm_kernel.cu b/paddle/phi/kernels/gpu/sync_batch_norm_kernel.cu
index ed8a8c33344..1d8d8a6dfac 100644
--- a/paddle/phi/kernels/gpu/sync_batch_norm_kernel.cu
+++ b/paddle/phi/kernels/gpu/sync_batch_norm_kernel.cu
@@ -100,7 +100,19 @@ void SyncBatchNormKernel(const Context &ctx,
   }
 
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-  auto *comm = ctx.nccl_comm();
+  int global_gid = 0;
+  ncclComm_t comm = nullptr;
+
+  if (paddle::distributed::ProcessGroupMapFromGid::getInstance()->has(
+          global_gid)) {
+    auto *nccl_pg = static_cast<paddle::distributed::ProcessGroupNCCL *>(
+        paddle::distributed::ProcessGroupMapFromGid::getInstance()->get(
+            global_gid));
+    comm = nccl_pg->NCCLComm(x.place());
+  } else {
+    comm = ctx.nccl_comm();
+  }
+
   if (comm) {
     int dtype = paddle::platform::ToNCCLDataType(
         paddle::framework::TransToProtoVarType(mean_out->dtype()));
@@ -113,6 +125,7 @@ void SyncBatchNormKernel(const Context &ctx,
                                   ncclSum,
                                   comm,
                                   stream));
+    VLOG(3) << "Sync result using all reduce";
   }
 #endif
 
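
The forward kernel above and the grad functor below resolve the communicator the same way: if a process group is registered under `global_gid` 0 (the default group created by the dynamic-graph distributed runtime), use its communicator; otherwise fall back to the one attached to the device context, so the legacy execution path is unaffected. Distilled into a standalone helper, the pattern is (a sketch; `SelectNcclComm` is a hypothetical name, not part of the patch):

    // Sketch of the communicator-selection pattern introduced by this patch.
    #include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"

    template <typename Context>
    ncclComm_t SelectNcclComm(const Context& ctx, const phi::Place& place) {
      auto* pg_map = paddle::distributed::ProcessGroupMapFromGid::getInstance();
      const int global_gid = 0;  // id of the default (global) process group
      if (pg_map->has(global_gid)) {
        // New dynamic-graph path: communicator owned by ProcessGroupNCCL.
        auto* nccl_pg = static_cast<paddle::distributed::ProcessGroupNCCL*>(
            pg_map->get(global_gid));
        return nccl_pg->NCCLComm(place);
      }
      // Legacy path: communicator bound to the device context.
      return ctx.nccl_comm();
    }
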
diff --git a/paddle/phi/kernels/gpu/sync_batch_norm_utils.h b/paddle/phi/kernels/gpu/sync_batch_norm_utils.h
index 37b9bca73a8..27c3cd4ad44 100644
--- a/paddle/phi/kernels/gpu/sync_batch_norm_utils.h
+++ b/paddle/phi/kernels/gpu/sync_batch_norm_utils.h
@@ -26,6 +26,10 @@ limitations under the License. */
 #include <hipcub/hipcub.hpp>
 namespace cub = hipcub;
 #endif
+#include "paddle/fluid/distributed/collective/ProcessGroup.h"
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"
+#endif
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
@@ -411,7 +415,19 @@ void SyncBatchNormGradFunctor(
   }
 
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-  auto *comm = ctx.nccl_comm();
+  int global_gid = 0;
+  ncclComm_t comm = nullptr;
+
+  if (paddle::distributed::ProcessGroupMapFromGid::getInstance()->has(
+          global_gid)) {
+    auto *nccl_pg = static_cast<paddle::distributed::ProcessGroupNCCL *>(
+        paddle::distributed::ProcessGroupMapFromGid::getInstance()->get(
+            global_gid));
+    comm = nccl_pg->NCCLComm(x->place());
+  } else {
+    comm = ctx.nccl_comm();
+  }
+
   if (comm) {
     int dtype = paddle::platform::ToNCCLDataType(
         paddle::framework::TransToProtoVarType(scale.dtype()));
@@ -424,6 +440,7 @@ void SyncBatchNormGradFunctor(
                                   ncclSum,
                                   comm,
                                   stream));
+    VLOG(3) << "Sync result using all reduce";
   }
 #endif
 
-- 
GitLab
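
For reference, the `Barrier` hunk replaces the high-level `full(...)` call with direct `DenseTensor` construction, which is what allows the collective targets to depend only on `dense_tensor`. The construction idiom in isolation (a sketch; `MakeBarrierTensor` is a hypothetical name, not part of the patch):

    // Sketch: a 1-element FP32 DenseTensor on `place` without the phi full()
    // API, mirroring the Barrier hunk above.
    #include "paddle/phi/api/lib/utils/allocator.h"
    #include "paddle/phi/core/dense_tensor.h"

    phi::DenseTensor MakeBarrierTensor(const phi::Place& place) {
      phi::DenseTensorMeta meta(phi::DataType::FLOAT32, phi::DDim({1}));
      paddle::experimental::DefaultAllocator allocator(place);
      // The constructor allocates the holder through `allocator` immediately,
      // so the allocator does not need to outlive the tensor.
      return phi::DenseTensor(&allocator, meta);
    }
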