diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt
index 116b3db40d6423d87c13ffbb24d748240a0d6851..b6b5ddc179fd57f39612126b28e897e4bcf55a7c 100644
--- a/paddle/fluid/distributed/collective/CMakeLists.txt
+++ b/paddle/fluid/distributed/collective/CMakeLists.txt
@@ -1,7 +1,7 @@
 cc_library(
   processgroup
   SRCS ProcessGroup.cc
-  DEPS phi_api eager_api)
+  DEPS dense_tensor)
 cc_library(
   eager_reducer
   SRCS reducer.cc
@@ -18,7 +18,8 @@ if(WITH_NCCL OR WITH_RCCL)
   cc_library(
     processgroup_nccl
     SRCS ProcessGroupNCCL.cc NCCLTools.cc Common.cc
-    DEPS place enforce collective_helper device_context phi_api eager_api)
+    DEPS processgroup place enforce collective_helper device_context
+         dense_tensor)
   if(WITH_DISTRIBUTE AND WITH_PSCORE)
     cc_library(
       processgroup_heter
diff --git a/paddle/fluid/distributed/collective/Common.h b/paddle/fluid/distributed/collective/Common.h
index c01bd23fe127b8fedb38bba2a160b2451256bb3f..4c6c42bd86d636ef8838c4e33b23e9ac8e1dec2a 100644
--- a/paddle/fluid/distributed/collective/Common.h
+++ b/paddle/fluid/distributed/collective/Common.h
@@ -15,7 +15,6 @@
 #pragma once
 
 #include "paddle/fluid/platform/place.h"
-#include "paddle/phi/api/include/api.h"
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/core/dense_tensor.h"
 namespace paddle {
diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
index 168548cf9ba06f1584882c0baf78b49f34672d74..4a52ece783909647a32cb6141fcfbd79428ddfab 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
+++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
@@ -18,7 +18,7 @@
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/device/gpu/nccl_helper.h"
 #include "paddle/fluid/platform/place.h"
-#include "paddle/phi/api/include/api.h"
+#include "paddle/phi/api/lib/utils/allocator.h"
 #include "paddle/phi/common/place.h"
 
 DECLARE_bool(nccl_blocking_wait);
@@ -427,9 +427,10 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Barrier(
   platform::CUDADeviceGuard gpuGuard;
   for (auto& place : places) {
     gpuGuard.SetDeviceIndex(place.GetDeviceId());
-    auto dt = full({1}, 0, phi::DataType::FLOAT32, place);
-    barrierTensors.push_back(
-        *std::dynamic_pointer_cast<phi::DenseTensor>(dt.impl()));
+    phi::DenseTensorMeta meta(phi::DataType::FLOAT32, phi::DDim({1}));
+    auto allocator = std::unique_ptr<phi::Allocator>(
+        new paddle::experimental::DefaultAllocator(place));
+    barrierTensors.emplace_back(allocator.get(), meta);
   }
   auto task = ProcessGroupNCCL::AllReduce(barrierTensors, barrierTensors);
   auto nccl_task = dynamic_cast<ProcessGroupNCCL::NCCLTask*>(task.get());
@@ -894,5 +895,15 @@ void ProcessGroupNCCL::GroupEnd() {
   PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
 }
 
+ncclComm_t ProcessGroupNCCL::NCCLComm(const Place& place) const {
+  std::vector<Place> places = {place};
+  const auto& iter = places_to_ncclcomm_.find(GetKeyFromPlaces(places));
+  PADDLE_ENFORCE_NE(iter,
+                    places_to_ncclcomm_.end(),
+                    platform::errors::InvalidArgument(
+                        "Cannot find nccl comm in process group."));
+  return iter->second[0]->GetNcclComm();
+}
+
 }  // namespace distributed
 }  // namespace paddle
diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h
index 5adb6867eb8ef86fd78b474adc6591e5bd839168..dfd07c438110c861519459d79db0cf1b1f439743 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h
+++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h
@@ -157,6 +157,8 @@ class ProcessGroupNCCL : public ProcessGroup {
 
   static void GroupEnd();
 
+  ncclComm_t NCCLComm(const Place& place) const;
+
  protected:
   virtual std::shared_ptr<ProcessGroupNCCL::NCCLTask> CreateTask(
       std::vector<Place> places,
diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt
index ca98e43a50313fcafde7bdfef1f53b472a244f18..d40b6f589c5f1337c0d87f04e90dfeb59f4b2682 100644
--- a/paddle/phi/kernels/CMakeLists.txt
+++ b/paddle/phi/kernels/CMakeLists.txt
@@ -84,6 +84,11 @@ set(COMMON_KERNEL_DEPS
     gpc
     utf8proc)
 
+set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} processgroup)
+if(WITH_NCCL OR WITH_RCCL)
+  set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} processgroup_nccl)
+endif()
+
 copy_if_different(${kernel_declare_file} ${kernel_declare_file_final})
 
 file(GLOB kernel_h "*.h" "selected_rows/*.h" "sparse/*.h" "strings/*.h")
diff --git a/paddle/phi/kernels/gpu/sync_batch_norm_kernel.cu b/paddle/phi/kernels/gpu/sync_batch_norm_kernel.cu
index ed8a8c333442d0f098da8c1935202d5ba5b7956b..1d8d8a6dfacb1714a88a63a1a2f30ab9b83185bc 100644
--- a/paddle/phi/kernels/gpu/sync_batch_norm_kernel.cu
+++ b/paddle/phi/kernels/gpu/sync_batch_norm_kernel.cu
@@ -100,7 +100,19 @@ void SyncBatchNormKernel(const Context &ctx,
   }
 
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-  auto *comm = ctx.nccl_comm();
+  int global_gid = 0;
+  ncclComm_t comm = nullptr;
+
+  if (paddle::distributed::ProcessGroupMapFromGid::getInstance()->has(
+          global_gid)) {
+    auto *nccl_pg = static_cast<paddle::distributed::ProcessGroupNCCL *>(
+        paddle::distributed::ProcessGroupMapFromGid::getInstance()->get(
+            global_gid));
+    comm = nccl_pg->NCCLComm(x.place());
+  } else {
+    comm = ctx.nccl_comm();
+  }
+
   if (comm) {
     int dtype = paddle::platform::ToNCCLDataType(
         paddle::framework::TransToProtoVarType(mean_out->dtype()));
@@ -113,6 +125,7 @@ void SyncBatchNormKernel(const Context &ctx,
         ncclSum,
         comm,
         stream));
+    VLOG(3) << "Sync result using all reduce";
   }
 #endif
 
diff --git a/paddle/phi/kernels/gpu/sync_batch_norm_utils.h b/paddle/phi/kernels/gpu/sync_batch_norm_utils.h
index 37b9bca73a857beae48ca9c9c87f5c07a0e51a26..27c3cd4ad44f779774f15af6becf4256151a37ca 100644
--- a/paddle/phi/kernels/gpu/sync_batch_norm_utils.h
+++ b/paddle/phi/kernels/gpu/sync_batch_norm_utils.h
@@ -26,6 +26,10 @@ limitations under the License. */
 #include <hipcub/hipcub.hpp>
 namespace cub = hipcub;
 #endif
+#include "paddle/fluid/distributed/collective/ProcessGroup.h"
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"
+#endif
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
@@ -411,7 +415,19 @@ void SyncBatchNormGradFunctor(
   }
 
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-  auto *comm = ctx.nccl_comm();
+  int global_gid = 0;
+  ncclComm_t comm = nullptr;
+
+  if (paddle::distributed::ProcessGroupMapFromGid::getInstance()->has(
+          global_gid)) {
+    auto *nccl_pg = static_cast<paddle::distributed::ProcessGroupNCCL *>(
+        paddle::distributed::ProcessGroupMapFromGid::getInstance()->get(
+            global_gid));
+    comm = nccl_pg->NCCLComm(x->place());
+  } else {
+    comm = ctx.nccl_comm();
+  }
+
   if (comm) {
     int dtype = paddle::platform::ToNCCLDataType(
         paddle::framework::TransToProtoVarType(scale.dtype()));
@@ -424,6 +440,7 @@ void SyncBatchNormGradFunctor(
         ncclSum,
         comm,
         stream));
+    VLOG(3) << "Sync result using all reduce";
   }
 #endif
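
Reviewer note (not part of the patch): the forward kernel and SyncBatchNormGradFunctor hunks share one lookup pattern, namely prefer the NCCL communicator owned by the process group registered under gid 0 and fall back to the device context's communicator. A minimal sketch of that pattern follows, using only the APIs introduced above; the helper name SelectSyncBatchNormComm, its signature, and the use of phi::Place for the place argument are assumptions for illustration, not part of this change.

#include "paddle/fluid/distributed/collective/ProcessGroup.h"
#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"
#include "paddle/phi/common/place.h"

// Hypothetical helper: pick the communicator a sync-batch-norm kernel should
// all-reduce on. Prefer the NCCL comm owned by the process group registered
// under `global_gid`; otherwise keep the communicator supplied by the caller
// (in the kernels above, ctx.nccl_comm()).
inline ncclComm_t SelectSyncBatchNormComm(ncclComm_t fallback_comm,
                                          const phi::Place &place,
                                          int global_gid = 0) {
  auto *pg_map = paddle::distributed::ProcessGroupMapFromGid::getInstance();
  if (pg_map->has(global_gid)) {
    // A process group exists for this gid: reuse its communicator so the
    // kernel joins the same NCCL clique as the other collectives.
    auto *nccl_pg = static_cast<paddle::distributed::ProcessGroupNCCL *>(
        pg_map->get(global_gid));
    return nccl_pg->NCCLComm(place);
  }
  // Legacy path: communicator attached to the device context.
  return fallback_comm;
}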