From 1e96575630b3270d22b50cf774b9cb952f09339e Mon Sep 17 00:00:00 2001
From: LiYuRio <63526175+LiYuRio@users.noreply.github.com>
Date: Fri, 12 Aug 2022 21:56:40 +0800
Subject: [PATCH] fix nccl comm in sync_bn (#45100)

---
 .../distributed/collective/CMakeLists.txt     |  5 +++--
 paddle/fluid/distributed/collective/Common.h  |  1 -
 .../collective/ProcessGroupNCCL.cc            | 19 +++++++++++++++----
 .../distributed/collective/ProcessGroupNCCL.h |  2 ++
 paddle/phi/kernels/CMakeLists.txt             |  5 +++++
 .../phi/kernels/gpu/sync_batch_norm_kernel.cu | 15 ++++++++++++++-
 .../phi/kernels/gpu/sync_batch_norm_utils.h   | 19 ++++++++++++++++++-
 7 files changed, 57 insertions(+), 9 deletions(-)

diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt
index 116b3db40d6..b6b5ddc179f 100644
--- a/paddle/fluid/distributed/collective/CMakeLists.txt
+++ b/paddle/fluid/distributed/collective/CMakeLists.txt
@@ -1,7 +1,7 @@
 cc_library(
   processgroup
   SRCS ProcessGroup.cc
-  DEPS phi_api eager_api)
+  DEPS dense_tensor)
 cc_library(
   eager_reducer
   SRCS reducer.cc
@@ -18,7 +18,8 @@ if(WITH_NCCL OR WITH_RCCL)
   cc_library(
     processgroup_nccl
     SRCS ProcessGroupNCCL.cc NCCLTools.cc Common.cc
-    DEPS place enforce collective_helper device_context phi_api eager_api)
+    DEPS processgroup place enforce collective_helper device_context
+         dense_tensor)
   if(WITH_DISTRIBUTE AND WITH_PSCORE)
     cc_library(
       processgroup_heter
diff --git a/paddle/fluid/distributed/collective/Common.h b/paddle/fluid/distributed/collective/Common.h
index c01bd23fe12..4c6c42bd86d 100644
--- a/paddle/fluid/distributed/collective/Common.h
+++ b/paddle/fluid/distributed/collective/Common.h
@@ -15,7 +15,6 @@
 #pragma once
 
 #include "paddle/fluid/platform/place.h"
-#include "paddle/phi/api/include/api.h"
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/core/dense_tensor.h"
 namespace paddle {
diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
index 168548cf9ba..4a52ece7839 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
+++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
@@ -18,7 +18,7 @@
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/device/gpu/nccl_helper.h"
 #include "paddle/fluid/platform/place.h"
-#include "paddle/phi/api/include/api.h"
+#include "paddle/phi/api/lib/utils/allocator.h"
 #include "paddle/phi/common/place.h"
 
 DECLARE_bool(nccl_blocking_wait);
@@ -427,9 +427,10 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Barrier(
   platform::CUDADeviceGuard gpuGuard;
   for (auto& place : places) {
     gpuGuard.SetDeviceIndex(place.GetDeviceId());
-    auto dt = full({1}, 0, phi::DataType::FLOAT32, place);
-    barrierTensors.push_back(
-        *std::dynamic_pointer_cast<phi::DenseTensor>(dt.impl()));
+    phi::DenseTensorMeta meta(phi::DataType::FLOAT32, phi::DDim({1}));
+    auto allocator = std::unique_ptr<phi::Allocator>(
+        new paddle::experimental::DefaultAllocator(place));
+    barrierTensors.emplace_back(allocator.get(), meta);
   }
   auto task = ProcessGroupNCCL::AllReduce(barrierTensors, barrierTensors);
   auto nccl_task = dynamic_cast<ProcessGroupNCCL::NCCLTask*>(task.get());
@@ -894,5 +895,15 @@ void ProcessGroupNCCL::GroupEnd() {
   PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
 }
 
+ncclComm_t ProcessGroupNCCL::NCCLComm(const Place& place) const {
+  std::vector<Place> places = {place};
+  const auto& iter = places_to_ncclcomm_.find(GetKeyFromPlaces(places));
+  PADDLE_ENFORCE_NE(iter,
+                    places_to_ncclcomm_.end(),
+                    platform::errors::InvalidArgument(
+                        "Cannot find nccl comm in process group."));
+  return iter->second[0]->GetNcclComm();
+}
+
 }  // namespace distributed
 }  // namespace paddle
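
The core addition above is the `NCCLComm(place)` accessor, which exposes the raw communicator a `ProcessGroupNCCL` holds for a given place. A minimal consumption sketch, assuming a group that has already created a communicator for `place` through an earlier collective (the `RunRawAllReduce` wrapper and `pg` handle are illustrative, not part of the patch):

    // Sketch only: hand the group's communicator to a raw NCCL call.
    #include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"
    #include "paddle/fluid/platform/dynload/nccl.h"
    #include "paddle/fluid/platform/enforce.h"

    void RunRawAllReduce(paddle::distributed::ProcessGroupNCCL* pg,
                         const phi::Place& place,
                         float* buf,
                         size_t n,
                         cudaStream_t stream) {
      // Throws InvalidArgument if the group has not yet built a communicator
      // for this place (communicators are created lazily by the first
      // collective that runs on it).
      ncclComm_t comm = pg->NCCLComm(place);
      // In-place all-reduce on the caller-provided stream.
      PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::ncclAllReduce(
          buf, buf, n, ncclFloat, ncclSum, comm, stream));
    }

Returning the raw `ncclComm_t` keeps consumers decoupled from the Python-facing phi API, which is also why the `phi_api`/`eager_api` dependencies are dropped from the collective CMake targets above.
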
diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h
index 5adb6867eb8..dfd07c43811 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h
+++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h
@@ -157,6 +157,8 @@ class ProcessGroupNCCL : public ProcessGroup {
 
   static void GroupEnd();
 
+  ncclComm_t NCCLComm(const Place& place) const;
+
  protected:
   virtual std::shared_ptr<ProcessGroupNCCL::NCCLTask> CreateTask(
       std::vector<Place> places,
diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt
index ca98e43a503..d40b6f589c5 100644
--- a/paddle/phi/kernels/CMakeLists.txt
+++ b/paddle/phi/kernels/CMakeLists.txt
@@ -84,6 +84,11 @@ set(COMMON_KERNEL_DEPS
     gpc
     utf8proc)
 
+set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} processgroup)
+if(WITH_NCCL OR WITH_RCCL)
+  set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} processgroup_nccl)
+endif()
+
 copy_if_different(${kernel_declare_file} ${kernel_declare_file_final})
 
 file(GLOB kernel_h "*.h" "selected_rows/*.h" "sparse/*.h" "strings/*.h")
diff --git a/paddle/phi/kernels/gpu/sync_batch_norm_kernel.cu b/paddle/phi/kernels/gpu/sync_batch_norm_kernel.cu
index ed8a8c33344..1d8d8a6dfac 100644
--- a/paddle/phi/kernels/gpu/sync_batch_norm_kernel.cu
+++ b/paddle/phi/kernels/gpu/sync_batch_norm_kernel.cu
@@ -100,7 +100,19 @@ void SyncBatchNormKernel(const Context &ctx,
   }
 
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-  auto *comm = ctx.nccl_comm();
+  int global_gid = 0;
+  ncclComm_t comm = nullptr;
+
+  if (paddle::distributed::ProcessGroupMapFromGid::getInstance()->has(
+          global_gid)) {
+    auto *nccl_pg = static_cast<paddle::distributed::ProcessGroupNCCL *>(
+        paddle::distributed::ProcessGroupMapFromGid::getInstance()->get(
+            global_gid));
+    comm = nccl_pg->NCCLComm(x.place());
+  } else {
+    comm = ctx.nccl_comm();
+  }
+
   if (comm) {
     int dtype = paddle::platform::ToNCCLDataType(
         paddle::framework::TransToProtoVarType(mean_out->dtype()));
@@ -113,6 +125,7 @@ void SyncBatchNormKernel(const Context &ctx,
                                   ncclSum,
                                   comm,
                                   stream));
+    VLOG(3) << "Sync result using all reduce";
   }
 #endif
 
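
The forward kernel above and the grad functor below resolve the communicator the same way: if a process group is registered under `global_gid` 0 (the default group created by the dynamic-graph distributed runtime), use its communicator; otherwise fall back to the one attached to the device context, so the legacy execution path is unaffected. Distilled into a standalone helper, the pattern is (a sketch; `SelectNcclComm` is a hypothetical name, not part of the patch):

    // Sketch of the communicator-selection pattern introduced by this patch.
    #include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"

    template <typename Context>
    ncclComm_t SelectNcclComm(const Context& ctx, const phi::Place& place) {
      auto* pg_map = paddle::distributed::ProcessGroupMapFromGid::getInstance();
      const int global_gid = 0;  // id of the default (global) process group
      if (pg_map->has(global_gid)) {
        // New dynamic-graph path: communicator owned by ProcessGroupNCCL.
        auto* nccl_pg = static_cast<paddle::distributed::ProcessGroupNCCL*>(
            pg_map->get(global_gid));
        return nccl_pg->NCCLComm(place);
      }
      // Legacy path: communicator bound to the device context.
      return ctx.nccl_comm();
    }
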
diff --git a/paddle/phi/kernels/gpu/sync_batch_norm_utils.h b/paddle/phi/kernels/gpu/sync_batch_norm_utils.h
index 37b9bca73a8..27c3cd4ad44 100644
--- a/paddle/phi/kernels/gpu/sync_batch_norm_utils.h
+++ b/paddle/phi/kernels/gpu/sync_batch_norm_utils.h
@@ -26,6 +26,10 @@ limitations under the License. */
 #include <hipcub/hipcub.hpp>
 namespace cub = hipcub;
 #endif
+#include "paddle/fluid/distributed/collective/ProcessGroup.h"
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"
+#endif
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
@@ -411,7 +415,19 @@ void SyncBatchNormGradFunctor(
   }
 
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-  auto *comm = ctx.nccl_comm();
+  int global_gid = 0;
+  ncclComm_t comm = nullptr;
+
+  if (paddle::distributed::ProcessGroupMapFromGid::getInstance()->has(
+          global_gid)) {
+    auto *nccl_pg = static_cast<paddle::distributed::ProcessGroupNCCL *>(
+        paddle::distributed::ProcessGroupMapFromGid::getInstance()->get(
+            global_gid));
+    comm = nccl_pg->NCCLComm(x->place());
+  } else {
+    comm = ctx.nccl_comm();
+  }
+
   if (comm) {
     int dtype = paddle::platform::ToNCCLDataType(
         paddle::framework::TransToProtoVarType(scale.dtype()));
@@ -424,6 +440,7 @@ void SyncBatchNormGradFunctor(
                                   ncclSum,
                                   comm,
                                   stream));
+    VLOG(3) << "Sync result using all reduce";
   }
 #endif
 
-- 
GitLab
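
For reference, the `Barrier` hunk replaces the high-level `full(...)` call with direct `DenseTensor` construction, which is what allows the collective targets to depend only on `dense_tensor`. The construction idiom in isolation (a sketch; `MakeBarrierTensor` is a hypothetical name, not part of the patch):

    // Sketch: a 1-element FP32 DenseTensor on `place` without the phi full()
    // API, mirroring the Barrier hunk above.
    #include "paddle/phi/api/lib/utils/allocator.h"
    #include "paddle/phi/core/dense_tensor.h"

    phi::DenseTensor MakeBarrierTensor(const phi::Place& place) {
      phi::DenseTensorMeta meta(phi::DataType::FLOAT32, phi::DDim({1}));
      paddle::experimental::DefaultAllocator allocator(place);
      // The constructor allocates the holder through `allocator` immediately,
      // so the allocator does not need to outlive the tensor.
      return phi::DenseTensor(&allocator, meta);
    }
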