diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt
index 116b3db40d6423d87c13ffbb24d748240a0d6851..b6b5ddc179fd57f39612126b28e897e4bcf55a7c 100644
--- a/paddle/fluid/distributed/collective/CMakeLists.txt
+++ b/paddle/fluid/distributed/collective/CMakeLists.txt
@@ -1,7 +1,7 @@
 cc_library(
   processgroup
   SRCS ProcessGroup.cc
-  DEPS phi_api eager_api)
+  DEPS dense_tensor)
 cc_library(
   eager_reducer
   SRCS reducer.cc
@@ -18,7 +18,8 @@ if(WITH_NCCL OR WITH_RCCL)
   cc_library(
     processgroup_nccl
     SRCS ProcessGroupNCCL.cc NCCLTools.cc Common.cc
-    DEPS place enforce collective_helper device_context phi_api eager_api)
+    DEPS processgroup place enforce collective_helper device_context
+         dense_tensor)
   if(WITH_DISTRIBUTE AND WITH_PSCORE)
     cc_library(
       processgroup_heter
diff --git a/paddle/fluid/distributed/collective/Common.h b/paddle/fluid/distributed/collective/Common.h
index c01bd23fe127b8fedb38bba2a160b2451256bb3f..4c6c42bd86d636ef8838c4e33b23e9ac8e1dec2a 100644
--- a/paddle/fluid/distributed/collective/Common.h
+++ b/paddle/fluid/distributed/collective/Common.h
@@ -15,7 +15,6 @@
 #pragma once
 
 #include "paddle/fluid/platform/place.h"
-#include "paddle/phi/api/include/api.h"
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/core/dense_tensor.h"
 namespace paddle {
diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
index 168548cf9ba06f1584882c0baf78b49f34672d74..4a52ece783909647a32cb6141fcfbd79428ddfab 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
+++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
@@ -18,7 +18,7 @@
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/device/gpu/nccl_helper.h"
 #include "paddle/fluid/platform/place.h"
-#include "paddle/phi/api/include/api.h"
+#include "paddle/phi/api/lib/utils/allocator.h"
 #include "paddle/phi/common/place.h"
 
 DECLARE_bool(nccl_blocking_wait);
@@ -427,9 +427,10 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Barrier(
   platform::CUDADeviceGuard gpuGuard;
   for (auto& place : places) {
     gpuGuard.SetDeviceIndex(place.GetDeviceId());
-    auto dt = full({1}, 0, phi::DataType::FLOAT32, place);
-    barrierTensors.push_back(
-        *std::dynamic_pointer_cast<phi::DenseTensor>(dt.impl()));
+    phi::DenseTensorMeta meta(phi::DataType::FLOAT32, phi::DDim({1}));
+    auto allocator = std::unique_ptr<phi::Allocator>(
+        new paddle::experimental::DefaultAllocator(place));
+    barrierTensors.emplace_back(allocator.get(), meta);
   }
   auto task = ProcessGroupNCCL::AllReduce(barrierTensors, barrierTensors);
   auto nccl_task = dynamic_cast<ProcessGroupNCCL::NCCLTask*>(task.get());
@@ -894,5 +895,15 @@ void ProcessGroupNCCL::GroupEnd() {
   PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
 }
 
+ncclComm_t ProcessGroupNCCL::NCCLComm(const Place& place) const {
+  std::vector<Place> places = {place};
+  const auto& iter = places_to_ncclcomm_.find(GetKeyFromPlaces(places));
+  PADDLE_ENFORCE_NE(iter,
+                    places_to_ncclcomm_.end(),
+                    platform::errors::InvalidArgument(
+                        "Cannot find nccl comm in process group."));
+  return iter->second[0]->GetNcclComm();
+}
+
 }  // namespace distributed
 }  // namespace paddle
diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h
index 5adb6867eb8ef86fd78b474adc6591e5bd839168..dfd07c438110c861519459d79db0cf1b1f439743 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h
+++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h
@@ -157,6 +157,8 @@ class ProcessGroupNCCL : public ProcessGroup {
 
   static void GroupEnd();
 
+  ncclComm_t NCCLComm(const Place& place) const;
+
  protected:
   virtual std::shared_ptr<ProcessGroupNCCL::NCCLTask> CreateTask(
       std::vector<Place> places,
diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt
index ca98e43a50313fcafde7bdfef1f53b472a244f18..d40b6f589c5f1337c0d87f04e90dfeb59f4b2682 100644
--- a/paddle/phi/kernels/CMakeLists.txt
+++ b/paddle/phi/kernels/CMakeLists.txt
@@ -84,6 +84,11 @@ set(COMMON_KERNEL_DEPS
     gpc
     utf8proc)
 
+set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} processgroup)
+if(WITH_NCCL OR WITH_RCCL)
+  set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} processgroup_nccl)
+endif()
+
 copy_if_different(${kernel_declare_file} ${kernel_declare_file_final})
 
 file(GLOB kernel_h "*.h" "selected_rows/*.h" "sparse/*.h" "strings/*.h")
diff --git a/paddle/phi/kernels/gpu/sync_batch_norm_kernel.cu b/paddle/phi/kernels/gpu/sync_batch_norm_kernel.cu
index ed8a8c333442d0f098da8c1935202d5ba5b7956b..1d8d8a6dfacb1714a88a63a1a2f30ab9b83185bc 100644
--- a/paddle/phi/kernels/gpu/sync_batch_norm_kernel.cu
+++ b/paddle/phi/kernels/gpu/sync_batch_norm_kernel.cu
@@ -100,7 +100,19 @@ void SyncBatchNormKernel(const Context &ctx,
   }
 
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-  auto *comm = ctx.nccl_comm();
+  int global_gid = 0;
+  ncclComm_t comm = nullptr;
+
+  if (paddle::distributed::ProcessGroupMapFromGid::getInstance()->has(
+          global_gid)) {
+    auto *nccl_pg = static_cast<paddle::distributed::ProcessGroupNCCL *>(
+        paddle::distributed::ProcessGroupMapFromGid::getInstance()->get(
+            global_gid));
+    comm = nccl_pg->NCCLComm(x.place());
+  } else {
+    comm = ctx.nccl_comm();
+  }
+
   if (comm) {
     int dtype = paddle::platform::ToNCCLDataType(
         paddle::framework::TransToProtoVarType(mean_out->dtype()));
@@ -113,6 +125,7 @@ void SyncBatchNormKernel(const Context &ctx,
         ncclSum,
         comm,
         stream));
+    VLOG(3) << "Sync result using all reduce";
   }
 #endif
 
diff --git a/paddle/phi/kernels/gpu/sync_batch_norm_utils.h b/paddle/phi/kernels/gpu/sync_batch_norm_utils.h
index 37b9bca73a857beae48ca9c9c87f5c07a0e51a26..27c3cd4ad44f779774f15af6becf4256151a37ca 100644
--- a/paddle/phi/kernels/gpu/sync_batch_norm_utils.h
+++ b/paddle/phi/kernels/gpu/sync_batch_norm_utils.h
@@ -26,6 +26,10 @@ limitations under the License. */
 #include <hipcub/hipcub.hpp>
 namespace cub = hipcub;
 #endif
+#include "paddle/fluid/distributed/collective/ProcessGroup.h"
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"
+#endif
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
@@ -411,7 +415,19 @@ void SyncBatchNormGradFunctor(
   }
 
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-  auto *comm = ctx.nccl_comm();
+  int global_gid = 0;
+  ncclComm_t comm = nullptr;
+
+  if (paddle::distributed::ProcessGroupMapFromGid::getInstance()->has(
+          global_gid)) {
+    auto *nccl_pg = static_cast<paddle::distributed::ProcessGroupNCCL *>(
+        paddle::distributed::ProcessGroupMapFromGid::getInstance()->get(
+            global_gid));
+    comm = nccl_pg->NCCLComm(x->place());
+  } else {
+    comm = ctx.nccl_comm();
+  }
+
   if (comm) {
     int dtype = paddle::platform::ToNCCLDataType(
         paddle::framework::TransToProtoVarType(scale.dtype()));
@@ -424,6 +440,7 @@ void SyncBatchNormGradFunctor(
         ncclSum,
         comm,
         stream));
+    VLOG(3) << "Sync result using all reduce";
   }
 #endif
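
Reviewer note (not part of the patch): the forward kernel and SyncBatchNormGradFunctor hunks share one lookup pattern, namely prefer the NCCL communicator owned by the process group registered under gid 0 and fall back to the device context's communicator. A minimal sketch of that pattern follows, using only the APIs introduced above; the helper name SelectSyncBatchNormComm, its signature, and the use of phi::Place for the place argument are assumptions for illustration, not part of this change.

#include "paddle/fluid/distributed/collective/ProcessGroup.h"
#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"
#include "paddle/phi/common/place.h"

// Hypothetical helper: pick the communicator a sync-batch-norm kernel should
// all-reduce on. Prefer the NCCL comm owned by the process group registered
// under `global_gid`; otherwise keep the communicator supplied by the caller
// (in the kernels above, ctx.nccl_comm()).
inline ncclComm_t SelectSyncBatchNormComm(ncclComm_t fallback_comm,
                                          const phi::Place &place,
                                          int global_gid = 0) {
  auto *pg_map = paddle::distributed::ProcessGroupMapFromGid::getInstance();
  if (pg_map->has(global_gid)) {
    // A process group exists for this gid: reuse its communicator so the
    // kernel joins the same NCCL clique as the other collectives.
    auto *nccl_pg = static_cast<paddle::distributed::ProcessGroupNCCL *>(
        pg_map->get(global_gid));
    return nccl_pg->NCCLComm(place);
  }
  // Legacy path: communicator attached to the device context.
  return fallback_comm;
}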