From 40f3f4f00b856b017a522cefada93d63cca32ebf Mon Sep 17 00:00:00 2001
From: Wen Sun <35923278+HermitSun@users.noreply.github.com>
Date: Fri, 16 Dec 2022 22:08:09 +0800
Subject: [PATCH] refactor: rename files (#49117)

---
 .../distributed/collective/CMakeLists.txt     | 12 +++---
 .../distributed/collective/ProcessGroup.h     | 30 ++++++++++++++
 .../distributed/collective/ProcessGroupBKCL.h |  2 +-
 paddle/fluid/distributed/collective/check.cc  |  2 +-
 .../{NCCLTools.cc => nccl_tools.cc}           |  2 +-
 .../collective/{NCCLTools.h => nccl_tools.h}  |  0
 ...cessGroupNCCL.cc => process_group_nccl.cc} |  4 +-
 ...rocessGroupNCCL.h => process_group_nccl.h} |  4 +-
 ...GroupStream.cc => process_group_stream.cc} | 41 ++++++++++++++++++-
 ...ssGroupStream.h => process_group_stream.h} | 16 ++++++++
 .../operators/fused/fused_attention_op.cu     |  2 +-
 .../operators/fused/fused_feedforward_op.cu   |  2 +-
 paddle/fluid/pybind/CMakeLists.txt            |  2 +-
 paddle/fluid/pybind/distributed_py.cc         | 34 +++-------------
 paddle/phi/backends/CMakeLists.txt            |  2 +-
 .../phi/backends/processgroup_comm_utils.cc   |  2 +-
 paddle/phi/kernels/CMakeLists.txt             |  2 +-
 .../phi/kernels/gpu/sync_batch_norm_utils.h   |  2 +-
 18 files changed, 112 insertions(+), 49 deletions(-)
 rename paddle/fluid/distributed/collective/{NCCLTools.cc => nccl_tools.cc} (96%)
 rename paddle/fluid/distributed/collective/{NCCLTools.h => nccl_tools.h} (100%)
 rename paddle/fluid/distributed/collective/{ProcessGroupNCCL.cc => process_group_nccl.cc} (99%)
 rename paddle/fluid/distributed/collective/{ProcessGroupNCCL.h => process_group_nccl.h} (98%)
 rename paddle/fluid/distributed/collective/{ProcessGroupStream.cc => process_group_stream.cc} (86%)
 rename paddle/fluid/distributed/collective/{ProcessGroupStream.h => process_group_stream.h} (90%)

diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt
index 85efa52c319..de9059228a6 100644
--- a/paddle/fluid/distributed/collective/CMakeLists.txt
+++ b/paddle/fluid/distributed/collective/CMakeLists.txt
@@ -3,13 +3,13 @@ cc_library(
   SRCS ProcessGroup.cc
   DEPS dense_tensor)
 cc_library(
-  processgroup_stream
-  SRCS ProcessGroupStream.cc
+  process_group_stream
+  SRCS process_group_stream.cc
   DEPS dense_tensor)
 cc_library(
   eager_reducer
   SRCS reducer.cc
-  DEPS eager_api processgroup processgroup_stream phi_api string_helper)
+  DEPS eager_api processgroup process_group_stream phi_api string_helper)
 
 if(WITH_DISTRIBUTE)
   cc_library(
@@ -20,10 +20,10 @@ endif()
 
 if(WITH_NCCL OR WITH_RCCL)
   cc_library(
-    processgroup_nccl
-    SRCS ProcessGroupNCCL.cc NCCLTools.cc Common.cc check.cc
+    process_group_nccl
+    SRCS process_group_nccl.cc nccl_tools.cc Common.cc check.cc
     DEPS processgroup
-         processgroup_stream
+         process_group_stream
          place
          enforce
          collective_helper
diff --git a/paddle/fluid/distributed/collective/ProcessGroup.h b/paddle/fluid/distributed/collective/ProcessGroup.h
index 7abecd36e3d..3792e8bc835 100644
--- a/paddle/fluid/distributed/collective/ProcessGroup.h
+++ b/paddle/fluid/distributed/collective/ProcessGroup.h
@@ -97,6 +97,17 @@ class ProcessGroup {
         GetBackendName()));
   }
 
+  virtual std::shared_ptr<ProcessGroup::Task> AllGather(
+      phi::DenseTensor* out_tensor,
+      const phi::DenseTensor& in_tensor,
+      bool sync_op) {
+    return AllGather(out_tensor,
+                     in_tensor,
+                     /*offset*/ 0,
+                     /*numel*/ -1,  // -1 indicates the whole tensor
+                     sync_op);
+  }
+
   virtual std::shared_ptr<ProcessGroup::Task> AllGather(
       phi::DenseTensor* out_tensor,
       const phi::DenseTensor& in_tensor,
@@ -175,6 +186,16 @@ class ProcessGroup {
         GetBackendName()));
   }
 
+  virtual std::shared_ptr<ProcessGroup::Task> Recv(phi::DenseTensor* tensor,
+                                                   int src_rank,
+                                                   bool sync_op) {
+    return Recv(tensor,
+                src_rank,
+                /*offset*/ 0,
+                /*numel*/ -1,  // -1 indicates the whole tensor
+                sync_op);
+  }
+
   virtual std::shared_ptr<ProcessGroup::Task> Recv(phi::DenseTensor* tensor,
                                                    int src_rank,
                                                    int64_t offset,
@@ -185,6 +206,15 @@ class ProcessGroup {
         GetBackendName()));
   }
 
+  virtual std::shared_ptr<ProcessGroup::Task> Send(
+      const phi::DenseTensor& tensor, int dst_rank, bool sync_op) {
+    return Send(tensor,
+                dst_rank,
+                /*offset*/ 0,
+                /*numel*/ -1,  // -1 indicates the whole tensor
+                sync_op);
+  }
+
   virtual std::shared_ptr<ProcessGroup::Task> Send(
       const phi::DenseTensor& tensor,
       int dst_rank,
diff --git a/paddle/fluid/distributed/collective/ProcessGroupBKCL.h b/paddle/fluid/distributed/collective/ProcessGroupBKCL.h
index 79d97609d92..822f690cb80 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupBKCL.h
+++ b/paddle/fluid/distributed/collective/ProcessGroupBKCL.h
@@ -20,7 +20,7 @@
 #include
 #include
 
-#include "paddle/fluid/distributed/collective/ProcessGroupStream.h"
+#include "paddle/fluid/distributed/collective/process_group_stream.h"
 #include "paddle/fluid/distributed/store/store.h"
 #include "paddle/fluid/platform/device/xpu/xpu_header.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/distributed/collective/check.cc b/paddle/fluid/distributed/collective/check.cc
index 9a2ca064024..151d7f35749 100644
--- a/paddle/fluid/distributed/collective/check.cc
+++ b/paddle/fluid/distributed/collective/check.cc
@@ -14,7 +14,7 @@
 
 #include "paddle/fluid/distributed/collective/check.h"
 
-#include "paddle/fluid/distributed/collective/NCCLTools.h"
+#include "paddle/fluid/distributed/collective/nccl_tools.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/phi/core/dense_tensor.h"
diff --git a/paddle/fluid/distributed/collective/NCCLTools.cc b/paddle/fluid/distributed/collective/nccl_tools.cc
similarity index 96%
rename from paddle/fluid/distributed/collective/NCCLTools.cc
rename to paddle/fluid/distributed/collective/nccl_tools.cc
index 47c0f547ee7..ffb51d706d9 100644
--- a/paddle/fluid/distributed/collective/NCCLTools.cc
+++ b/paddle/fluid/distributed/collective/nccl_tools.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/distributed/collective/NCCLTools.h"
+#include "paddle/fluid/distributed/collective/nccl_tools.h"
 
 #include "paddle/fluid/platform/enforce.h"
 
diff --git a/paddle/fluid/distributed/collective/NCCLTools.h b/paddle/fluid/distributed/collective/nccl_tools.h
similarity index 100%
rename from paddle/fluid/distributed/collective/NCCLTools.h
rename to paddle/fluid/distributed/collective/nccl_tools.h
diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/process_group_nccl.cc
similarity index 99%
rename from paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
rename to paddle/fluid/distributed/collective/process_group_nccl.cc
index 13de2625a6e..0859708f92c 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
+++ b/paddle/fluid/distributed/collective/process_group_nccl.cc
@@ -12,11 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h" +#include "paddle/fluid/distributed/collective/process_group_nccl.h" #include "paddle/fluid/distributed/collective/Common.h" -#include "paddle/fluid/distributed/collective/NCCLTools.h" #include "paddle/fluid/distributed/collective/check.h" +#include "paddle/fluid/distributed/collective/nccl_tools.h" #include "paddle/fluid/distributed/collective/utils.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/process_group_nccl.h similarity index 98% rename from paddle/fluid/distributed/collective/ProcessGroupNCCL.h rename to paddle/fluid/distributed/collective/process_group_nccl.h index 3ce77297f56..816a0d2ec90 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h +++ b/paddle/fluid/distributed/collective/process_group_nccl.h @@ -20,7 +20,7 @@ #include #include -#include "paddle/fluid/distributed/collective/ProcessGroupStream.h" +#include "paddle/fluid/distributed/collective/process_group_stream.h" #include "paddle/fluid/distributed/store/store.h" #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/device_event.h" @@ -29,7 +29,7 @@ #include "paddle/phi/core/device_context.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/distributed/collective/NCCLTools.h" +#include "paddle/fluid/distributed/collective/nccl_tools.h" #endif #ifdef PADDLE_WITH_RCCL diff --git a/paddle/fluid/distributed/collective/ProcessGroupStream.cc b/paddle/fluid/distributed/collective/process_group_stream.cc similarity index 86% rename from paddle/fluid/distributed/collective/ProcessGroupStream.cc rename to paddle/fluid/distributed/collective/process_group_stream.cc index e1ee425f3f8..2b69cf51fe6 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupStream.cc +++ b/paddle/fluid/distributed/collective/process_group_stream.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/distributed/collective/ProcessGroupStream.h" +#include "paddle/fluid/distributed/collective/process_group_stream.h" namespace paddle { namespace distributed { @@ -40,6 +40,19 @@ std::shared_ptr ProcessGroupStream::AllGather( /*use_calc_stream*/ false); } +std::shared_ptr ProcessGroupStream::AllGather( + phi::DenseTensor* out_tensor, + const phi::DenseTensor& in_tensor, + bool sync_op, + bool use_calc_stream) { + return AllGather(out_tensor, + in_tensor, + /*offset*/ 0, + /*numel*/ -1, // -1 indicates the whole tensor + sync_op, + use_calc_stream); +} + std::shared_ptr ProcessGroupStream::AllGather( phi::DenseTensor* out_tensor, const phi::DenseTensor& in_tensor, @@ -200,6 +213,19 @@ std::shared_ptr ProcessGroupStream::Recv( /*use_calc_stream*/ false); } +std::shared_ptr ProcessGroupStream::Recv( + phi::DenseTensor* tensor, + int src_rank, + bool sync_op, + bool use_calc_stream) { + return Recv(tensor, + src_rank, + /*offset*/ 0, + /*numel*/ -1, // -1 indicates sending the whole tensor + sync_op, + use_calc_stream); +} + std::shared_ptr ProcessGroupStream::Recv( phi::DenseTensor* tensor, int src_rank, @@ -225,6 +251,19 @@ std::shared_ptr ProcessGroupStream::Send( /*use_calc_stream*/ false); } +std::shared_ptr ProcessGroupStream::Send( + const phi::DenseTensor& tensor, + int dst_rank, + bool sync_op, + bool use_calc_stream) { + return Send(tensor, + dst_rank, + /*offset*/ 0, + /*numel*/ -1, // -1 indicates receiving the whole tensor + sync_op, + use_calc_stream); +} + std::shared_ptr ProcessGroupStream::Send( const phi::DenseTensor& tensor, int dst_rank, diff --git a/paddle/fluid/distributed/collective/ProcessGroupStream.h b/paddle/fluid/distributed/collective/process_group_stream.h similarity index 90% rename from paddle/fluid/distributed/collective/ProcessGroupStream.h rename to paddle/fluid/distributed/collective/process_group_stream.h index 4ad75be3658..d48ff0f24f8 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupStream.h +++ b/paddle/fluid/distributed/collective/process_group_stream.h @@ -69,6 +69,12 @@ class ProcessGroupStream : public ProcessGroup { int64_t numel, bool sync_op) override; + virtual std::shared_ptr AllGather( + phi::DenseTensor* out_tensor, + const phi::DenseTensor& in_tensor, + bool sync_op, + bool use_calc_stream); + virtual std::shared_ptr AllGather( phi::DenseTensor* out_tensor, const phi::DenseTensor& in_tensor, @@ -161,6 +167,11 @@ class ProcessGroupStream : public ProcessGroup { int64_t numel, bool sync_op) override; + virtual std::shared_ptr Recv(phi::DenseTensor* tensor, + int src_rank, + bool sync_op, + bool use_calc_stream); + virtual std::shared_ptr Recv(phi::DenseTensor* tensor, int src_rank, int64_t offset, @@ -174,6 +185,11 @@ class ProcessGroupStream : public ProcessGroup { int64_t numel, bool sync_op) override; + std::shared_ptr Send(const phi::DenseTensor& tensor, + int dst_rank, + bool sync_op, + bool use_calc_stream); + virtual std::shared_ptr Send( const phi::DenseTensor& tensor, int dst_rank, diff --git a/paddle/fluid/operators/fused/fused_attention_op.cu b/paddle/fluid/operators/fused/fused_attention_op.cu index 67ee877f72c..559a2afb85f 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_attention_op.cu @@ -30,7 +30,7 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/math_function.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h" +#include "paddle/fluid/distributed/collective/process_group_nccl.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cu b/paddle/fluid/operators/fused/fused_feedforward_op.cu index 28a9cb167e0..925ec7d2060 100644 --- a/paddle/fluid/operators/fused/fused_feedforward_op.cu +++ b/paddle/fluid/operators/fused/fused_feedforward_op.cu @@ -23,7 +23,7 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/elementwise_functor.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h" +#include "paddle/fluid/distributed/collective/process_group_nccl.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 973ef8a4a79..37e085b82bc 100755 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -157,7 +157,7 @@ endif() if(WITH_PYTHON) set(PYBIND_DEPS ${PYBIND_DEPS} processgroup eager_reducer) if(WITH_NCCL OR WITH_RCCL) - set(PYBIND_DEPS ${PYBIND_DEPS} processgroup_nccl) + set(PYBIND_DEPS ${PYBIND_DEPS} process_group_nccl) endif() if(WITH_XPU_BKCL) set(PYBIND_DEPS ${PYBIND_DEPS} processgroup_bkcl) diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index c5d03ce8853..e9d59132d0e 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -22,8 +22,8 @@ limitations under the License. */ #endif #include "paddle/fluid/distributed/collective/ProcessGroup.h" -#include "paddle/fluid/distributed/collective/ProcessGroupStream.h" #include "paddle/fluid/distributed/collective/Types.h" +#include "paddle/fluid/distributed/collective/process_group_stream.h" #include "paddle/fluid/distributed/collective/reducer.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/tensor.h" @@ -34,7 +34,7 @@ limitations under the License. 
*/ #include "paddle/phi/api/all.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h" +#include "paddle/fluid/distributed/collective/process_group_nccl.h" #endif #if defined(PADDLE_WITH_MPI) @@ -169,9 +169,7 @@ void BindDistributed(py::module *m) { auto p_dense = std::dynamic_pointer_cast(tensor.impl()); auto out_dense = *p_dense; - // numel == -1 indicates sending the whole tensor - return self.Send( - out_dense, dst, /*offset*/ 0, /*numel*/ -1, sync_op); + return self.Send(out_dense, dst, sync_op); }, py::arg("tensor"), py::arg("dst"), @@ -215,9 +213,7 @@ void BindDistributed(py::module *m) { auto p_dense = std::dynamic_pointer_cast(tensor.impl()); auto *in_dense = p_dense.get(); - // numel == -1 indicates receiving the whole tensor - return self.Recv( - in_dense, src, /*offset*/ 0, /*numel*/ -1, sync_op); + return self.Recv(in_dense, src, sync_op); }, py::arg("tensor"), py::arg("src"), @@ -270,11 +266,7 @@ void BindDistributed(py::module *m) { auto in_dense = *p_in_tensor; auto *dev_ctx = self.GetDeviceContext(in_tensor.place()); - auto task = self.AllGather(out_dense, - in_dense, - /*offset*/ 0, - /*numel*/ -1, - sync_op); + auto task = self.AllGather(out_dense, in_dense, sync_op); SplitTensor(*dev_ctx, *out_dense, &out_tensor_list); task->UpdateWaitChain(*dev_ctx); return task; @@ -300,11 +292,7 @@ void BindDistributed(py::module *m) { in_tensor.impl()); auto in_dense = *p_in_tensor; - return self.AllGather(out_dense, - in_dense, - /*offset*/ 0, - /*numel*/ -1, - sync_op); + return self.AllGather(out_dense, in_dense, sync_op); }, py::arg("out"), py::arg("in"), @@ -771,8 +759,6 @@ void BindDistributed(py::module *m) { auto *dev_ctx = self.GetDeviceContext(in_tensor.place(), true); auto task = self.AllGather(out_dense, in_dense, - /*offset*/ 0, - /*numel*/ -1, /*sync_op*/ true, /*use_calc_stream*/ true); SplitTensor(*dev_ctx, *out_dense, &out_tensor_list); @@ -799,8 +785,6 @@ void BindDistributed(py::module *m) { return self.AllGather(out_dense, in_dense, - /*offset*/ 0, - /*numel*/ -1, /*sync_op*/ true, /*use_calc_stream*/ true); }, @@ -1127,11 +1111,8 @@ void BindDistributed(py::module *m) { auto p_dense = std::dynamic_pointer_cast(tensor.impl()); auto out_dense = *p_dense; - // numel == -1 indicates sending the whole tensor return self.Send(out_dense, dst, - /*offset*/ 0, - /*numel*/ -1, /*sync_op*/ true, /*use_calc_stream*/ true); }, @@ -1177,11 +1158,8 @@ void BindDistributed(py::module *m) { auto p_dense = std::dynamic_pointer_cast(tensor.impl()); auto *in_dense = p_dense.get(); - // numel == -1 indicates receiving the whole tensor return self.Recv(in_dense, src, - /*offset*/ 0, - /*numel*/ -1, /*sync_op*/ true, /*use_calc_stream*/ true); }, diff --git a/paddle/phi/backends/CMakeLists.txt b/paddle/phi/backends/CMakeLists.txt index c35bd2bc456..c9e110ae7b8 100644 --- a/paddle/phi/backends/CMakeLists.txt +++ b/paddle/phi/backends/CMakeLists.txt @@ -67,7 +67,7 @@ endif() set(COMM_UTILS_DEPS processgroup) if(WITH_NCCL OR WITH_RCCL) - set(COMM_UTILS_DEPS ${PROCESS_GROUP_UTILS_DEPS} processgroup_nccl) + set(COMM_UTILS_DEPS ${PROCESS_GROUP_UTILS_DEPS} process_group_nccl) endif() if(WITH_CUSTOM_DEVICE) set(COMM_UTILS_DEPS ${PROCESS_GROUP_UTILS_DEPS} processgroup_custom) diff --git a/paddle/phi/backends/processgroup_comm_utils.cc b/paddle/phi/backends/processgroup_comm_utils.cc index 841b88d752e..450c1763871 100644 --- a/paddle/phi/backends/processgroup_comm_utils.cc +++ 
+++ b/paddle/phi/backends/processgroup_comm_utils.cc
@@ -15,7 +15,7 @@
 #include "paddle/fluid/distributed/collective/ProcessGroup.h"
 #include "paddle/phi/backends/c_comm_lib.h"
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"
+#include "paddle/fluid/distributed/collective/process_group_nccl.h"
 #endif
 #if defined(PADDLE_WITH_CUSTOM_DEVICE)
 #include "paddle/fluid/distributed/collective/ProcessGroupCustom.h"
diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt
index 808b18bb02d..abe35f284d6 100644
--- a/paddle/phi/kernels/CMakeLists.txt
+++ b/paddle/phi/kernels/CMakeLists.txt
@@ -80,7 +80,7 @@ set(COMMON_KERNEL_DEPS
 
 set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} processgroup)
 if(WITH_NCCL OR WITH_RCCL)
-  set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} processgroup_nccl)
+  set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} process_group_nccl)
 endif()
 set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} processgroup_comm_utils)
 if(WITH_CUDNN_FRONTEND)
diff --git a/paddle/phi/kernels/gpu/sync_batch_norm_utils.h b/paddle/phi/kernels/gpu/sync_batch_norm_utils.h
index cfb2758e62d..f99da25dec9 100644
--- a/paddle/phi/kernels/gpu/sync_batch_norm_utils.h
+++ b/paddle/phi/kernels/gpu/sync_batch_norm_utils.h
@@ -28,7 +28,7 @@ namespace cub = hipcub;
 #endif
 #include "paddle/fluid/distributed/collective/ProcessGroup.h"
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"
+#include "paddle/fluid/distributed/collective/process_group_nccl.h"
 #endif
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/platform/device/gpu/nccl_helper.h"
-- 
GitLab
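
Note: the AllGather/Send/Recv overloads this patch adds to ProcessGroup.h and
process_group_stream.* all follow the same forwarding-overload pattern: a short
convenience signature supplies offset = 0 and numel = -1 ("the whole tensor")
and delegates to the fully parameterized virtual method, which is what lets the
pybind call sites above shrink to self.Send(out_dense, dst, sync_op). Below is a
minimal, self-contained C++ sketch of that pattern; Group, Tensor, and Task are
hypothetical stand-ins for illustration, not Paddle's ProcessGroup,
phi::DenseTensor, or ProcessGroup::Task.

    #include <cstdint>
    #include <iostream>
    #include <memory>

    struct Tensor {
      int64_t size = 8;  // element count of the toy tensor
    };

    struct Task {};  // stand-in for an async communication handle

    class Group {
     public:
      virtual ~Group() = default;

      // Convenience overload: operate on the whole tensor.
      std::shared_ptr<Task> Send(const Tensor& t, int dst_rank, bool sync_op) {
        return Send(t, dst_rank, /*offset*/ 0, /*numel*/ -1, sync_op);
      }

      // Fully parameterized version; numel == -1 means "the whole tensor".
      virtual std::shared_ptr<Task> Send(const Tensor& t,
                                         int dst_rank,
                                         int64_t offset,
                                         int64_t numel,
                                         bool sync_op) {
        const int64_t n = (numel == -1) ? t.size - offset : numel;
        std::cout << "send " << n << " elements to rank " << dst_rank
                  << (sync_op ? " (sync)" : " (async)") << "\n";
        return std::make_shared<Task>();
      }
    };

    int main() {
      Group g;
      Tensor t;
      g.Send(t, /*dst_rank*/ 1, /*sync_op*/ true);  // forwards offset 0, numel -1
    }

Keeping the -1 sentinel and the forwarding in one place means backends only
override the fully parameterized method, while callers that want the common
whole-tensor case stay terse.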