diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt index 85efa52c3196a7b1e0684afa7d870512935012fe..de9059228a61bd06a60093595987a63248b089cd 100644 --- a/paddle/fluid/distributed/collective/CMakeLists.txt +++ b/paddle/fluid/distributed/collective/CMakeLists.txt @@ -3,13 +3,13 @@ cc_library( SRCS ProcessGroup.cc DEPS dense_tensor) cc_library( - processgroup_stream - SRCS ProcessGroupStream.cc + process_group_stream + SRCS process_group_stream.cc DEPS dense_tensor) cc_library( eager_reducer SRCS reducer.cc - DEPS eager_api processgroup processgroup_stream phi_api string_helper) + DEPS eager_api processgroup process_group_stream phi_api string_helper) if(WITH_DISTRIBUTE) cc_library( @@ -20,10 +20,10 @@ endif() if(WITH_NCCL OR WITH_RCCL) cc_library( - processgroup_nccl - SRCS ProcessGroupNCCL.cc NCCLTools.cc Common.cc check.cc + process_group_nccl + SRCS process_group_nccl.cc nccl_tools.cc Common.cc check.cc DEPS processgroup - processgroup_stream + process_group_stream place enforce collective_helper diff --git a/paddle/fluid/distributed/collective/ProcessGroup.h b/paddle/fluid/distributed/collective/ProcessGroup.h index 7abecd36e3d00f5f5e27c627764dbdcea6a3d5ea..3792e8bc835466d97add6aafabc22cc1c38f41df 100644 --- a/paddle/fluid/distributed/collective/ProcessGroup.h +++ b/paddle/fluid/distributed/collective/ProcessGroup.h @@ -97,6 +97,17 @@ class ProcessGroup { GetBackendName())); } + virtual std::shared_ptr AllGather( + phi::DenseTensor* out_tensor, + const phi::DenseTensor& in_tensor, + bool sync_op) { + return AllGather(out_tensor, + in_tensor, + /*offset*/ 0, + /*numel*/ -1, // -1 indicates the whole tensor + sync_op); + } + virtual std::shared_ptr AllGather( phi::DenseTensor* out_tensor, const phi::DenseTensor& in_tensor, @@ -175,6 +186,16 @@ class ProcessGroup { GetBackendName())); } + virtual std::shared_ptr Recv(phi::DenseTensor* tensor, + int src_rank, + bool sync_op) { + return Recv(tensor, + 
src_rank, + /*offset*/ 0, + /*numel*/ -1, // -1 indicates the whole tensor + sync_op); + } + virtual std::shared_ptr Recv(phi::DenseTensor* tensor, int src_rank, int64_t offset, @@ -185,6 +206,15 @@ class ProcessGroup { GetBackendName())); } + virtual std::shared_ptr Send( + const phi::DenseTensor& tensor, int dst_rank, bool sync_op) { + return Send(tensor, + dst_rank, + /*offset*/ 0, + /*numel*/ -1, // -1 indicates the whole tensor + sync_op); + } + virtual std::shared_ptr Send( const phi::DenseTensor& tensor, int dst_rank, diff --git a/paddle/fluid/distributed/collective/ProcessGroupBKCL.h b/paddle/fluid/distributed/collective/ProcessGroupBKCL.h index 79d97609d9274e86349c1e06e8166112ecaca071..822f690cb802f8fd2e864c109756f54b09f3fa33 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupBKCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupBKCL.h @@ -20,7 +20,7 @@ #include #include -#include "paddle/fluid/distributed/collective/ProcessGroupStream.h" +#include "paddle/fluid/distributed/collective/process_group_stream.h" #include "paddle/fluid/distributed/store/store.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/distributed/collective/check.cc b/paddle/fluid/distributed/collective/check.cc index 9a2ca064024f4cfc37723e06fd34b9fe7bc35ac4..151d7f3574919d1c0e7a2a7a7e45a7b748ce4905 100644 --- a/paddle/fluid/distributed/collective/check.cc +++ b/paddle/fluid/distributed/collective/check.cc @@ -14,7 +14,7 @@ #include "paddle/fluid/distributed/collective/check.h" -#include "paddle/fluid/distributed/collective/NCCLTools.h" +#include "paddle/fluid/distributed/collective/nccl_tools.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" #include "paddle/phi/core/dense_tensor.h" diff --git a/paddle/fluid/distributed/collective/NCCLTools.cc b/paddle/fluid/distributed/collective/nccl_tools.cc similarity index 96% rename from 
paddle/fluid/distributed/collective/NCCLTools.cc rename to paddle/fluid/distributed/collective/nccl_tools.cc index 47c0f547ee79eaf8a6cc871640e899a2beb69371..ffb51d706d9ef04c116c27230a05554b21cbd931 100644 --- a/paddle/fluid/distributed/collective/NCCLTools.cc +++ b/paddle/fluid/distributed/collective/nccl_tools.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/distributed/collective/NCCLTools.h" +#include "paddle/fluid/distributed/collective/nccl_tools.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/distributed/collective/NCCLTools.h b/paddle/fluid/distributed/collective/nccl_tools.h similarity index 100% rename from paddle/fluid/distributed/collective/NCCLTools.h rename to paddle/fluid/distributed/collective/nccl_tools.h diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/process_group_nccl.cc similarity index 99% rename from paddle/fluid/distributed/collective/ProcessGroupNCCL.cc rename to paddle/fluid/distributed/collective/process_group_nccl.cc index 13de2625a6eeea292abde64eeccec75edaca983a..0859708f92cf4be5337175421e42e1625f0df15f 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/process_group_nccl.cc @@ -12,11 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h" +#include "paddle/fluid/distributed/collective/process_group_nccl.h" #include "paddle/fluid/distributed/collective/Common.h" -#include "paddle/fluid/distributed/collective/NCCLTools.h" #include "paddle/fluid/distributed/collective/check.h" +#include "paddle/fluid/distributed/collective/nccl_tools.h" #include "paddle/fluid/distributed/collective/utils.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/process_group_nccl.h similarity index 98% rename from paddle/fluid/distributed/collective/ProcessGroupNCCL.h rename to paddle/fluid/distributed/collective/process_group_nccl.h index 3ce77297f56f187472ed58963c3f75b9080f05f1..816a0d2ec90b65d8178fd5808c28ea61d5223c44 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h +++ b/paddle/fluid/distributed/collective/process_group_nccl.h @@ -20,7 +20,7 @@ #include #include -#include "paddle/fluid/distributed/collective/ProcessGroupStream.h" +#include "paddle/fluid/distributed/collective/process_group_stream.h" #include "paddle/fluid/distributed/store/store.h" #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/device_event.h" @@ -29,7 +29,7 @@ #include "paddle/phi/core/device_context.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/distributed/collective/NCCLTools.h" +#include "paddle/fluid/distributed/collective/nccl_tools.h" #endif #ifdef PADDLE_WITH_RCCL diff --git a/paddle/fluid/distributed/collective/ProcessGroupStream.cc b/paddle/fluid/distributed/collective/process_group_stream.cc similarity index 86% rename from paddle/fluid/distributed/collective/ProcessGroupStream.cc rename to paddle/fluid/distributed/collective/process_group_stream.cc index 
e1ee425f3f8888da27c966b6ef81058294e352db..2b69cf51fe630d91cb2d7ef653c637311d52b95b 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupStream.cc +++ b/paddle/fluid/distributed/collective/process_group_stream.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/distributed/collective/ProcessGroupStream.h" +#include "paddle/fluid/distributed/collective/process_group_stream.h" namespace paddle { namespace distributed { @@ -40,6 +40,19 @@ std::shared_ptr ProcessGroupStream::AllGather( /*use_calc_stream*/ false); } +std::shared_ptr ProcessGroupStream::AllGather( + phi::DenseTensor* out_tensor, + const phi::DenseTensor& in_tensor, + bool sync_op, + bool use_calc_stream) { + return AllGather(out_tensor, + in_tensor, + /*offset*/ 0, + /*numel*/ -1, // -1 indicates the whole tensor + sync_op, + use_calc_stream); +} + std::shared_ptr ProcessGroupStream::AllGather( phi::DenseTensor* out_tensor, const phi::DenseTensor& in_tensor, @@ -200,6 +213,19 @@ std::shared_ptr ProcessGroupStream::Recv( /*use_calc_stream*/ false); } +std::shared_ptr ProcessGroupStream::Recv( + phi::DenseTensor* tensor, + int src_rank, + bool sync_op, + bool use_calc_stream) { + return Recv(tensor, + src_rank, + /*offset*/ 0, + /*numel*/ -1, // -1 indicates receiving the whole tensor + sync_op, + use_calc_stream); +} + std::shared_ptr ProcessGroupStream::Recv( phi::DenseTensor* tensor, int src_rank, @@ -225,6 +251,19 @@ std::shared_ptr ProcessGroupStream::Send( /*use_calc_stream*/ false); } +std::shared_ptr ProcessGroupStream::Send( + const phi::DenseTensor& tensor, + int dst_rank, + bool sync_op, + bool use_calc_stream) { + return Send(tensor, + dst_rank, + /*offset*/ 0, + /*numel*/ -1, // -1 indicates sending the whole tensor + sync_op, + use_calc_stream); +} + std::shared_ptr ProcessGroupStream::Send( const phi::DenseTensor& tensor, int dst_rank, diff --git
a/paddle/fluid/distributed/collective/ProcessGroupStream.h b/paddle/fluid/distributed/collective/process_group_stream.h similarity index 90% rename from paddle/fluid/distributed/collective/ProcessGroupStream.h rename to paddle/fluid/distributed/collective/process_group_stream.h index 4ad75be3658b97ed28aa3de91d5ceb4652fe6d1a..d48ff0f24f83bd3b03e9d8965c351af9c8a87f64 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupStream.h +++ b/paddle/fluid/distributed/collective/process_group_stream.h @@ -69,6 +69,12 @@ class ProcessGroupStream : public ProcessGroup { int64_t numel, bool sync_op) override; + virtual std::shared_ptr AllGather( + phi::DenseTensor* out_tensor, + const phi::DenseTensor& in_tensor, + bool sync_op, + bool use_calc_stream); + virtual std::shared_ptr AllGather( phi::DenseTensor* out_tensor, const phi::DenseTensor& in_tensor, @@ -161,6 +167,11 @@ class ProcessGroupStream : public ProcessGroup { int64_t numel, bool sync_op) override; + virtual std::shared_ptr Recv(phi::DenseTensor* tensor, + int src_rank, + bool sync_op, + bool use_calc_stream); + virtual std::shared_ptr Recv(phi::DenseTensor* tensor, int src_rank, int64_t offset, @@ -174,6 +185,11 @@ class ProcessGroupStream : public ProcessGroup { int64_t numel, bool sync_op) override; + std::shared_ptr Send(const phi::DenseTensor& tensor, + int dst_rank, + bool sync_op, + bool use_calc_stream); + virtual std::shared_ptr Send( const phi::DenseTensor& tensor, int dst_rank, diff --git a/paddle/fluid/operators/fused/fused_attention_op.cu b/paddle/fluid/operators/fused/fused_attention_op.cu index 67ee877f72c8a73c9bee4f9c2189dabc141bac1f..559a2afb85f45f050192d582f1bf15e6868550e3 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_attention_op.cu @@ -30,7 +30,7 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/math_function.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h" +#include "paddle/fluid/distributed/collective/process_group_nccl.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cu b/paddle/fluid/operators/fused/fused_feedforward_op.cu index 28a9cb167e093c62a1966800958c04606205cc88..925ec7d2060a48e1aa47020e7fd2d36d1800a314 100644 --- a/paddle/fluid/operators/fused/fused_feedforward_op.cu +++ b/paddle/fluid/operators/fused/fused_feedforward_op.cu @@ -23,7 +23,7 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/elementwise_functor.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h" +#include "paddle/fluid/distributed/collective/process_group_nccl.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 973ef8a4a79992df7db279885dd52dc7af9ddb3b..37e085b82bc08277d0c4bbafe0dc1336cf605bb3 100755 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -157,7 +157,7 @@ endif() if(WITH_PYTHON) set(PYBIND_DEPS ${PYBIND_DEPS} processgroup eager_reducer) if(WITH_NCCL OR WITH_RCCL) - set(PYBIND_DEPS ${PYBIND_DEPS} processgroup_nccl) + set(PYBIND_DEPS ${PYBIND_DEPS} process_group_nccl) endif() if(WITH_XPU_BKCL) set(PYBIND_DEPS ${PYBIND_DEPS} processgroup_bkcl) diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index c5d03ce8853e312540d5d6f5a7f090e5d3852992..e9d59132d0e4605f5f160b53c0d94157a5708bae 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -22,8 +22,8 @@ 
limitations under the License. */ #endif #include "paddle/fluid/distributed/collective/ProcessGroup.h" -#include "paddle/fluid/distributed/collective/ProcessGroupStream.h" #include "paddle/fluid/distributed/collective/Types.h" +#include "paddle/fluid/distributed/collective/process_group_stream.h" #include "paddle/fluid/distributed/collective/reducer.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/tensor.h" @@ -34,7 +34,7 @@ limitations under the License. */ #include "paddle/phi/api/all.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h" +#include "paddle/fluid/distributed/collective/process_group_nccl.h" #endif #if defined(PADDLE_WITH_MPI) @@ -169,9 +169,7 @@ void BindDistributed(py::module *m) { auto p_dense = std::dynamic_pointer_cast(tensor.impl()); auto out_dense = *p_dense; - // numel == -1 indicates sending the whole tensor - return self.Send( - out_dense, dst, /*offset*/ 0, /*numel*/ -1, sync_op); + return self.Send(out_dense, dst, sync_op); }, py::arg("tensor"), py::arg("dst"), @@ -215,9 +213,7 @@ void BindDistributed(py::module *m) { auto p_dense = std::dynamic_pointer_cast(tensor.impl()); auto *in_dense = p_dense.get(); - // numel == -1 indicates receiving the whole tensor - return self.Recv( - in_dense, src, /*offset*/ 0, /*numel*/ -1, sync_op); + return self.Recv(in_dense, src, sync_op); }, py::arg("tensor"), py::arg("src"), @@ -270,11 +266,7 @@ void BindDistributed(py::module *m) { auto in_dense = *p_in_tensor; auto *dev_ctx = self.GetDeviceContext(in_tensor.place()); - auto task = self.AllGather(out_dense, - in_dense, - /*offset*/ 0, - /*numel*/ -1, - sync_op); + auto task = self.AllGather(out_dense, in_dense, sync_op); SplitTensor(*dev_ctx, *out_dense, &out_tensor_list); task->UpdateWaitChain(*dev_ctx); return task; @@ -300,11 +292,7 @@ void BindDistributed(py::module *m) { in_tensor.impl()); auto in_dense = *p_in_tensor; - return 
self.AllGather(out_dense, - in_dense, - /*offset*/ 0, - /*numel*/ -1, - sync_op); + return self.AllGather(out_dense, in_dense, sync_op); }, py::arg("out"), py::arg("in"), @@ -771,8 +759,6 @@ void BindDistributed(py::module *m) { auto *dev_ctx = self.GetDeviceContext(in_tensor.place(), true); auto task = self.AllGather(out_dense, in_dense, - /*offset*/ 0, - /*numel*/ -1, /*sync_op*/ true, /*use_calc_stream*/ true); SplitTensor(*dev_ctx, *out_dense, &out_tensor_list); @@ -799,8 +785,6 @@ void BindDistributed(py::module *m) { return self.AllGather(out_dense, in_dense, - /*offset*/ 0, - /*numel*/ -1, /*sync_op*/ true, /*use_calc_stream*/ true); }, @@ -1127,11 +1111,8 @@ void BindDistributed(py::module *m) { auto p_dense = std::dynamic_pointer_cast(tensor.impl()); auto out_dense = *p_dense; - // numel == -1 indicates sending the whole tensor return self.Send(out_dense, dst, - /*offset*/ 0, - /*numel*/ -1, /*sync_op*/ true, /*use_calc_stream*/ true); }, @@ -1177,11 +1158,8 @@ void BindDistributed(py::module *m) { auto p_dense = std::dynamic_pointer_cast(tensor.impl()); auto *in_dense = p_dense.get(); - // numel == -1 indicates receiving the whole tensor return self.Recv(in_dense, src, - /*offset*/ 0, - /*numel*/ -1, /*sync_op*/ true, /*use_calc_stream*/ true); }, diff --git a/paddle/phi/backends/CMakeLists.txt b/paddle/phi/backends/CMakeLists.txt index c35bd2bc456f3296bbaabae000517759d4db87b1..c9e110ae7b8e40a17ac04d4771c770460f47106a 100644 --- a/paddle/phi/backends/CMakeLists.txt +++ b/paddle/phi/backends/CMakeLists.txt @@ -67,7 +67,7 @@ endif() set(COMM_UTILS_DEPS processgroup) if(WITH_NCCL OR WITH_RCCL) - set(COMM_UTILS_DEPS ${PROCESS_GROUP_UTILS_DEPS} processgroup_nccl) + set(COMM_UTILS_DEPS ${PROCESS_GROUP_UTILS_DEPS} process_group_nccl) endif() if(WITH_CUSTOM_DEVICE) set(COMM_UTILS_DEPS ${PROCESS_GROUP_UTILS_DEPS} processgroup_custom) diff --git a/paddle/phi/backends/processgroup_comm_utils.cc b/paddle/phi/backends/processgroup_comm_utils.cc index 
841b88d752e9a3547164d21171f0e7def326707d..450c17638712228ff86f64d2dc65412894fb4206 100644 --- a/paddle/phi/backends/processgroup_comm_utils.cc +++ b/paddle/phi/backends/processgroup_comm_utils.cc @@ -15,7 +15,7 @@ #include "paddle/fluid/distributed/collective/ProcessGroup.h" #include "paddle/phi/backends/c_comm_lib.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h" +#include "paddle/fluid/distributed/collective/process_group_nccl.h" #endif #if defined(PADDLE_WITH_CUSTOM_DEVICE) #include "paddle/fluid/distributed/collective/ProcessGroupCustom.h" diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 808b18bb02d45f03e82c6df9fabc563c92288282..abe35f284d66407dae03305d77860625066f34cb 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -80,7 +80,7 @@ set(COMMON_KERNEL_DEPS set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} processgroup) if(WITH_NCCL OR WITH_RCCL) - set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} processgroup_nccl) + set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} process_group_nccl) endif() set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} processgroup_comm_utils) if(WITH_CUDNN_FRONTEND) diff --git a/paddle/phi/kernels/gpu/sync_batch_norm_utils.h b/paddle/phi/kernels/gpu/sync_batch_norm_utils.h index cfb2758e62def45b88f634614ed04a63105c3208..f99da25dec9600864796c14c5e8e1d84d22caeae 100644 --- a/paddle/phi/kernels/gpu/sync_batch_norm_utils.h +++ b/paddle/phi/kernels/gpu/sync_batch_norm_utils.h @@ -28,7 +28,7 @@ namespace cub = hipcub; #endif #include "paddle/fluid/distributed/collective/ProcessGroup.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h" +#include "paddle/fluid/distributed/collective/process_group_nccl.h" #endif #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h"