Commit 40f3f4f0 (unverified)
Authored by Wen Sun on Dec 16, 2022; committed via GitHub on Dec 16, 2022.
refactor: rename files (#49117)
Parent: e77d1cac
Showing 18 changed files with 112 additions and 49 deletions.
paddle/fluid/distributed/collective/CMakeLists.txt (+6, -6)
paddle/fluid/distributed/collective/ProcessGroup.h (+30, -0)
paddle/fluid/distributed/collective/ProcessGroupBKCL.h (+1, -1)
paddle/fluid/distributed/collective/check.cc (+1, -1)
paddle/fluid/distributed/collective/nccl_tools.cc (+1, -1)
paddle/fluid/distributed/collective/nccl_tools.h (+0, -0)
paddle/fluid/distributed/collective/process_group_nccl.cc (+2, -2)
paddle/fluid/distributed/collective/process_group_nccl.h (+2, -2)
paddle/fluid/distributed/collective/process_group_stream.cc (+40, -1)
paddle/fluid/distributed/collective/process_group_stream.h (+16, -0)
paddle/fluid/operators/fused/fused_attention_op.cu (+1, -1)
paddle/fluid/operators/fused/fused_feedforward_op.cu (+1, -1)
paddle/fluid/pybind/CMakeLists.txt (+1, -1)
paddle/fluid/pybind/distributed_py.cc (+6, -28)
paddle/phi/backends/CMakeLists.txt (+1, -1)
paddle/phi/backends/processgroup_comm_utils.cc (+1, -1)
paddle/phi/kernels/CMakeLists.txt (+1, -1)
paddle/phi/kernels/gpu/sync_batch_norm_utils.h (+1, -1)

paddle/fluid/distributed/collective/CMakeLists.txt
@@ -3,13 +3,13 @@ cc_library(
   SRCS ProcessGroup.cc
   DEPS dense_tensor)
 cc_library(
-  processgroup_stream
-  SRCS ProcessGroupStream.cc
+  process_group_stream
+  SRCS process_group_stream.cc
   DEPS dense_tensor)
 cc_library(
   eager_reducer
   SRCS reducer.cc
-  DEPS eager_api processgroup processgroup_stream phi_api string_helper)
+  DEPS eager_api processgroup process_group_stream phi_api string_helper)
 
 if(WITH_DISTRIBUTE)
   cc_library(
@@ -20,10 +20,10 @@ endif()
 
 if(WITH_NCCL OR WITH_RCCL)
   cc_library(
-    processgroup_nccl
-    SRCS ProcessGroupNCCL.cc NCCLTools.cc Common.cc check.cc
+    process_group_nccl
+    SRCS process_group_nccl.cc nccl_tools.cc Common.cc check.cc
     DEPS processgroup
-         processgroup_stream
+         process_group_stream
          place
          enforce
          collective_helper

paddle/fluid/distributed/collective/ProcessGroup.h
@@ -97,6 +97,17 @@ class ProcessGroup {
         GetBackendName()));
   }
 
+  virtual std::shared_ptr<ProcessGroup::Task> AllGather(
+      phi::DenseTensor* out_tensor,
+      const phi::DenseTensor& in_tensor,
+      bool sync_op) {
+    return AllGather(out_tensor,
+                     in_tensor,
+                     /*offset*/ 0,
+                     /*numel*/ -1,  // -1 indicates the whole tensor
+                     sync_op);
+  }
+
   virtual std::shared_ptr<ProcessGroup::Task> AllGather(
       phi::DenseTensor* out_tensor,
       const phi::DenseTensor& in_tensor,
@@ -175,6 +186,16 @@ class ProcessGroup {
         GetBackendName()));
   }
 
+  virtual std::shared_ptr<ProcessGroup::Task> Recv(phi::DenseTensor* tensor,
+                                                   int src_rank,
+                                                   bool sync_op) {
+    return Recv(tensor,
+                src_rank,
+                /*offset*/ 0,
+                /*numel*/ -1,  // -1 indicates the whole tensor
+                sync_op);
+  }
+
   virtual std::shared_ptr<ProcessGroup::Task> Recv(phi::DenseTensor* tensor,
                                                    int src_rank,
                                                    int64_t offset,
@@ -185,6 +206,15 @@ class ProcessGroup {
         GetBackendName()));
   }
 
+  virtual std::shared_ptr<ProcessGroup::Task> Send(
+      const phi::DenseTensor& tensor, int dst_rank, bool sync_op) {
+    return Send(tensor,
+                dst_rank,
+                /*offset*/ 0,
+                /*numel*/ -1,  // -1 indicates the whole tensor
+                sync_op);
+  }
+
   virtual std::shared_ptr<ProcessGroup::Task> Send(
       const phi::DenseTensor& tensor,
       int dst_rank,
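
The three overloads added above share one shape: they fill in /*offset*/ 0 and /*numel*/ -1 (the whole tensor) and forward to the existing fully-parameterized virtual. A minimal caller-side sketch of what this enables (illustrative only, not part of the commit; it assumes an already-initialized ProcessGroup, correctly sized tensors, and that Task::Wait() blocks until completion):

// Hypothetical helper, not Paddle code: a whole-tensor all-gather no longer
// needs to spell out offset/numel at the call site.
#include <memory>

#include "paddle/fluid/distributed/collective/ProcessGroup.h"
#include "paddle/phi/core/dense_tensor.h"

using paddle::distributed::ProcessGroup;

std::shared_ptr<ProcessGroup::Task> GatherWhole(ProcessGroup* pg,
                                                phi::DenseTensor* out,
                                                const phi::DenseTensor& in) {
  // Equivalent to pg->AllGather(out, in, /*offset*/ 0, /*numel*/ -1, true).
  auto task = pg->AllGather(out, in, /*sync_op*/ true);
  task->Wait();  // block until the collective completes
  return task;
}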

paddle/fluid/distributed/collective/ProcessGroupBKCL.h
@@ -20,7 +20,7 @@
 #include <string>
 #include <unordered_map>
 
-#include "paddle/fluid/distributed/collective/ProcessGroupStream.h"
+#include "paddle/fluid/distributed/collective/process_group_stream.h"
 #include "paddle/fluid/distributed/store/store.h"
 #include "paddle/fluid/platform/device/xpu/xpu_header.h"
 #include "paddle/fluid/platform/enforce.h"

paddle/fluid/distributed/collective/check.cc
@@ -14,7 +14,7 @@
 
 #include "paddle/fluid/distributed/collective/check.h"
 
-#include "paddle/fluid/distributed/collective/NCCLTools.h"
+#include "paddle/fluid/distributed/collective/nccl_tools.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/phi/core/dense_tensor.h"

paddle/fluid/distributed/collective/NCCLTools.cc → paddle/fluid/distributed/collective/nccl_tools.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/distributed/collective/NCCLTools.h"
+#include "paddle/fluid/distributed/collective/nccl_tools.h"
 
 #include "paddle/fluid/platform/enforce.h"

paddle/fluid/distributed/collective/NCCLTools.h → paddle/fluid/distributed/collective/nccl_tools.h
File moved.

paddle/fluid/distributed/collective/ProcessGroupNCCL.cc → paddle/fluid/distributed/collective/process_group_nccl.cc
@@ -12,11 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"
+#include "paddle/fluid/distributed/collective/process_group_nccl.h"
 
 #include "paddle/fluid/distributed/collective/Common.h"
-#include "paddle/fluid/distributed/collective/NCCLTools.h"
 #include "paddle/fluid/distributed/collective/check.h"
+#include "paddle/fluid/distributed/collective/nccl_tools.h"
 #include "paddle/fluid/distributed/collective/utils.h"
 #include "paddle/fluid/platform/device/gpu/nccl_helper.h"
 #include "paddle/fluid/platform/place.h"

paddle/fluid/distributed/collective/ProcessGroupNCCL.h → paddle/fluid/distributed/collective/process_group_nccl.h
@@ -20,7 +20,7 @@
 #include <unordered_map>
 #include <vector>
 
-#include "paddle/fluid/distributed/collective/ProcessGroupStream.h"
+#include "paddle/fluid/distributed/collective/process_group_stream.h"
 #include "paddle/fluid/distributed/store/store.h"
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #include "paddle/fluid/platform/device_event.h"
@@ -29,7 +29,7 @@
 #include "paddle/phi/core/device_context.h"
 
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-#include "paddle/fluid/distributed/collective/NCCLTools.h"
+#include "paddle/fluid/distributed/collective/nccl_tools.h"
 #endif
 
 #ifdef PADDLE_WITH_RCCL

paddle/fluid/distributed/collective/ProcessGroupStream.cc → paddle/fluid/distributed/collective/process_group_stream.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/distributed/collective/ProcessGroupStream.h"
+#include "paddle/fluid/distributed/collective/process_group_stream.h"
 
 namespace paddle {
 namespace distributed {
@@ -40,6 +40,19 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::AllGather(
       /*use_calc_stream*/ false);
 }
 
+std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::AllGather(
+    phi::DenseTensor* out_tensor,
+    const phi::DenseTensor& in_tensor,
+    bool sync_op,
+    bool use_calc_stream) {
+  return AllGather(out_tensor,
+                   in_tensor,
+                   /*offset*/ 0,
+                   /*numel*/ -1,  // -1 indicates the whole tensor
+                   sync_op,
+                   use_calc_stream);
+}
+
 std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::AllGather(
     phi::DenseTensor* out_tensor,
     const phi::DenseTensor& in_tensor,
@@ -200,6 +213,19 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Recv(
       /*use_calc_stream*/ false);
 }
 
+std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Recv(
+    phi::DenseTensor* tensor,
+    int src_rank,
+    bool sync_op,
+    bool use_calc_stream) {
+  return Recv(tensor,
+              src_rank,
+              /*offset*/ 0,
+              /*numel*/ -1,  // -1 indicates sending the whole tensor
+              sync_op,
+              use_calc_stream);
+}
+
 std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Recv(
     phi::DenseTensor* tensor,
     int src_rank,
@@ -225,6 +251,19 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Send(
       /*use_calc_stream*/ false);
 }
 
+std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Send(
+    const phi::DenseTensor& tensor,
+    int dst_rank,
+    bool sync_op,
+    bool use_calc_stream) {
+  return Send(tensor,
+              dst_rank,
+              /*offset*/ 0,
+              /*numel*/ -1,  // -1 indicates receiving the whole tensor
+              sync_op,
+              use_calc_stream);
+}
+
 std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Send(
     const phi::DenseTensor& tensor,
     int dst_rank,
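
The ProcessGroupStream overloads above use the same forwarding trick rather than default arguments. The distinction matters in C++: default arguments on virtual functions are bound statically, by the caller's declared type, while a thin forwarding overload keeps the "whole tensor" defaults in exactly one place. A standalone sketch of the pattern (illustrative; Channel and its members are invented, not Paddle code):

#include <cstdint>
#include <iostream>

class Channel {
 public:
  virtual ~Channel() = default;

  // Convenience overload: fixes the "whole buffer" defaults in one place and
  // forwards to the single virtual that subclasses override.
  int Send(int dst_rank, bool sync_op) {
    return Send(dst_rank, /*offset*/ 0, /*numel*/ -1, sync_op);
  }

  virtual int Send(int dst_rank, int64_t offset, int64_t numel, bool sync_op) {
    std::cout << "send to " << dst_rank << " offset=" << offset
              << " numel=" << numel << " sync=" << sync_op << "\n";
    return 0;
  }
};

int main() {
  Channel ch;
  ch.Send(/*dst_rank*/ 1, /*sync_op*/ true);  // forwards (offset=0, numel=-1)
  return 0;
}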

paddle/fluid/distributed/collective/ProcessGroupStream.h → paddle/fluid/distributed/collective/process_group_stream.h
@@ -69,6 +69,12 @@ class ProcessGroupStream : public ProcessGroup {
       int64_t numel,
       bool sync_op) override;
 
+  virtual std::shared_ptr<ProcessGroup::Task> AllGather(
+      phi::DenseTensor* out_tensor,
+      const phi::DenseTensor& in_tensor,
+      bool sync_op,
+      bool use_calc_stream);
+
   virtual std::shared_ptr<ProcessGroup::Task> AllGather(
       phi::DenseTensor* out_tensor,
       const phi::DenseTensor& in_tensor,
@@ -161,6 +167,11 @@ class ProcessGroupStream : public ProcessGroup {
       int64_t numel,
       bool sync_op) override;
 
+  virtual std::shared_ptr<ProcessGroup::Task> Recv(phi::DenseTensor* tensor,
+                                                   int src_rank,
+                                                   bool sync_op,
+                                                   bool use_calc_stream);
+
   virtual std::shared_ptr<ProcessGroup::Task> Recv(phi::DenseTensor* tensor,
                                                    int src_rank,
                                                    int64_t offset,
@@ -174,6 +185,11 @@ class ProcessGroupStream : public ProcessGroup {
       int64_t numel,
       bool sync_op) override;
 
+  std::shared_ptr<ProcessGroup::Task> Send(const phi::DenseTensor& tensor,
+                                           int dst_rank,
+                                           bool sync_op,
+                                           bool use_calc_stream);
+
   virtual std::shared_ptr<ProcessGroup::Task> Send(
       const phi::DenseTensor& tensor,
       int dst_rank,

paddle/fluid/operators/fused/fused_attention_op.cu
@@ -30,7 +30,7 @@ limitations under the License. */
 #include "paddle/phi/kernels/funcs/math_function.h"
 
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"
+#include "paddle/fluid/distributed/collective/process_group_nccl.h"
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/device/gpu/nccl_helper.h"
 #endif

paddle/fluid/operators/fused/fused_feedforward_op.cu
@@ -23,7 +23,7 @@ limitations under the License. */
 #include "paddle/phi/kernels/funcs/elementwise_functor.h"
 
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"
+#include "paddle/fluid/distributed/collective/process_group_nccl.h"
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/device/gpu/nccl_helper.h"
 #endif

paddle/fluid/pybind/CMakeLists.txt
@@ -157,7 +157,7 @@ endif()
 if(WITH_PYTHON)
   set(PYBIND_DEPS ${PYBIND_DEPS} processgroup eager_reducer)
   if(WITH_NCCL OR WITH_RCCL)
-    set(PYBIND_DEPS ${PYBIND_DEPS} processgroup_nccl)
+    set(PYBIND_DEPS ${PYBIND_DEPS} process_group_nccl)
   endif()
   if(WITH_XPU_BKCL)
     set(PYBIND_DEPS ${PYBIND_DEPS} processgroup_bkcl)

paddle/fluid/pybind/distributed_py.cc
@@ -22,8 +22,8 @@ limitations under the License. */
 #endif
 
 #include "paddle/fluid/distributed/collective/ProcessGroup.h"
-#include "paddle/fluid/distributed/collective/ProcessGroupStream.h"
 #include "paddle/fluid/distributed/collective/Types.h"
+#include "paddle/fluid/distributed/collective/process_group_stream.h"
 #include "paddle/fluid/distributed/collective/reducer.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/tensor.h"
@@ -34,7 +34,7 @@ limitations under the License. */
 #include "paddle/phi/api/all.h"
 
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"
+#include "paddle/fluid/distributed/collective/process_group_nccl.h"
 #endif
 
 #if defined(PADDLE_WITH_MPI)
@@ -169,9 +169,7 @@ void BindDistributed(py::module *m) {
               auto p_dense =
                   std::dynamic_pointer_cast<phi::DenseTensor>(tensor.impl());
               auto out_dense = *p_dense;
-              // numel == -1 indicates sending the whole tensor
-              return self.Send(
-                  out_dense, dst, /*offset*/ 0, /*numel*/ -1, sync_op);
+              return self.Send(out_dense, dst, sync_op);
             },
             py::arg("tensor"),
             py::arg("dst"),
@@ -215,9 +213,7 @@ void BindDistributed(py::module *m) {
               auto p_dense =
                   std::dynamic_pointer_cast<phi::DenseTensor>(tensor.impl());
               auto *in_dense = p_dense.get();
-              // numel == -1 indicates receiving the whole tensor
-              return self.Recv(
-                  in_dense, src, /*offset*/ 0, /*numel*/ -1, sync_op);
+              return self.Recv(in_dense, src, sync_op);
             },
             py::arg("tensor"),
             py::arg("src"),
@@ -270,11 +266,7 @@ void BindDistributed(py::module *m) {
               auto in_dense = *p_in_tensor;
 
               auto *dev_ctx = self.GetDeviceContext(in_tensor.place());
-              auto task = self.AllGather(out_dense,
-                                         in_dense,
-                                         /*offset*/ 0,
-                                         /*numel*/ -1,
-                                         sync_op);
+              auto task = self.AllGather(out_dense, in_dense, sync_op);
               SplitTensor(*dev_ctx, *out_dense, &out_tensor_list);
               task->UpdateWaitChain(*dev_ctx);
               return task;
@@ -300,11 +292,7 @@ void BindDistributed(py::module *m) {
                   in_tensor.impl());
               auto in_dense = *p_in_tensor;
 
-              return self.AllGather(out_dense,
-                                    in_dense,
-                                    /*offset*/ 0,
-                                    /*numel*/ -1,
-                                    sync_op);
+              return self.AllGather(out_dense, in_dense, sync_op);
             },
             py::arg("out"),
             py::arg("in"),
@@ -771,8 +759,6 @@ void BindDistributed(py::module *m) {
               auto *dev_ctx = self.GetDeviceContext(in_tensor.place(), true);
               auto task = self.AllGather(out_dense,
                                          in_dense,
-                                         /*offset*/ 0,
-                                         /*numel*/ -1,
                                          /*sync_op*/ true,
                                          /*use_calc_stream*/ true);
               SplitTensor(*dev_ctx, *out_dense, &out_tensor_list);
@@ -799,8 +785,6 @@ void BindDistributed(py::module *m) {
               return self.AllGather(out_dense,
                                     in_dense,
-                                    /*offset*/ 0,
-                                    /*numel*/ -1,
                                     /*sync_op*/ true,
                                     /*use_calc_stream*/ true);
             },
@@ -1127,11 +1111,8 @@ void BindDistributed(py::module *m) {
               auto p_dense =
                   std::dynamic_pointer_cast<phi::DenseTensor>(tensor.impl());
               auto out_dense = *p_dense;
-              // numel == -1 indicates sending the whole tensor
               return self.Send(out_dense,
                                dst,
-                               /*offset*/ 0,
-                               /*numel*/ -1,
                                /*sync_op*/ true,
                                /*use_calc_stream*/ true);
             },
@@ -1177,11 +1158,8 @@ void BindDistributed(py::module *m) {
               auto p_dense =
                   std::dynamic_pointer_cast<phi::DenseTensor>(tensor.impl());
               auto *in_dense = p_dense.get();
-              // numel == -1 indicates receiving the whole tensor
               return self.Recv(in_dense,
                                src,
-                               /*offset*/ 0,
-                               /*numel*/ -1,
                                /*sync_op*/ true,
                                /*use_calc_stream*/ true);
             },
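
The distributed_py.cc hunks are pure call-site cleanups: each binding lambda now calls the new three- or four-argument overload instead of passing /*offset*/ 0, /*numel*/ -1 itself. A minimal pybind11 sketch of the same binding shape (illustrative; the module, class, and method names are invented, not Paddle's):

#include <cstdint>

#include <pybind11/pybind11.h>

namespace py = pybind11;

// Toy stand-in for a process group.
struct Group {
  // Fully-parameterized form.
  int Send(int dst, int64_t offset, int64_t numel, bool sync_op) {
    return sync_op ? dst : -dst;
  }
  // Convenience overload, in the same spirit as this commit.
  int Send(int dst, bool sync_op) {
    return Send(dst, /*offset*/ 0, /*numel*/ -1, sync_op);
  }
};

PYBIND11_MODULE(demo, m) {
  py::class_<Group>(m, "Group")
      .def(py::init<>())
      .def(
          "send",
          [](Group &self, int dst, bool sync_op) {
            // Delegate to the convenience overload rather than spelling out
            // offset/numel in the binding.
            return self.Send(dst, sync_op);
          },
          py::arg("dst"),
          py::arg("sync_op"));
}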

paddle/phi/backends/CMakeLists.txt
@@ -67,7 +67,7 @@ endif()
 
 set(COMM_UTILS_DEPS processgroup)
 if(WITH_NCCL OR WITH_RCCL)
-  set(COMM_UTILS_DEPS ${PROCESS_GROUP_UTILS_DEPS} processgroup_nccl)
+  set(COMM_UTILS_DEPS ${PROCESS_GROUP_UTILS_DEPS} process_group_nccl)
 endif()
 if(WITH_CUSTOM_DEVICE)
   set(COMM_UTILS_DEPS ${PROCESS_GROUP_UTILS_DEPS} processgroup_custom)

paddle/phi/backends/processgroup_comm_utils.cc
@@ -15,7 +15,7 @@
 #include "paddle/fluid/distributed/collective/ProcessGroup.h"
 #include "paddle/phi/backends/c_comm_lib.h"
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"
+#include "paddle/fluid/distributed/collective/process_group_nccl.h"
 #endif
 #if defined(PADDLE_WITH_CUSTOM_DEVICE)
 #include "paddle/fluid/distributed/collective/ProcessGroupCustom.h"

paddle/phi/kernels/CMakeLists.txt
@@ -80,7 +80,7 @@ set(COMMON_KERNEL_DEPS
 
 set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} processgroup)
 if(WITH_NCCL OR WITH_RCCL)
-  set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} processgroup_nccl)
+  set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} process_group_nccl)
 endif()
 set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} processgroup_comm_utils)
 if(WITH_CUDNN_FRONTEND)

paddle/phi/kernels/gpu/sync_batch_norm_utils.h
@@ -28,7 +28,7 @@ namespace cub = hipcub;
 #endif
 #include "paddle/fluid/distributed/collective/ProcessGroup.h"
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"
+#include "paddle/fluid/distributed/collective/process_group_nccl.h"
 #endif
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/platform/device/gpu/nccl_helper.h"