Unverified commit ade51aa5, authored by Ghost Screaming, committed by GitHub

[Auto Parallel] Compatible new comm library upgrade (#56604)

* for verify

Fluid operators support the new comm library.

* u

* u

* u

* Compatible new comm library upgrade for c_allgather, c_reduce, c_reduce_scatter, and c_scatter.

* Remove useless comments in process_group.py

* Polish code style.

* Fix some problems.

* Remove use of the fluid API in phi comm_context_manager.

* Add PADDLE_WITH_CUDA and PADDLE_WITH_NCCL macro guards.

* Fix bug of HIP architecture.

* Fix some problems:
1. Remove useless loggings.
2. Fix conditional compilation for HIP.
3. Fix problems in test_pass_generation_pipeline.py. It calls paddle.distributed.init_parallel_env() first; then auto.Engine calls _init_comm(), which calls process_group.instantiate(). However, init_parallel_env() calls paddle.distributed.barrier(), which calls CreateNCCLEnvCache and creates the corresponding NCCLCommContext. Because dev_id is not set at that point, the NCCLCommContext's dev_ctx is not initialized (see the sketch after the commit message).

* Fix some problems.

* Polish code.

* Polish code.

* Revert compatible upgrade for communication operators. Their upgrades
will be submitted in another PR.

* Remove StaticTCPStore.

* Remove useless modification.

* Remove useless set_cuda_device_id.

* Polish code.

* Remove fluid header files in phi files.

* Remove useless comments.

* Fix problems of the HIP architecture.

* Fix some problems.

* Polish code.

* Polish code style.

---------
Co-authored-by: hitywt <yuwentao126@126.com>
Parent commit: ded10442
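
A minimal sketch of the init-order pitfall fixed by this commit, assuming the new comm library is active (FLAGS_dynamic_static_unified_comm=1) and a job launched with `python -m paddle.distributed.launch`:

# Sketch only; the C++ internals named in the comments (CreateNCCLEnvCache,
# CreateNCCLCommContext) are taken from the commit description above.
import paddle.distributed as dist

dist.init_parallel_env()
# init_parallel_env() ends with dist.barrier(); barrier() reaches
# CreateNCCLEnvCache, which creates the NCCLCommContext. Before this
# commit no device id had been registered at that point, so the context's
# dev_ctx was left uninitialized. The fix calls
# core.CommContextManager.set_device_id(parallel_env.device_id) right
# before the barrier (see the last hunk below).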
@@ -57,6 +57,11 @@ limitations under the License. */
 #include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/core/custom_kernel.h"
 
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \
+    (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL))
+#include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h"
+#endif
+
 PHI_DECLARE_int32(paddle_num_threads);
 PADDLE_DEFINE_EXPORTED_int32(
     multiple_of_cupti_buffer_size,
@@ -440,6 +445,41 @@ void InitMemoryMethod() {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   memory_method->gpu_memory_usage = paddle::platform::GpuMemoryUsage;
 #endif
+
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \
+    (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL))
+  // TODO(GhostScreaming): Use phi methods later.
+  memory_method->get_allocator =
+      [](int device_id, phi::gpuStream_t stream) -> phi::Allocator * {
+    return paddle::memory::allocation::AllocatorFacade::Instance()
+        .GetAllocator(phi::GPUPlace(device_id), stream)
+        .get();
+  };
+  memory_method->get_host_allocator = []() -> phi::Allocator * {
+    return paddle::memory::allocation::AllocatorFacade::Instance()
+        .GetAllocator(phi::CPUPlace())
+        .get();
+  };
+  memory_method->get_zero_allocator = [](int device_id) -> phi::Allocator * {
+    return paddle::memory::allocation::AllocatorFacade::Instance()
+        .GetZeroAllocator(phi::GPUPlace(device_id))
+        .get();
+  };
+  memory_method->get_host_zero_allocator = []() -> phi::Allocator * {
+    return paddle::memory::allocation::AllocatorFacade::Instance()
+        .GetZeroAllocator(phi::CPUPlace())
+        .get();
+  };
+  memory_method->get_pinned_allocator = []() -> phi::Allocator * {
+    return paddle::memory::allocation::AllocatorFacade::Instance()
+        .GetAllocator(phi::GPUPinnedPlace())
+        .get();
+  };
+  memory_method->get_new_cuda_event = [](int device_id) {
+    return paddle::platform::CudaEventResourcePool::Instance().New(device_id);
+  };
+#endif
+
   memory_method->emplace_device_contexts =
       paddle::platform::EmplaceDeviceContexts;
   memory_method->init_devices = InitDevices;
......
@@ -42,14 +42,14 @@ void BindCommContextManager(py::module *m) {
   py::class_<phi::distributed::CommContextManager,
              std::shared_ptr<phi::distributed::CommContextManager>>(
       *m, "CommContextManager")
+      .def_static("set_device_id",
+                  &phi::distributed::CommContextManager::SetDeviceId,
+                  py::call_guard<py::gil_scoped_release>())
 #if defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)
       .def_static(
           "create_nccl_comm_context",
           &phi::distributed::CommContextManager::CreateNCCLCommContext,
           py::call_guard<py::gil_scoped_release>())
-      .def_static("set_cuda_device_id",
-                  &phi::distributed::CommContextManager::SetCUDADeviceId,
-                  py::call_guard<py::gil_scoped_release>())
 #endif
 #if defined(PADDLE_WITH_GLOO)
       .def_static(
......
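
From Python, the statics bound above are reachable through paddle's compiled core module. A hedged usage sketch for rank 0 of a two-rank job (the import path, ring id, and rank values are illustrative):

# Assumes a 2-rank job; "0" is the unique_comm_key (ring id as a string).
from paddle.framework import core

store = core.create_or_get_global_tcp_store()  # shared rendezvous store
core.CommContextManager.set_device_id(0)  # bind this process to GPU 0 first
core.CommContextManager.create_nccl_comm_context(store, "0", 0, 2)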
@@ -90,6 +90,34 @@ void EmplaceDeviceContexts(
       stream_priority);
 }
 
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \
+    (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL))
+const phi::Allocator* GetAllocator(int device_id, phi::gpuStream_t stream) {
+  return MemoryUtils::Instance().GetAllocator(device_id, stream);
+}
+
+const phi::Allocator* GetHostAllocator() {
+  return MemoryUtils::Instance().GetHostAllocator();
+}
+
+const phi::Allocator* GetZeroAllocator(int device_id) {
+  return MemoryUtils::Instance().GetZeroAllocator(device_id);
+}
+
+const phi::Allocator* GetHostZeroAllocator() {
+  return MemoryUtils::Instance().GetHostZeroAllocator();
+}
+
+const phi::Allocator* GetPinnedAllocator() {
+  return MemoryUtils::Instance().GetPinnedAllocator();
+}
+
+std::shared_ptr<std::remove_pointer<phi::gpuEvent_t>::type> GetCudaEvent(
+    int device_id) {
+  return MemoryUtils::Instance().GetCudaEvent(device_id);
+}
+#endif
+
 }  // namespace memory_utils
 
 }  // namespace phi
@@ -24,6 +24,15 @@
 #include "paddle/phi/core/macros.h"
 #include "paddle/phi/core/stream.h"
 
+#ifdef PADDLE_WITH_CUDA
+#include <cuda.h>
+#include <cuda_runtime.h>
+#endif
+
+#ifdef PADDLE_WITH_HIP
+#include <hip/hip_runtime.h>
+#endif
+
 namespace phi {
 
 struct MemoryInterface {
@@ -150,6 +159,17 @@ struct MemoryInterface {
       const std::vector<phi::Place>& places,
       bool disable_setting_default_stream_for_allocator,
       int stream_priority);
+
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \
+    (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL))
+  phi::Allocator* (*get_allocator)(int device_id, phi::gpuStream_t stream);
+  phi::Allocator* (*get_host_allocator)();
+  phi::Allocator* (*get_zero_allocator)(int device_id);
+  phi::Allocator* (*get_host_zero_allocator)();
+  phi::Allocator* (*get_pinned_allocator)();
+  std::shared_ptr<std::remove_pointer<phi::gpuEvent_t>::type> (
+      *get_new_cuda_event)(int device_id);
+#endif
 };
 
 class MemoryUtils {
@@ -323,6 +343,34 @@ class MemoryUtils {
             "Fluid. You can call InitMemoryMethod() for initialization."));
   }
 
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \
+    (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL))
+  const phi::Allocator* GetAllocator(int device_id, phi::gpuStream_t stream) {
+    return memory_method_->get_allocator(device_id, stream);
+  }
+
+  const phi::Allocator* GetHostAllocator() {
+    return memory_method_->get_host_allocator();
+  }
+
+  const phi::Allocator* GetZeroAllocator(int device_id) {
+    return memory_method_->get_zero_allocator(device_id);
+  }
+
+  const phi::Allocator* GetHostZeroAllocator() {
+    return memory_method_->get_host_zero_allocator();
+  }
+
+  const phi::Allocator* GetPinnedAllocator() {
+    return memory_method_->get_pinned_allocator();
+  }
+
+  std::shared_ptr<std::remove_pointer<phi::gpuEvent_t>::type> GetCudaEvent(
+      int device_id) {
+    return memory_method_->get_new_cuda_event(device_id);
+  }
+#endif
+
  private:
   MemoryUtils() = default;
@@ -385,6 +433,22 @@ void EmplaceDeviceContexts(
     bool disable_setting_default_stream_for_allocator,
     int stream_priority);
 
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \
+    (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL))
+const Allocator* GetAllocator(int device_id, phi::gpuStream_t stream);
+
+const Allocator* GetHostAllocator();
+
+const Allocator* GetZeroAllocator(int device_id);
+
+const Allocator* GetHostZeroAllocator();
+
+const Allocator* GetPinnedAllocator();
+
+std::shared_ptr<std::remove_pointer<phi::gpuEvent_t>::type> GetCudaEvent(
+    int device_id);
+#endif
+
 class Buffer {
  public:
   explicit Buffer(const phi::Place& place) : place_(place) {}
......
@@ -12,44 +12,53 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#if defined(PADDLE_WITH_GLOO)
-#include <gloo/rendezvous/prefix_store.h>
-#include "paddle/phi/core/distributed/gloo_comm_context.h"
-#include "paddle/phi/core/distributed/gloo_utils.h"
-#include "paddle/phi/core/distributed/store/gloo_store.h"
-#endif
-
 #include "paddle/phi/core/distributed/comm_context_manager.h"
 
 #include <memory>
 #include <string>
 
+#include "glog/logging.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
 #include "paddle/phi/core/distributed/store/store.h"
 #include "paddle/phi/core/enforce.h"
 
+#if defined(PADDLE_WITH_GLOO)
+#include <gloo/rendezvous/prefix_store.h>
+#include "paddle/phi/core/distributed/gloo_comm_context.h"
+#include "paddle/phi/core/distributed/gloo_utils.h"
+#include "paddle/phi/core/distributed/store/gloo_store.h"
+#endif
+
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+#include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/core/distributed/nccl_comm_context.h"
 #endif
 
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
-#include "glog/logging.h"
 #include "paddle/phi/core/distributed/xccl_comm_context.h"
 #endif
 
 namespace phi {
 namespace distributed {
 
+int CommContextManager::device_id = -1;
+
+void CommContextManager::SetDeviceId(int dev_id) {
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-void CommContextManager::SetCUDADeviceId(int dev_id) {
   phi::backends::gpu::SetDeviceId(dev_id);
+  CommContextManager::device_id = dev_id;
+#endif
 }
 
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 void CommContextManager::CreateNCCLCommContext(
     const std::shared_ptr<Store>& store,
     const std::string& unique_comm_key,
     int rank,
     int size) {
+  auto& comm_context_manager = CommContextManager::GetInstance();
+  if (comm_context_manager.Has(unique_comm_key)) {
+    return;
+  }
   ncclUniqueId nccl_id;
   if (rank == 0) {
     PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclGetUniqueId(&nccl_id));
@@ -68,7 +77,28 @@ void CommContextManager::CreateNCCLCommContext(
   auto nccl_comm_context =
       std::make_unique<NCCLCommContext>(rank, size, nccl_id);
-  auto& comm_context_manager = CommContextManager::GetInstance();
+
+  if (CommContextManager::device_id != -1) {
+    std::unique_ptr<phi::GPUContext> dev_ctx(
+        new phi::GPUContext(phi::GPUPlace(CommContextManager::device_id)));
+    dev_ctx->SetAllocator(phi::memory_utils::GetAllocator(
+        CommContextManager::device_id, dev_ctx->stream()));
+    dev_ctx->SetHostAllocator(phi::memory_utils::GetHostAllocator());
+    dev_ctx->SetZeroAllocator(
+        phi::memory_utils::GetZeroAllocator(CommContextManager::device_id));
+    dev_ctx->SetHostZeroAllocator(phi::memory_utils::GetHostZeroAllocator());
+    dev_ctx->SetPinnedAllocator(phi::memory_utils::GetPinnedAllocator());
+    dev_ctx->PartialInitWithAllocator();
+    auto compute_event =
+        phi::memory_utils::GetCudaEvent(CommContextManager::device_id);
+    auto comm_event =
+        phi::memory_utils::GetCudaEvent(CommContextManager::device_id);
+
+    nccl_comm_context->SetDevContext(std::move(dev_ctx));
+    nccl_comm_context->SetComputeEvent(std::move(compute_event));
+    nccl_comm_context->SetCommEvent(std::move(comm_event));
+  }
+
   comm_context_manager.SetStore(store);
   comm_context_manager.Emplace(unique_comm_key, std::move(nccl_comm_context));
 }
......
@@ -46,13 +46,13 @@ class CommContextManager {
 
   bool Has(const std::string& unique_comm_key) const;
 
+  static void SetDeviceId(int dev_id);
+
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   static void CreateNCCLCommContext(const std::shared_ptr<Store>& store,
                                     const std::string& unique_comm_key,
                                     int rank,
                                     int size);
-
-  static void SetCUDADeviceId(int dev_id);
 #endif
 
 #if defined(PADDLE_WITH_GLOO)
@@ -76,6 +76,7 @@ class CommContextManager {
   std::unordered_map<std::string, std::unique_ptr<CommContext>>
       id_to_comm_context_;
   std::shared_ptr<Store> store_;
+  static int device_id;
 };
 
 }  // namespace distributed
......
@@ -37,6 +37,30 @@ NCCLCommContext::NCCLCommContext(int rank, int size, ncclUniqueId nccl_id)
 
 ncclComm_t NCCLCommContext::GetNcclComm() { return nccl_comm_; }
 
+gpuStream_t NCCLCommContext::GetStream() { return dev_ctx_->stream(); }
+
+phi::GPUContext* NCCLCommContext::GetDevContext() { return dev_ctx_.get(); }
+
+void NCCLCommContext::SetDevContext(
+    std::unique_ptr<phi::GPUContext>&& dev_ctx) {
+  dev_ctx_ = std::move(dev_ctx);
+}
+
+gpuEvent_t NCCLCommContext::GetComputeEvent() { return compute_event_.get(); }
+
+void NCCLCommContext::SetComputeEvent(
+    std::shared_ptr<std::remove_pointer<phi::gpuEvent_t>::type>&&
+        compute_event) {
+  compute_event_ = std::move(compute_event);
+}
+
+gpuEvent_t NCCLCommContext::GetCommEvent() { return comm_event_.get(); }
+
+void NCCLCommContext::SetCommEvent(
+    std::shared_ptr<std::remove_pointer<phi::gpuEvent_t>::type>&& comm_event) {
+  comm_event_ = std::move(comm_event);
+}
+
 void NCCLCommContext::Broadcast(phi::DenseTensor* out_tensor,
                                 const phi::DenseTensor& in_tensor,
                                 int root,
......
@@ -13,6 +13,16 @@
 // limitations under the License.
 
 #pragma once
 
+#ifdef PADDLE_WITH_CUDA
+#include <cuda.h>
+#include <cuda_runtime.h>
+#endif
+
+#ifdef PADDLE_WITH_HIP
+#include <hip/hip_runtime.h>
+#endif
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_decls.h"
 #include "paddle/phi/core/distributed/comm_context.h"
 #include "paddle/phi/core/macros.h"
@@ -30,9 +40,27 @@ namespace distributed {
 
 class NCCLCommContext final : public CommContext {
  public:
   NCCLCommContext(int rank, int size, ncclUniqueId nccl_id);
+  ~NCCLCommContext() {}
 
   ncclComm_t GetNcclComm();
 
+  gpuStream_t GetStream();
+
+  gpuEvent_t GetComputeEvent();
+
+  void SetComputeEvent(
+      std::shared_ptr<std::remove_pointer<phi::gpuEvent_t>::type>&&
+          compute_event);
+
+  gpuEvent_t GetCommEvent();
+
+  void SetCommEvent(
+      std::shared_ptr<std::remove_pointer<phi::gpuEvent_t>::type>&& comm_event);
+
+  phi::GPUContext* GetDevContext();
+
+  void SetDevContext(std::unique_ptr<phi::GPUContext>&& dev_ctx);
+
   void Broadcast(phi::DenseTensor* out_tensor,
                  const phi::DenseTensor& in_tensor,
                  int root,
@@ -75,6 +103,14 @@ class NCCLCommContext final : public CommContext {
   DISABLE_COPY_AND_ASSIGN(NCCLCommContext);
 
   ncclComm_t nccl_comm_;
+
+  std::unique_ptr<phi::GPUContext> dev_ctx_;
+
+  // used for comm wait compute, compute_stream-->event-->comm_stream
+  std::shared_ptr<std::remove_pointer<phi::gpuEvent_t>::type> compute_event_;
+
+  // used for compute wait comm, comm_stream-->event-->compute_stream
+  std::shared_ptr<std::remove_pointer<phi::gpuEvent_t>::type> comm_event_;
 };
 
 }  // namespace distributed
......
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License
 
+import os
 from collections import OrderedDict
 
 import paddle
@@ -146,9 +147,6 @@ class ProcessGroup:
         global_rank = genv.rank
 
         if self.nranks >= 2 and global_rank in self.ranks:
-            logger.info(
-                f"group_id: {self.id}, ranks: {self.ranks}, nranks: {self.nranks}, trainer_endpoints: {genv.current_endpoint}"
-            )
             strategy = core.ParallelStrategy()
             strategy.nranks = self.nranks
             strategy.local_rank = self.local_rank(global_rank)
@@ -159,9 +157,22 @@ class ProcessGroup:
             strategy.nrings = 1
             if core.is_compiled_with_cuda():
                 place = core.CUDAPlace(genv.device_id)
-                core.NCCLParallelContext(strategy, place).init_with_ring_id(
-                    ring_id
-                )
+                use_new_comm = os.getenv(
+                    "FLAGS_dynamic_static_unified_comm", "0"
+                )
+                if use_new_comm in ["1", "True", "true"]:
+                    store = core.create_or_get_global_tcp_store()
+                    core.CommContextManager.set_device_id(genv.device_id)
+                    core.CommContextManager.create_nccl_comm_context(
+                        store,
+                        str(ring_id),
+                        strategy.local_rank,
+                        strategy.nranks,
+                    )
+                else:
+                    core.NCCLParallelContext(strategy, place).init_with_ring_id(
+                        ring_id
+                    )
             elif core.is_compiled_with_xpu():
                 place = core.XPUPlace(genv.device_id)
                 core.BKCLParallelContext(strategy, place).init_with_ring_id(
......
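
The branch above is opt-in via an environment flag. A hedged sketch of switching a job onto the new communication library (defaults as read by the code above; where the flag is checked depends on when process groups are instantiated):

# "0" (the default) keeps the legacy NCCLParallelContext path from the
# else-branch above; set the flag before any process group is instantiated.
import os

os.environ["FLAGS_dynamic_static_unified_comm"] = "1"

import paddle.distributed as dist

dist.init_parallel_env()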
@@ -330,7 +330,7 @@ def _init_parallel_env(backend):
             store, "0", rank, world_size
         )
     elif backend == "nccl":
-        core.CommContextManager.set_cuda_device_id(dev_id)
+        core.CommContextManager.set_device_id(dev_id)
         core.CommContextManager.create_nccl_comm_context(
             store, "0", rank, world_size
         )
......
@@ -1084,13 +1084,7 @@ def init_parallel_env():
         master_port = int(master_port)
         is_master = rank == 0
         stop_check_timeout = int(os.getenv("FLAGS_stop_check_timeout", "900"))
-        default_store = core.TCPStore(
-            master_addr,
-            master_port,
-            is_master,
-            world_size,
-            timeout=stop_check_timeout,
-        )
+        default_store = core.create_or_get_global_tcp_store()
         _set_default_store(default_store)
         pg = _new_process_group_impl(
             backend,
@@ -1108,6 +1102,11 @@ def init_parallel_env():
     _add_new_group(group)
     parallel_helper._set_parallel_ctx(True)
 
+    # barrier will call CreateNCCLEnvCache which will call CreateNCCLCommContext.
+    # Set device_id to prevent creating null dev_ctx.
+    # TODO(mine): support XPU and other backends.
+    if backend in ["nccl", 'xccl', 'bkcl']:
+        core.CommContextManager.set_device_id(parallel_env.device_id)
     paddle.distributed.barrier(group=group)
     return group
......