fix p2p comm memory release logic (#47497)

f82d7e3c · Yuang Liu · GitHub · 5158fa4f · f82d7e3c
隐藏空白更改
内联并排

Showing with 12 additions and 12 deletions

paddle/fluid/distributed/collective/ProcessGroupNCCL.cc paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +12 -12

未找到文件。
--- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
+++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
@@ -453,7 +453,8 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::PointToPoint(
  platform::CUDADeviceGuard cuda_guard;
-  if (FLAGS_use_stream_safe_cuda_allocator) {
+  {
+    platform::NCCLGroupGuard nccl_guard;
    for (size_t i = 0; i < tensors.size(); ++i) {
      cuda_guard.SetDevice(places[i]);
      gpuStream_t nccl_stream;
@@ -465,12 +466,11 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::PointToPoint(
      } else {
        nccl_stream = places_to_ctx_[key][i]->stream();
      }
-      memory::RecordStream(tensors[i].Holder(), nccl_stream);
+      fn(tensors[i], nccl_comms[i]->GetNcclComm(), nccl_stream, dst_rank);
    }
  }
-  {
+  if (FLAGS_use_stream_safe_cuda_allocator) {
-    platform::NCCLGroupGuard nccl_guard;
    for (size_t i = 0; i < tensors.size(); ++i) {
      cuda_guard.SetDevice(places[i]);
      gpuStream_t nccl_stream;
@@ -482,7 +482,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::PointToPoint(
      } else {
        nccl_stream = places_to_ctx_[key][i]->stream();
      }
-      fn(tensors[i], nccl_comms[i]->GetNcclComm(), nccl_stream, dst_rank);
+      memory::RecordStream(tensors[i].Holder(), nccl_stream);
    }
  }
@@ -521,20 +521,20 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::PointToPoint(
  // construct uninitialize guard for device
  platform::CUDADeviceGuard cuda_guard;
-  if (FLAGS_use_stream_safe_cuda_allocator) {
+  {
+    platform::NCCLGroupGuard nccl_guard;
    for (size_t i = 0; i < tensors.size(); ++i) {
      cuda_guard.SetDevice(places[i]);
-      memory::RecordStream(tensors[i].Holder(),
+      const auto& nccl_stream = places_to_ctx_[key][i]->stream();
-                           places_to_ctx_[key][i]->stream());
+      fn(tensors[i], nccl_comms[i]->GetNcclComm(), nccl_stream, dst_rank);
    }
  }
-  {
+  if (FLAGS_use_stream_safe_cuda_allocator) {
-    platform::NCCLGroupGuard nccl_guard;
    for (size_t i = 0; i < tensors.size(); ++i) {
      cuda_guard.SetDevice(places[i]);
-      const auto& nccl_stream = places_to_ctx_[key][i]->stream();
+      memory::RecordStream(tensors[i].Holder(),
-      fn(tensors[i], nccl_comms[i]->GetNcclComm(), nccl_stream, dst_rank);
+                           places_to_ctx_[key][i]->stream());
    }
  }