PaddlePaddle / Paddle
Commit c9beb885
Authored Mar 03, 2022 by phlrain

    Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into move_yolo_box_to_phi

Parents: 25fe41d0, 4c0511fa

Showing 8 changed files with 436 additions and 426 deletions (+436 -426)
Changed files:

    paddle/fluid/memory/allocation/CMakeLists.txt                  +1    -1
    paddle/fluid/memory/allocation/allocator_facade.cc             +101  -175
    paddle/fluid/memory/allocation/allocator_facade.h              +9    -2
    paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc   +81   -39
    paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h    +20   -8
    paddle/fluid/memory/malloc.cc                                  +5    -5
    paddle/fluid/memory/malloc.h                                   +3    -3
    paddle/fluid/memory/stream_safe_cuda_alloc_test.cu             +216  -193
paddle/fluid/memory/allocation/CMakeLists.txt

@@ -17,7 +17,7 @@ if (WITH_GPU)
   nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
   nv_library(cuda_managed_allocator SRCS cuda_managed_allocator.cc DEPS allocator cuda_device_guard gpu_info)
   nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
-  nv_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator)
+  nv_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator cuda_graph)
   nv_library(thread_local_allocator SRCS thread_local_allocator.cc DEPS allocator)
   cc_test(thread_local_allocator_test SRCS thread_local_allocator_test.cc DEPS thread_local_allocator)
paddle/fluid/memory/allocation/allocator_facade.cc

@@ -210,12 +210,7 @@ class AllocatorFacadePrivate {
     InitNaiveBestFitCPUAllocator();
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     allow_free_idle_chunk_ = allow_free_idle_chunk;
-    if (FLAGS_use_stream_safe_cuda_allocator) {
-      for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) {
-        InitStreamSafeCUDAAllocator(platform::CUDAPlace(dev_id), nullptr);
-      }
-    } else {
+    if (!FLAGS_use_stream_safe_cuda_allocator) {
       for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) {
         InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id),

@@ -298,6 +293,12 @@ class AllocatorFacadePrivate {
     }
     CheckAllocThreadSafe();
+
+#ifdef PADDLE_WITH_CUDA
+    if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
+      WrapCUDAGraphAllocator();
+    }
+#endif
   }

   inline const std::shared_ptr<Allocator>& GetAllocator(

@@ -388,39 +389,6 @@ class AllocatorFacadePrivate {
                           allocation.get()));
     return stream_safe_cuda_allocation->GetOwningStream();
   }

-#ifdef PADDLE_WITH_CUDA
-  void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) {
-    PADDLE_ENFORCE_EQ(strategy_, AllocatorStrategy::kAutoGrowth,
-                      platform::errors::InvalidArgument(
-                          "CUDA Graph is only supported when the "
-                          "FLAGS_allocator_strategy=\"auto_growth\", but got "
-                          "FLAGS_allocator_strategy=\"%s\"",
-                          FLAGS_allocator_strategy));
-    auto& allocator = cuda_graph_allocator_map_[id];
-    PADDLE_ENFORCE_EQ(
-        allocator.get(), nullptr,
-        platform::errors::InvalidArgument(
-            "The memory pool of the CUDA Graph with ID %d have been prepared.",
-            id));
-    allocator.reset(
-        new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false));
-    for (auto& item : allocator->allocators_) {
-      auto& old_allocator = item.second;
-      old_allocator = CUDAGraphAllocator::Create(old_allocator);
-    }
-    VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id;
-  }
-
-  void RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) {
-    auto iter = cuda_graph_allocator_map_.find(id);
-    PADDLE_ENFORCE_NE(iter, cuda_graph_allocator_map_.end(),
-                      platform::errors::InvalidArgument(
-                          "Cannot find CUDA Graph with ID = %d", id));
-    cuda_graph_allocator_map_.erase(iter);
-    VLOG(10) << "Remove memory pool of CUDA Graph with ID " << id;
-  }
-#endif
 #endif
@@ -439,24 +407,7 @@ class AllocatorFacadePrivate {
     platform::Place place_;
   };

-  const AllocatorMap& GetAllocatorMap() {
-#ifdef PADDLE_WITH_CUDA
-    if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
-      auto id = platform::CUDAGraph::CapturingID();
-      auto iter = cuda_graph_allocator_map_.find(id);
-      PADDLE_ENFORCE_NE(
-          iter, cuda_graph_allocator_map_.end(),
-          platform::errors::PermissionDenied(
-              "No memory pool is prepared for CUDA Graph capturing."));
-      VLOG(10) << "Choose CUDA Graph memory pool to allocate memory";
-      return iter->second->allocators_;
-    } else {
-      return allocators_;
-    }
-#else
-    return allocators_;
-#endif
-  }
+  const AllocatorMap& GetAllocatorMap() { return allocators_; }

   void InitNaiveBestFitCPUAllocator() {
     allocators_[platform::CPUPlace()] =

@@ -672,10 +623,10 @@ class AllocatorFacadePrivate {
   }

   void WrapStreamSafeCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) {
-    const std::shared_ptr<Allocator>& underlying_allocator =
-        cuda_allocators_[p][stream];
-    cuda_allocators_[p][stream] = std::make_shared<StreamSafeCUDAAllocator>(
-        underlying_allocator, p, stream);
+    std::shared_ptr<Allocator>& allocator = cuda_allocators_[p][stream];
+    allocator = std::make_shared<StreamSafeCUDAAllocator>(
+        allocator, p, stream,
+        /* in_cuda_graph_capturing = */ !allow_free_idle_chunk_);
   }

   void WrapCUDARetryAllocator(platform::CUDAPlace p, gpuStream_t stream,

@@ -684,10 +635,19 @@ class AllocatorFacadePrivate {
         retry_time, 0,
         platform::errors::InvalidArgument(
             "Retry time should be larger than 0, but got %d", retry_time));
-    std::shared_ptr<Allocator> allocator = cuda_allocators_[p][stream];
+    std::shared_ptr<Allocator>& allocator = cuda_allocators_[p][stream];
     allocator = std::make_shared<RetryAllocator>(allocator, retry_time);
   }

+#ifdef PADDLE_WITH_CUDA
+  void WrapCUDAGraphAllocator() {
+    for (auto& item : allocators_) {
+      auto& allocator = item.second;
+      allocator = CUDAGraphAllocator::Create(allocator);
+    }
+  }
+#endif
+
   static void CheckCUDAAllocThreadSafe(const CUDAAllocatorMap& allocators) {
     for (auto& place_pair : allocators) {
       for (auto& stream_pair : place_pair.second) {

@@ -864,10 +824,6 @@ class AllocatorFacadePrivate {
   // a standalone CUDA allocator to support multi-stream GC in new executor
   CUDAAllocatorMap cuda_allocators_;
   std::shared_timed_mutex cuda_allocator_mutex_;
-#ifdef PADDLE_WITH_CUDA
-  std::unordered_map<CUDAGraphID, std::unique_ptr<AllocatorFacadePrivate>>
-      cuda_graph_allocator_map_;
-#endif
 #endif
   AllocatorStrategy strategy_;
   AllocatorMap allocators_;

@@ -886,8 +842,24 @@ AllocatorFacade::AllocatorFacade() : m_(new AllocatorFacadePrivate()) {}
 AllocatorFacade::~AllocatorFacade() {}

 AllocatorFacade& AllocatorFacade::Instance() {
-  static AllocatorFacade instance;
-  return instance;
+  static AllocatorFacade* instance = new AllocatorFacade;
+  return *instance;
+}
+
+AllocatorFacadePrivate* AllocatorFacade::GetPrivate() const {
+#ifdef PADDLE_WITH_CUDA
+  if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
+    auto id = platform::CUDAGraph::CapturingID();
+    auto iter = cuda_graph_map_.find(id);
+    PADDLE_ENFORCE_NE(
+        iter, cuda_graph_map_.end(),
+        platform::errors::PermissionDenied(
+            "No memory pool is prepared for CUDA Graph capturing."));
+    VLOG(10) << "Choose CUDA Graph memory pool";
+    return iter->second.get();
+  }
+#endif
+  return m_;
 }

 const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
@@ -895,19 +867,14 @@ const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
       FLAGS_use_system_allocator == false) {
-#ifdef PADDLE_WITH_CUDA
-    if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
-      return m_->GetAllocator(place,
-                              /* A non-zero num to choose allocator_ */ 1);
-    }
-#endif
+    AllocatorFacadePrivate* m = GetPrivate();
     platform::CUDAPlace cuda_place(place.GetDeviceId());
-    return m_->GetAllocator(cuda_place, m_->GetDefaultStream(cuda_place));
+    return m->GetAllocator(cuda_place, m->GetDefaultStream(cuda_place));
   }
 #endif

-  return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
+  return GetPrivate()->GetAllocator(
+      place, /* A non-zero num to choose allocator_ */ 1);
 }

 void* AllocatorFacade::GetBasePtr(

@@ -922,7 +889,7 @@ void* AllocatorFacade::GetBasePtr(
           "GetBasePtr() is only implemented for CUDAPlace(), not "
           "suppot place: %s",
           allocation->place()));
-  return m_->GetBasePtr(allocation);
+  return GetPrivate()->GetBasePtr(allocation);
 }

 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)

@@ -930,21 +897,17 @@ const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
     const platform::Place& place, const gpuStream_t& stream) {
   if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
       FLAGS_use_system_allocator == false) {
-#ifdef PADDLE_WITH_CUDA
-    if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
-      return m_->GetAllocator(place,
-                              /* A non-zero num to choose allocator_ */ 1);
-    }
-#endif
-    return m_->GetAllocator(place, stream, /*create_if_not_found=*/true);
+    return GetPrivate()->GetAllocator(place, stream,
+                                      /*create_if_not_found=*/true);
   }
-  return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
+  return GetPrivate()->GetAllocator(
+      place, /* A non-zero num to choose allocator_ */ 1);
 }
 #endif

 const std::shared_ptr<Allocator>& AllocatorFacade::GetZeroAllocator(
     const platform::Place& place) {
-  return m_->GetAllocator(place, /* zero size */ 0);
+  return GetPrivate()->GetAllocator(place, /* zero size */ 0);
 }

 std::shared_ptr<phi::Allocation> AllocatorFacade::AllocShared(

@@ -957,43 +920,30 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
       size > 0 && FLAGS_use_system_allocator == false) {
-#ifdef PADDLE_WITH_CUDA
-    if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
-      return m_->GetAllocator(place, size)->Allocate(size);
-    }
-#endif
     platform::CUDAPlace cuda_place(place.GetDeviceId());
-    return Alloc(cuda_place, size, m_->GetDefaultStream(cuda_place));
+    phi::Stream default_stream = phi::Stream(reinterpret_cast<phi::StreamId>(
+        GetPrivate()->GetDefaultStream(cuda_place)));
+    return Alloc(cuda_place, size, default_stream);
   }
 #endif
-  return m_->GetAllocator(place, size)->Allocate(size);
+  return GetPrivate()->GetAllocator(place, size)->Allocate(size);
 }

 uint64_t AllocatorFacade::Release(const platform::Place& place) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
       FLAGS_use_system_allocator == false) {
-#ifdef PADDLE_WITH_CUDA
-    if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
-      return m_
-          ->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1)
-          ->Release(place);
-    }
-#endif
     platform::CUDAPlace cuda_place(place.GetDeviceId());
-    return Release(cuda_place, m_->GetDefaultStream(cuda_place));
+    return Release(cuda_place, GetPrivate()->GetDefaultStream(cuda_place));
   }
 #endif
-  return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1)
+  return GetPrivate()
+      ->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1)
       ->Release(place);
 }

 std::shared_ptr<phi::Allocation> AllocatorFacade::AllocShared(
     const platform::Place& place, size_t size, const phi::Stream& stream) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   PADDLE_ENFORCE_EQ(
       FLAGS_use_stream_safe_cuda_allocator, true,
       platform::errors::Unimplemented(

@@ -1001,71 +951,53 @@ std::shared_ptr<phi::Allocation> AllocatorFacade::AllocShared(
           "multi-stream 'AllocaShared' function. To enable it, you can enter"
           "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
           "terminal."));

-#ifdef PADDLE_WITH_CUDA
-  if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
-    PADDLE_THROW(platform::errors::Unavailable(
-        "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
-  }
-#endif
-  gpuStream_t s = reinterpret_cast<gpuStream_t>(stream.id());
-  return std::shared_ptr<phi::Allocation>(Alloc(place, size, s));
-#else
-  PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU."));
-#endif
+  return std::shared_ptr<phi::Allocation>(Alloc(place, size, stream));
 }

-bool AllocatorFacade::InSameStream(
-    const std::shared_ptr<phi::Allocation>& allocation,
-    const phi::Stream& stream) {
+AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size,
+                                     const phi::Stream& stream) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   PADDLE_ENFORCE_EQ(
       FLAGS_use_stream_safe_cuda_allocator, true,
       platform::errors::Unimplemented(
           "StreamSafeCUDAAllocator is disabled, you should not call this "
-          "multi-stream 'InSameStream' function. To enable it, you can enter"
+          "multi-stream 'Alloc' function. To enable it, you can enter"
           "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
           "terminal."));

-#ifdef PADDLE_WITH_CUDA
-  if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
-    PADDLE_THROW(platform::errors::Unavailable(
-        "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
-  }
-#endif
-  gpuStream_t s = reinterpret_cast<gpuStream_t>(stream.id());
-  return s == GetStream(allocation);
+  platform::CUDAPlace p(place.GetDeviceId());
+  if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) {
+    gpuStream_t s = reinterpret_cast<gpuStream_t>(stream.id());
+    return GetPrivate()
+        ->GetAllocator(p, s, /* create_if_not_found = */ true)
+        ->Allocate(size);
+  } else {
+    return GetPrivate()->GetAllocator(p, size)->Allocate(size);
+  }
 #else
   PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU."));
 #endif
 }

-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size,
-                                     const gpuStream_t& stream) {
-  PADDLE_ENFORCE_EQ(
-      FLAGS_use_stream_safe_cuda_allocator, true,
-      platform::errors::Unimplemented(
-          "StreamSafeCUDAAllocator is disabled, you should not call this "
-          "multi-stream 'Alloc' function. To enable it, you can enter"
-          "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
-          "terminal."));
-
-#ifdef PADDLE_WITH_CUDA
-  if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
-    PADDLE_THROW(platform::errors::Unavailable(
-        "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
-  }
-#endif
-  platform::CUDAPlace p(place.GetDeviceId());
-  if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) {
-    return m_->GetAllocator(p, stream, /* create_if_not_found = */ true)
-        ->Allocate(size);
-  } else {
-    return m_->GetAllocator(p, size)->Allocate(size);
-  }
-}
+bool AllocatorFacade::InSameStream(
+    const std::shared_ptr<phi::Allocation>& allocation,
+    const phi::Stream& stream) {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  PADDLE_ENFORCE_EQ(
+      FLAGS_use_stream_safe_cuda_allocator, true,
+      platform::errors::Unimplemented(
+          "StreamSafeCUDAAllocator is disabled, you should not call this "
+          "multi-stream 'InSameStream' function. To enable it, you can enter"
+          "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
+          "terminal."));
+
+  gpuStream_t s = reinterpret_cast<gpuStream_t>(stream.id());
+  return s == GetStream(allocation);
+#else
+  PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU."));
+#endif
+}

+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place,
                                   const gpuStream_t& stream) {
   PADDLE_ENFORCE_EQ(

@@ -1075,15 +1007,7 @@ uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place,
           "multi-stream 'Release' function. To enable it, you can enter"
           "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
           "terminal."));

-#ifdef PADDLE_WITH_CUDA
-  if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
-    PADDLE_THROW(platform::errors::Unavailable(
-        "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
-  }
-#endif
-  return m_->GetAllocator(place, stream)->Release(place);
+  return GetPrivate()->GetAllocator(place, stream)->Release(place);
 }

 void AllocatorFacade::RecordStream(std::shared_ptr<phi::Allocation> allocation,

@@ -1095,15 +1019,7 @@ void AllocatorFacade::RecordStream(std::shared_ptr<phi::Allocation> allocation,
           "'RecordStream' function. To enable it, you can enter"
           "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
           "terminal."));

-#ifdef PADDLE_WITH_CUDA
-  if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
-    PADDLE_THROW(platform::errors::Unavailable(
-        "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
-  }
-#endif
-  m_->RecordStream(allocation, stream);
+  GetPrivate()->RecordStream(allocation, stream);
 }

 const gpuStream_t& AllocatorFacade::GetStream(

@@ -1115,24 +1031,34 @@ const gpuStream_t& AllocatorFacade::GetStream(
           "'GetStream' function. To enable it, you can enter"
           "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
           "terminal."));

-#ifdef PADDLE_WITH_CUDA
-  if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
-    PADDLE_THROW(platform::errors::Unavailable(
-        "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
-  }
-#endif
-  return m_->GetStream(allocation);
+  return GetPrivate()->GetStream(allocation);
 }

 #ifdef PADDLE_WITH_CUDA
 void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) {
-  return m_->PrepareMemoryPoolForCUDAGraph(id);
+  PADDLE_ENFORCE_EQ(GetAllocatorStrategy(), AllocatorStrategy::kAutoGrowth,
+                    platform::errors::InvalidArgument(
+                        "CUDA Graph is only supported when the "
+                        "FLAGS_allocator_strategy=\"auto_growth\", but got "
+                        "FLAGS_allocator_strategy=\"%s\"",
+                        FLAGS_allocator_strategy));
+  auto& allocator = cuda_graph_map_[id];
+  PADDLE_ENFORCE_EQ(
+      allocator.get(), nullptr,
+      platform::errors::InvalidArgument(
+          "The memory pool of the CUDA Graph with ID %d have been prepared.",
+          id));
+  allocator.reset(new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false));
+  VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id;
 }

 void AllocatorFacade::RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) {
-  return m_->RemoveMemoryPoolOfCUDAGraph(id);
+  auto iter = cuda_graph_map_.find(id);
+  PADDLE_ENFORCE_NE(iter, cuda_graph_map_.end(),
+                    platform::errors::InvalidArgument(
+                        "Cannot find CUDA Graph with ID = %d", id));
+  cuda_graph_map_.erase(iter);
+  VLOG(10) << "Remove memory pool of CUDA Graph with ID " << id;
 }
 #endif
 #endif
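The net effect of the allocator_facade.cc changes: the per-graph memory pools moved from AllocatorFacadePrivate into AllocatorFacade itself (the new cuda_graph_map_), and every public entry point now routes through GetPrivate(), which returns the pool of the CUDA Graph the current thread is capturing, if any. A stripped-down sketch of that dispatch (illustrative only; GraphID and Pool are hypothetical stand-ins for CUDAGraphID and AllocatorFacadePrivate):

#include <cstdint>
#include <memory>
#include <stdexcept>
#include <unordered_map>

using GraphID = uint64_t;                  // stand-in for CUDAGraphID
struct Pool { /* per-pool allocator state */ };

class Facade {
 public:
  void PrepareMemoryPool(GraphID id) {
    if (pools_.count(id) != 0) {
      throw std::invalid_argument("pool already prepared for this graph");
    }
    pools_[id] = std::make_unique<Pool>();
  }
  void RemoveMemoryPool(GraphID id) {
    if (pools_.erase(id) == 0) {
      throw std::invalid_argument("no pool for this graph id");
    }
  }
  // In the real code the "is this thread capturing" query comes from the
  // CUDA runtime; here it is passed in explicitly for illustration.
  Pool* GetPrivate(const GraphID* capturing_id) {
    if (capturing_id != nullptr) {
      return pools_.at(*capturing_id).get();  // graph-specific pool
    }
    return &default_pool_;                    // the usual allocators
  }

 private:
  Pool default_pool_;
  std::unordered_map<GraphID, std::unique_ptr<Pool>> pools_;
};

int main() {
  Facade facade;
  facade.PrepareMemoryPool(42);
  GraphID capturing = 42;
  Pool* during_capture = facade.GetPrivate(&capturing);  // pool for graph 42
  Pool* normal = facade.GetPrivate(nullptr);             // default pool
  facade.RemoveMemoryPool(42);
  return (during_capture != normal) ? 0 : 1;
}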
paddle/fluid/memory/allocation/allocator_facade.h

@@ -49,6 +49,8 @@ class AllocatorFacade {
   static AllocatorFacade& Instance();

+  AllocatorFacadePrivate* GetPrivate() const;
+
   const std::shared_ptr<Allocator>& GetAllocator(const platform::Place& place);

   void* GetBasePtr(const std::shared_ptr<Allocation>& allocation);

@@ -73,13 +75,14 @@ class AllocatorFacade {
                                                size_t size,
                                                const phi::Stream& stream);

+  AllocationPtr Alloc(const platform::Place& place, size_t size,
+                      const phi::Stream& stream);
+
   bool InSameStream(const std::shared_ptr<Allocation>& allocation,
                     const phi::Stream& stream);

 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   // TODO(zhiqiu): change gpuStream_t to phi::Stream if needed.
-  AllocationPtr Alloc(const platform::Place& place, size_t size,
-                      const gpuStream_t& stream);
   uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream);
   void RecordStream(std::shared_ptr<Allocation> allocation,
                     const gpuStream_t& stream);

@@ -96,6 +99,10 @@ class AllocatorFacade {
  private:
   AllocatorFacade();
   AllocatorFacadePrivate* m_;
+#ifdef PADDLE_WITH_CUDA
+  std::unordered_map<CUDAGraphID, std::unique_ptr<AllocatorFacadePrivate>>
+      cuda_graph_map_;
+#endif
 };

 }  // namespace allocation
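The reinterpret_cast between the opaque gpuStream_t handle and phi::StreamId recurs throughout this diff (in the facade above and in the tests below). Stripped of the Paddle types, the round-trip looks like this; MyStream and MyStreamId are illustrative stand-ins for phi::Stream and phi::StreamId, and void* stands in for gpuStream_t:

#include <cstdint>

using MyStreamId = uint64_t;   // stand-in for phi::StreamId

class MyStream {               // stand-in for phi::Stream
 public:
  explicit MyStream(MyStreamId id) : id_(id) {}
  MyStreamId id() const { return id_; }

 private:
  MyStreamId id_;
};

int main() {
  void* raw_stream = nullptr;  // stand-in for a gpuStream_t handle
  // Wrap: gpuStream_t -> phi::Stream (as in the new Alloc/AllocShared calls).
  MyStream s(reinterpret_cast<MyStreamId>(raw_stream));
  // Unwrap: phi::Stream -> gpuStream_t (as done inside AllocatorFacade).
  void* back = reinterpret_cast<void*>(s.id());
  return back == raw_stream ? 0 : 1;
}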
paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc

@@ -15,56 +15,52 @@
 #include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h"
 #include "paddle/fluid/platform/profiler/event_tracing.h"

+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h"
+#endif
+
 namespace paddle {
 namespace memory {
 namespace allocation {

 StreamSafeCUDAAllocation::StreamSafeCUDAAllocation(
-    DecoratedAllocationPtr underlying_allocation, gpuStream_t owning_stream)
+    DecoratedAllocationPtr underlying_allocation, gpuStream_t owning_stream,
+    StreamSafeCUDAAllocator* allocator)
     : Allocation(underlying_allocation->ptr(),
                  underlying_allocation->base_ptr(),
                  underlying_allocation->size(),
                  underlying_allocation->place()),
       underlying_allocation_(std::move(underlying_allocation)),
-      owning_stream_(std::move(owning_stream)) {}
+      owning_stream_(std::move(owning_stream)),
+      allocator_(allocator->shared_from_this()) {}

 void StreamSafeCUDAAllocation::RecordStream(const gpuStream_t& stream) {
   VLOG(8) << "Try record stream " << stream << " for address " << ptr();
   if (stream == owning_stream_) {
     VLOG(9) << "Record the same stream of " << stream;
     return;
   }

   std::lock_guard<SpinLock> lock_guard(outstanding_event_map_lock_);
-  gpuEvent_t record_event;
-  auto it = outstanding_event_map_.find(stream);
-  if (it == outstanding_event_map_.end()) {
-    gpuEvent_t new_event;
-#ifdef PADDLE_WITH_CUDA
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        cudaEventCreateWithFlags(&new_event, cudaEventDisableTiming));
-#else
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        hipEventCreateWithFlags(&new_event, hipEventDisableTiming));
-#endif
-    outstanding_event_map_[stream] = new_event;
-    record_event = new_event;
-    VLOG(9) << "Create a new event " << new_event;
-  } else {
-    record_event = it->second;
-    VLOG(9) << "Reuse event " << record_event;
-  }
-
 #ifdef PADDLE_WITH_CUDA
-  PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(record_event, stream));
-#else
-  PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(record_event, stream));
+  if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
+    graph_capturing_stream_set_.insert(stream);
+    return;
+  }
 #endif
-  VLOG(8) << "Record event " << record_event << " to stream " << stream;
+
+  RecordStreamWithNoGraphCapturing(stream);
+  RecordGraphCapturingStreams();
 }

 bool StreamSafeCUDAAllocation::CanBeFreed() {
-  // NOTE(Ruibiao): This function will not execute concurrently,
-  // so outstanding_event_lock_ is not required here
+#ifdef PADDLE_WITH_CUDA
+  if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
+    return graph_capturing_stream_set_.empty() &&
+           outstanding_event_map_.empty();
+  }
+#endif
+
+  RecordGraphCapturingStreams();
+
   for (auto it = outstanding_event_map_.begin();
        it != outstanding_event_map_.end(); ++it) {
     gpuEvent_t& event = it->second;
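The rewritten RecordStream above splits into two paths: while this thread is capturing a CUDA Graph, the stream is only remembered in graph_capturing_stream_set_; the actual event creation and record happen later, through RecordGraphCapturingStreams(), once capture is no longer in progress. A compilable sketch of that defer-then-flush logic (illustrative stand-ins, not Paddle code):

#include <map>
#include <set>

using Stream = int;   // stand-in for gpuStream_t
struct Event {};      // stand-in for gpuEvent_t

class Allocation {
 public:
  // `capturing` stands in for platform::CUDAGraph::IsThisThreadCapturing().
  void RecordStream(Stream s, bool capturing) {
    if (capturing) {
      pending_.insert(s);  // defer: only remember the stream during capture
      return;
    }
    RecordNow(s);          // RecordStreamWithNoGraphCapturing analogue
    FlushPending();        // RecordGraphCapturingStreams analogue
  }
  bool CanBeFreed(bool capturing) const {
    if (capturing) return pending_.empty() && events_.empty();
    return true;  // the real code additionally queries each outstanding event
  }

 private:
  void RecordNow(Stream s) { events_[s] = Event{}; /* cudaEventRecord here */ }
  void FlushPending() {
    for (Stream s : pending_) RecordNow(s);
    pending_.clear();
  }
  std::set<Stream> pending_;        // graph_capturing_stream_set_ analogue
  std::map<Stream, Event> events_;  // outstanding_event_map_ analogue
};

int main() {
  Allocation a;
  a.RecordStream(1, /*capturing=*/true);   // deferred
  a.RecordStream(2, /*capturing=*/false);  // records 2, then flushes 1
  return a.CanBeFreed(/*capturing=*/true) ? 1 : 0;  // events outstanding
}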
@@ -98,21 +94,62 @@ const gpuStream_t& StreamSafeCUDAAllocation::GetOwningStream() const {
   return owning_stream_;
 }

+void StreamSafeCUDAAllocation::RecordGraphCapturingStreams() {
+  for (gpuStream_t stream : graph_capturing_stream_set_) {
+    RecordStreamWithNoGraphCapturing(stream);
+  }
+  graph_capturing_stream_set_.clear();
+}
+
+void StreamSafeCUDAAllocation::RecordStreamWithNoGraphCapturing(
+    const gpuStream_t& stream) {
+  gpuEvent_t record_event;
+  auto it = outstanding_event_map_.find(stream);
+  if (it == outstanding_event_map_.end()) {
+    gpuEvent_t new_event;
+#ifdef PADDLE_WITH_CUDA
+    PADDLE_ENFORCE_GPU_SUCCESS(
+        cudaEventCreateWithFlags(&new_event, cudaEventDisableTiming));
+#else
+    PADDLE_ENFORCE_GPU_SUCCESS(
+        hipEventCreateWithFlags(&new_event, hipEventDisableTiming));
+#endif
+    outstanding_event_map_[stream] = new_event;
+    record_event = new_event;
+    VLOG(9) << "Create a new event " << new_event;
+  } else {
+    record_event = it->second;
+    VLOG(9) << "Reuse event " << record_event;
+  }
+
+#ifdef PADDLE_WITH_CUDA
+  PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(record_event, stream));
+#else
+  PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(record_event, stream));
+#endif
+  VLOG(8) << "Record event " << record_event << " to stream " << stream;
+}
+
 StreamSafeCUDAAllocator::StreamSafeCUDAAllocator(
     std::shared_ptr<Allocator> underlying_allocator, platform::CUDAPlace place,
-    gpuStream_t default_stream)
+    gpuStream_t default_stream, bool in_cuda_graph_capturing)
     : underlying_allocator_(std::move(underlying_allocator)),
       place_(std::move(place)),
-      default_stream_(std::move(default_stream)) {
-  std::lock_guard<SpinLock> lock_guard(allocator_map_lock_);
-  allocator_map_[place].emplace_back(this);
+      default_stream_(std::move(default_stream)),
+      in_cuda_graph_capturing_(in_cuda_graph_capturing) {
+  if (LIKELY(!in_cuda_graph_capturing)) {
+    std::lock_guard<SpinLock> lock_guard(allocator_map_lock_);
+    allocator_map_[place].emplace_back(this);
+  }
 }

 StreamSafeCUDAAllocator::~StreamSafeCUDAAllocator() {
-  std::lock_guard<SpinLock> lock_guard(allocator_map_lock_);
-  std::vector<StreamSafeCUDAAllocator*>& allocators = allocator_map_[place_];
-  allocators.erase(std::remove(allocators.begin(), allocators.end(), this),
-                   allocators.end());
+  if (LIKELY(!in_cuda_graph_capturing_)) {
+    std::lock_guard<SpinLock> lock_guard(allocator_map_lock_);
+    std::vector<StreamSafeCUDAAllocator*>& allocators = allocator_map_[place_];
+    allocators.erase(std::remove(allocators.begin(), allocators.end(), this),
+                     allocators.end());
+  }
 }

 bool StreamSafeCUDAAllocator::IsAllocThreadSafe() const { return true; }

@@ -140,7 +177,7 @@ phi::Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) {
   }
   StreamSafeCUDAAllocation* allocation = new StreamSafeCUDAAllocation(
       static_unique_ptr_cast<Allocation>(std::move(underlying_allocation)),
-      default_stream_);
+      default_stream_, this);
   VLOG(8) << "Allocate " << allocation->size() << " bytes at address "
           << allocation->ptr();
   return allocation;

@@ -157,22 +194,27 @@ void StreamSafeCUDAAllocator::FreeImpl(phi::Allocation* allocation) {
           "StreamSafeCUDAAllocation*", allocation));
   VLOG(8) << "Try free allocation " << stream_safe_cuda_allocation->ptr();
+
+  std::lock_guard<SpinLock> lock_guard(unfreed_allocation_lock_);
   if (stream_safe_cuda_allocation->CanBeFreed()) {
     VLOG(9) << "Directly delete allocation";
     delete stream_safe_cuda_allocation;
   } else {
     VLOG(9) << "Put into unfreed_allocation list";
-    std::lock_guard<SpinLock> lock_guard(unfreed_allocation_lock_);
     unfreed_allocations_.emplace_back(stream_safe_cuda_allocation);
   }
 }

 uint64_t StreamSafeCUDAAllocator::ReleaseImpl(const platform::Place& place) {
+  if (UNLIKELY(in_cuda_graph_capturing_)) {
+    VLOG(7) << "Memory release forbidden in CUDA Graph Captruing";
+    return 0;
+  }
+
   std::lock_guard<SpinLock> lock_guard(allocator_map_lock_);
   std::vector<StreamSafeCUDAAllocator*>& allocators = allocator_map_[place];
   uint64_t released_size = 0;
   for (StreamSafeCUDAAllocator* allocator : allocators) {
-    released_size += allocator->ProcessUnfreedAllocationsWithRelease();
+    released_size += allocator->ProcessUnfreedAllocationsAndRelease();
   }
   VLOG(8) << "Release " << released_size << " bytes memory from all streams";
   return released_size;

@@ -191,7 +233,7 @@ void StreamSafeCUDAAllocator::ProcessUnfreedAllocations() {
   }
 }

-uint64_t StreamSafeCUDAAllocator::ProcessUnfreedAllocationsWithRelease() {
+uint64_t StreamSafeCUDAAllocator::ProcessUnfreedAllocationsAndRelease() {
   ProcessUnfreedAllocations();
   return underlying_allocator_->Release(place_);
 }
paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h

@@ -14,10 +14,9 @@
 #pragma once

 #include <deque>
 #include <list>
 #include <map>
-#include <mutex>
+#include <set>

 #include "paddle/fluid/memory/allocation/allocator.h"
 #include "paddle/fluid/memory/allocation/spin_lock.h"
 #include "paddle/fluid/platform/place.h"

@@ -32,27 +31,38 @@ namespace paddle {
 namespace memory {
 namespace allocation {

+class StreamSafeCUDAAllocator;
+
 class StreamSafeCUDAAllocation : public Allocation {
  public:
   StreamSafeCUDAAllocation(DecoratedAllocationPtr underlying_allocation,
-                           gpuStream_t owning_stream);
+                           gpuStream_t owning_stream,
+                           StreamSafeCUDAAllocator* allocator);
+
   void RecordStream(const gpuStream_t& stream);
   bool CanBeFreed();
   const gpuStream_t& GetOwningStream() const;

  private:
+  void RecordGraphCapturingStreams();
+  void RecordStreamWithNoGraphCapturing(const gpuStream_t& stream);
+
   DecoratedAllocationPtr underlying_allocation_;
+  std::set<gpuStream_t> graph_capturing_stream_set_;
   std::map<gpuStream_t, gpuEvent_t> outstanding_event_map_;
   gpuStream_t owning_stream_;
   SpinLock outstanding_event_map_lock_;
+  // To compatiable with CUDA Graph, hold the allocator shared_ptr so that
+  // Allocator will not deconstruct before Allocation
+  std::shared_ptr<Allocator> allocator_;
 };

-class StreamSafeCUDAAllocator : public Allocator {
+class StreamSafeCUDAAllocator
+    : public Allocator,
+      public std::enable_shared_from_this<StreamSafeCUDAAllocator> {
  public:
   StreamSafeCUDAAllocator(std::shared_ptr<Allocator> underlying_allocator,
-                          platform::CUDAPlace place,
-                          gpuStream_t default_stream);
+                          platform::CUDAPlace place, gpuStream_t default_stream,
+                          bool in_cuda_graph_capturing = false);
+
   ~StreamSafeCUDAAllocator();
   bool IsAllocThreadSafe() const override;

@@ -63,7 +73,7 @@ class StreamSafeCUDAAllocator : public Allocator {
  private:
   void ProcessUnfreedAllocations();
-  uint64_t ProcessUnfreedAllocationsWithRelease();
+  uint64_t ProcessUnfreedAllocationsAndRelease();

   static std::map<platform::Place, std::vector<StreamSafeCUDAAllocator*>>
       allocator_map_;

@@ -74,6 +84,8 @@ class StreamSafeCUDAAllocator : public Allocator {
   gpuStream_t default_stream_;
   std::list<StreamSafeCUDAAllocation*> unfreed_allocations_;
   SpinLock unfreed_allocation_lock_;
+
+  bool in_cuda_graph_capturing_;
 };

 }  // namespace allocation
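The enable_shared_from_this base and the allocator_ member added above implement a keep-alive: each StreamSafeCUDAAllocation holds a shared_ptr back to the allocator that produced it, so the allocator cannot be destructed while any of its allocations is still outstanding (the comment in the diff ties this to CUDA Graph compatibility). The pattern in isolation (illustrative code, not Paddle's):

#include <iostream>
#include <memory>

class Allocator;  // forward declaration, as with StreamSafeCUDAAllocator

class Allocation {
 public:
  explicit Allocation(std::shared_ptr<Allocator> owner)
      : allocator_(std::move(owner)) {}

 private:
  std::shared_ptr<Allocator> allocator_;  // keeps the allocator alive
};

class Allocator : public std::enable_shared_from_this<Allocator> {
 public:
  ~Allocator() { std::cout << "allocator destroyed\n"; }
  Allocation Allocate() {
    // Valid only if *this is already owned by some shared_ptr, which is why
    // such allocators must be created via std::make_shared.
    return Allocation(shared_from_this());
  }
};

int main() {
  auto allocation = [] {
    auto allocator = std::make_shared<Allocator>();
    return allocator->Allocate();
  }();
  // The local shared_ptr is gone, yet the allocator survives because
  // `allocation` still references it; it is destroyed only at program end.
  std::cout << "allocation still alive\n";
  return 0;
}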
paddle/fluid/memory/malloc.cc

@@ -41,6 +41,11 @@ std::shared_ptr<Allocation> AllocShared(const platform::Place& place,
                                                      stream);
 }

+AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size,
+                    const phi::Stream& stream) {
+  return allocation::AllocatorFacade::Instance().Alloc(place, size, stream);
+}
+
 bool InSameStream(const std::shared_ptr<Allocation>& allocation,
                   const phi::Stream& stream) {
   return allocation::AllocatorFacade::Instance().InSameStream(allocation,

@@ -52,11 +57,6 @@ void* GetBasePtr(const std::shared_ptr<Allocation>& allocation) {
 }

 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size,
-                    const gpuStream_t& stream) {
-  return allocation::AllocatorFacade::Instance().Alloc(place, size, stream);
-}
-
 uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream) {
   return allocation::AllocatorFacade::Instance().Release(place, stream);
 }
paddle/fluid/memory/malloc.h

@@ -41,15 +41,15 @@ extern std::shared_ptr<Allocation> AllocShared(const platform::Place& place,
                                                size_t size,
                                                const phi::Stream& stream);

+extern AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size,
+                           const phi::Stream& stream);
+
 extern bool InSameStream(const std::shared_ptr<Allocation>& allocation,
                          const phi::Stream& stream);

 extern void* GetBasePtr(const std::shared_ptr<Allocation>& allocation);

 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-extern AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size,
-                           const gpuStream_t& stream);
-
 extern uint64_t Release(const platform::CUDAPlace& place,
                         const gpuStream_t& stream);
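For orientation, the reshaped public API is exercised like this in the test file below (a usage sketch that only compiles inside a Paddle build; `stream` is assumed to be a gpuStream_t created beforehand):

#include "paddle/fluid/memory/malloc.h"
#include "paddle/phi/core/stream.h"

void Demo(const paddle::platform::CUDAPlace& place, gpuStream_t stream,
          size_t size) {
  using namespace paddle::memory;
  // Wrap the raw handle once; both allocation paths now take phi::Stream.
  phi::Stream s(reinterpret_cast<phi::StreamId>(stream));
  auto shared_allocation = AllocShared(place, size, s);  // shared ownership
  auto unique_allocation = Alloc(place, size, s);        // unique ownership
  // Release still takes the raw gpuStream_t, per the #if block above.
  Release(place, stream);
}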
paddle/fluid/memory/stream_safe_cuda_alloc_test.cu

@@ -12,34 +12,35 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#ifdef PADDLE_WITH_CUDA
-#include <cuda.h>
-#include <cuda_runtime.h>
-#endif
-
-#ifdef PADDLE_WITH_HIP
-#include <hip/hip_runtime.h>
-#endif
-
 #include <thread>  // NOLINT
 #include <vector>

 #include "gtest/gtest.h"
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/fluid/memory/malloc.h"
-#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
+#include "paddle/fluid/memory/memory.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/phi/core/stream.h"

+#ifdef PADDLE_WITH_CUDA
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
+#endif
+
+#ifdef PADDLE_WITH_HIP
+#include <hip/hip_runtime.h>
+#endif
+
 namespace paddle {
 namespace memory {

-__global__ void add_kernel(int* x, int n) {
+// y += (x + 1)
+__global__ void add_kernel(int* x, int* y, int n) {
   int thread_num = gridDim.x * blockDim.x;
   int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
   for (int i = thread_id; i < n; i += thread_num) {
-    atomicAdd(x + i, thread_id);
+    y[i] += x[i] + 1;
   }
 }
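The kernel change above replaces the old atomicAdd checksum with a plain accumulation, y[i] += x[i] + 1. With x zero-initialized, each launch adds exactly 1 to every y[i]; the rewritten tests below launch it twice per result buffer (hence CheckResult() expecting 2) and once per CUDA Graph replay (hence the replay_times expectation). A CPU reference of the same update, for clarity:

#include <vector>

// Host-side reference for add_kernel: y[i] += x[i] + 1.
void add_reference(const std::vector<int>& x, std::vector<int>& y) {
  for (size_t i = 0; i < y.size(); ++i) {
    y[i] += x[i] + 1;
  }
}

int main() {
  std::vector<int> x(4, 0), y(4, 0);
  add_reference(x, y);  // y == {1, 1, 1, 1}
  add_reference(x, y);  // y == {2, 2, 2, 2}, matching EXPECT_EQ(result[j], 2)
  return 0;
}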
@@ -51,153 +52,6 @@ void CheckMemLeak(const platform::CUDAPlace &place) {
       << " there may be a memory leak problem";
 }

-class StreamSafeCUDAAllocTest : public ::testing::Test {
- protected:
-  void SetUp() override {
-    place_ = platform::CUDAPlace();
-    stream_num_ = 64;
-    grid_num_ = 1;
-    block_num_ = 32;
-    data_num_ = 131072;
-    workspace_size_ = data_num_ * sizeof(int);
-
-    // alloc workspace for each stream
-    for (size_t i = 0; i < stream_num_; ++i) {
-      gpuStream_t stream;
-#ifdef PADDLE_WITH_CUDA
-      PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&stream));
-#else
-      PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&stream));
-#endif
-
-      std::shared_ptr<Allocation> allocation = AllocShared(
-          place_, workspace_size_,
-          phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
-#ifdef PADDLE_WITH_CUDA
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          cudaMemset(allocation->ptr(), 0, allocation->size()));
-#else
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          hipMemset(allocation->ptr(), 0, allocation->size()));
-#endif
-
-      streams_.emplace_back(stream);
-      workspaces_.emplace_back(allocation);
-    }
-
-    result_ = Alloc(place_, stream_num_ * workspace_size_);
-  }
-
-  void SingleStreamRun(size_t idx) {
-    // for all stream i,
-    // stream idx lauch a kernel to add (j % thread_num) to workspaces_[i][j]
-    for (size_t i = 0; i < stream_num_; ++i) {
-      int *x = reinterpret_cast<int *>(workspaces_[i]->ptr());
-      add_kernel<<<grid_num_, block_num_, 0, streams_[idx]>>>(x, data_num_);
-      RecordStream(workspaces_[i], streams_[idx]);
-    }
-  }
-
-  void CopyResultAsync() {
-    for (size_t i = 0; i < stream_num_; ++i) {
-#ifdef PADDLE_WITH_CUDA
-      PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(
-          reinterpret_cast<int *>(result_->ptr()) + i * data_num_,
-          workspaces_[i]->ptr(), workspace_size_, cudaMemcpyDeviceToDevice));
-#else
-      PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(
-          reinterpret_cast<int *>(result_->ptr()) + i * data_num_,
-          workspaces_[i]->ptr(), workspace_size_, hipMemcpyDeviceToDevice));
-#endif
-    }
-  }
-
-  void MultiStreamRun() {
-    for (size_t i = 0; i < stream_num_; ++i) {
-      SingleStreamRun(i);
-    }
-    CopyResultAsync();
-    workspaces_.clear();  // fast_gc
-    cudaDeviceSynchronize();
-  }
-
-  void MultiThreadMUltiStreamRun() {
-    std::vector<std::thread> threads;
-    for (size_t i = 0; i < stream_num_; ++i) {
-      threads.push_back(
-          std::thread(&StreamSafeCUDAAllocTest::SingleStreamRun, this, i));
-    }
-    for (size_t i = 0; i < stream_num_; ++i) {
-      threads[i].join();
-    }
-    CopyResultAsync();
-    workspaces_.clear();  // fast_gc
-    cudaDeviceSynchronize();
-  }
-
-  void CheckResult() {
-    auto result_host = std::unique_ptr<int[]>(new int[result_->size()]);
-#ifdef PADDLE_WITH_CUDA
-    PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpy(result_host.get(), result_->ptr(),
-                                          result_->size(),
-                                          cudaMemcpyDeviceToHost));
-#else
-    PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpy(result_host.get(), result_->ptr(),
-                                         result_->size(),
-                                         hipMemcpyDeviceToHost));
-#endif
-    size_t thread_num = grid_num_ * block_num_;
-    for (size_t i = 0; i < stream_num_; ++i) {
-      for (size_t j = 0; j < data_num_; ++j) {
-        EXPECT_TRUE(result_host[i * stream_num_ + j] ==
-                    (j % thread_num) * stream_num_);
-      }
-    }
-    result_.reset();
-  }
-
-  void TearDown() override {
-#ifdef PADDLE_WITH_CUDA
-    PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize());
-#else
-    PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize());
-#endif
-    for (gpuStream_t stream : streams_) {
-      Release(place_, stream);
-    }
-
-    for (size_t i = 1; i < stream_num_; ++i) {
-#ifdef PADDLE_WITH_CUDA
-      PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(streams_[i]));
-#else
-      PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(streams_[i]));
-#endif
-    }
-
-    CheckMemLeak(place_);
-  }
-
-  size_t stream_num_;
-  size_t grid_num_;
-  size_t block_num_;
-  size_t data_num_;
-  size_t workspace_size_;
-  platform::CUDAPlace place_;
-  std::vector<gpuStream_t> streams_;
-  std::vector<std::shared_ptr<Allocation>> workspaces_;
-  allocation::AllocationPtr result_;
-};
-
-TEST_F(StreamSafeCUDAAllocTest, CUDAMutilStreamTest) {
-  MultiStreamRun();
-  CheckResult();
-}
-
-TEST_F(StreamSafeCUDAAllocTest, CUDAMutilThreadMutilStreamTest) {
-  MultiThreadMUltiStreamRun();
-  CheckResult();
-}
-
 TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) {
   platform::CUDAPlace place = platform::CUDAPlace();
   size_t alloc_size = 256;
@@ -214,7 +68,8 @@ TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) {
           paddle::platform::DeviceContextPool::Instance().Get(place))
           ->stream();
   allocation::AllocationPtr allocation_unique =
-      Alloc(place, alloc_size, default_stream);
+      Alloc(place, alloc_size,
+            phi::Stream(reinterpret_cast<phi::StreamId>(default_stream)));
   EXPECT_GE(allocation_unique->size(), alloc_size);
   EXPECT_EQ(allocation_unique->ptr(), address);
   allocation_unique.reset();

@@ -303,36 +158,6 @@ TEST(StreamSafeCUDAAllocInterfaceTest, GetStreamInterfaceTest) {
   CheckMemLeak(place);
 }

-#ifdef PADDLE_WITH_CUDA
-TEST(StreamSafeCUDAAllocInterfaceTest, CUDAGraphExceptionTest) {
-  platform::CUDAPlace place = platform::CUDAPlace();
-  size_t alloc_size = 1;
-  std::shared_ptr<Allocation> allocation = AllocShared(place, alloc_size);
-
-  platform::BeginCUDAGraphCapture(place, cudaStreamCaptureModeGlobal);
-  EXPECT_THROW(AllocShared(place, alloc_size), paddle::platform::EnforceNotMet);
-  EXPECT_THROW(Alloc(place, alloc_size), paddle::platform::EnforceNotMet);
-  EXPECT_THROW(Release(place), paddle::platform::EnforceNotMet);
-  EXPECT_THROW(allocation::AllocatorFacade::Instance().GetAllocator(place),
-               paddle::platform::EnforceNotMet);
-  EXPECT_THROW(
-      AllocShared(place, alloc_size,
-                  phi::Stream(reinterpret_cast<phi::StreamId>(nullptr))),
-      paddle::platform::EnforceNotMet);
-  EXPECT_THROW(Alloc(place, alloc_size, nullptr),
-               paddle::platform::EnforceNotMet);
-  EXPECT_THROW(Release(place, nullptr), paddle::platform::EnforceNotMet);
-  EXPECT_THROW(RecordStream(allocation, nullptr),
-               paddle::platform::EnforceNotMet);
-  EXPECT_THROW(GetStream(allocation), paddle::platform::EnforceNotMet);
-  platform::EndCUDAGraphCapture();
-
-  allocation.reset();
-  Release(place);
-  CheckMemLeak(place);
-}
-#endif
-
 TEST(StreamSafeCUDAAllocRetryTest, RetryTest) {
   platform::CUDAPlace place = platform::CUDAPlace();
   gpuStream_t stream1, stream2;

@@ -348,12 +173,14 @@ TEST(StreamSafeCUDAAllocRetryTest, RetryTest) {
   // so the second alloc will fail and retry
   size_t alloc_size = available_size / 4 * 3;

-  allocation::AllocationPtr allocation1 = Alloc(place, alloc_size, stream1);
+  allocation::AllocationPtr allocation1 = Alloc(
+      place, alloc_size, phi::Stream(reinterpret_cast<phi::StreamId>(stream1)));
   allocation::AllocationPtr allocation2;

   std::thread th([&allocation2, &place, &stream2, alloc_size]() {
     std::this_thread::sleep_for(std::chrono::seconds(1));
-    allocation2 = Alloc(place, alloc_size, stream2);
+    allocation2 = Alloc(
+        place, alloc_size,
+        phi::Stream(reinterpret_cast<phi::StreamId>(stream2)));
   });
   allocation1.reset();  // free but not release
   th.join();
@@ -371,5 +198,201 @@ TEST(StreamSafeCUDAAllocRetryTest, RetryTest) {
   CheckMemLeak(place);
 }

+class StreamSafeCUDAAllocTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    place_ = platform::CUDAPlace();
+    stream_num_ = 64;
+    grid_num_ = 1;
+    block_num_ = 32;
+    data_num_ = 131072;
+    workspace_size_ = data_num_ * sizeof(int);
+
+    for (size_t i = 0; i < stream_num_; ++i) {
+      gpuStream_t stream;
+#ifdef PADDLE_WITH_CUDA
+      PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&stream));
+#else
+      PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&stream));
+#endif
+
+      std::shared_ptr<phi::Allocation> workspace_allocation = AllocShared(
+          place_, workspace_size_,
+          phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
+      std::shared_ptr<phi::Allocation> result_allocation = AllocShared(
+          place_, workspace_size_,
+          phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
+      std::shared_ptr<phi::Allocation> host_result_allocation =
+          AllocShared(platform::CPUPlace(), workspace_size_);
+
+#ifdef PADDLE_WITH_CUDA
+      PADDLE_ENFORCE_GPU_SUCCESS(cudaMemset(workspace_allocation->ptr(), 0,
+                                            workspace_allocation->size()));
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          cudaMemset(result_allocation->ptr(), 0, result_allocation->size()));
+#else
+      PADDLE_ENFORCE_GPU_SUCCESS(hipMemset(workspace_allocation->ptr(), 0,
+                                           workspace_allocation->size()));
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          hipMemset(result_allocation->ptr(), 0, result_allocation->size()));
+#endif
+
+      streams_.emplace_back(stream);
+      workspaces_.emplace_back(workspace_allocation);
+      results_.emplace_back(result_allocation);
+      host_results_.emplace_back(host_result_allocation);
+    }
+  }
+
+  void SingleStreamRun(size_t idx) {
+    int* y = reinterpret_cast<int*>(results_[idx]->ptr());
+    int neighbouring_idx = idx > 0 ? idx - 1 : idx;
+
+    add_kernel<<<grid_num_, block_num_, 0, streams_[idx]>>>(
+        reinterpret_cast<int*>(workspaces_[idx]->ptr()), y, data_num_);
+    add_kernel<<<grid_num_, block_num_, 0, streams_[idx]>>>(
+        reinterpret_cast<int*>(workspaces_[neighbouring_idx]->ptr()), y,
+        data_num_);
+    RecordStream(workspaces_[neighbouring_idx], streams_[idx]);
+  }
+
+  void MultiStreamRun() {
+    // Must run in reverse order, or the workspace_[i - 1] will be released
+    // before streams_[i]'s kernel launch
+    for (int i = stream_num_ - 1; i >= 0; --i) {
+      SingleStreamRun(i);
+      workspaces_[i].reset();  // fast GC
+    }
+  }
+
+  void MultiThreadMultiStreamRun() {
+    std::vector<std::thread> threads;
+    for (size_t i = 0; i < stream_num_; ++i) {
+      threads.push_back(
+          std::thread(&StreamSafeCUDAAllocTest::SingleStreamRun, this, i));
+    }
+    for (size_t i = 0; i < stream_num_; ++i) {
+      threads[i].join();
+    }
+    workspaces_.clear();
+  }
+
+  void CUDAGraphRun() {
+    testing_cuda_graph_ = true;
+    platform::BeginCUDAGraphCapture(platform::CUDAPlace(),
+                                    cudaStreamCaptureModeGlobal);
+
+    std::shared_ptr<Allocation> data_allocation =
+        AllocShared(platform::CUDAPlace(), workspace_size_);
+    std::shared_ptr<Allocation> result_allocation =
+        AllocShared(platform::CUDAPlace(), workspace_size_);
+
+    int* data = static_cast<int*>(data_allocation->ptr());
+    int* result = static_cast<int*>(result_allocation->ptr());
+
+    gpuStream_t main_stream = GetStream(data_allocation);
+    gpuStream_t other_stream;
+    PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&other_stream));
+
+    add_kernel<<<grid_num_, block_num_, 0, main_stream>>>(data, result,
+                                                          data_num_);
+    RecordStream(data_allocation, other_stream);
+
+    std::unique_ptr<platform::CUDAGraph> cuda_graph =
+        platform::EndCUDAGraphCapture();
+
+    int replay_times = 10;
+    for (int i = 0; i < replay_times; ++i) {
+      cuda_graph->Replay();
+    }
+
+    std::shared_ptr<Allocation> host_result_allocation =
+        AllocShared(platform::CPUPlace(), workspace_size_);
+    Copy(host_result_allocation->place(), host_result_allocation->ptr(),
+         result_allocation->place(), result_allocation->ptr(),
+         workspace_size_, main_stream);
+    cudaStreamSynchronize(main_stream);
+
+    int* host_result = static_cast<int*>(host_result_allocation->ptr());
+    for (int i = 0; i < data_num_; ++i) {
+      EXPECT_EQ(host_result[i], replay_times);
+    }
+
+    data_allocation.reset();
+    result_allocation.reset();
+    cuda_graph.release();
+    PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(other_stream));
+  }
+
+  void CheckResult() {
+    for (size_t i = 0; i < stream_num_; ++i) {
+      Copy(host_results_[i]->place(), host_results_[i]->ptr(),
+           results_[i]->place(), results_[i]->ptr(), workspace_size_,
+           streams_[i]);
+    }
+    cudaDeviceSynchronize();
+
+    size_t thread_num = grid_num_ * block_num_;
+    for (size_t i = 0; i < stream_num_; ++i) {
+      int* result = static_cast<int*>(host_results_[i]->ptr());
+      for (size_t j = 0; j < data_num_; ++j) {
+        EXPECT_EQ(result[j], 2);
+      }
+    }
+  }
+
+  void TearDown() override {
+    workspaces_.clear();
+    results_.clear();
+    host_results_.clear();
+    for (gpuStream_t stream : streams_) {
+      Release(place_, stream);
+    }
+
+    for (size_t i = 0; i < stream_num_; ++i) {
+#ifdef PADDLE_WITH_CUDA
+      PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(streams_[i]));
+#else
+      PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(streams_[i]));
+#endif
+    }
+
+    // Memory release for CUDA Graph memory pool is forbidden
+    if (!testing_cuda_graph_) {
+      CheckMemLeak(place_);
+    }
+  }
+
+  bool testing_cuda_graph_{0};
+  size_t stream_num_;
+  size_t grid_num_;
+  size_t block_num_;
+  size_t data_num_;
+  size_t workspace_size_;
+  platform::CUDAPlace place_;
+  std::vector<gpuStream_t> streams_;
+  std::vector<std::shared_ptr<phi::Allocation>> workspaces_;
+  std::vector<std::shared_ptr<phi::Allocation>> results_;
+  std::vector<std::shared_ptr<phi::Allocation>> host_results_;
+};
+
+TEST_F(StreamSafeCUDAAllocTest, CUDAMutilStreamTest) {
+  MultiStreamRun();
+  CheckResult();
+}
+
+TEST_F(StreamSafeCUDAAllocTest, CUDAMutilThreadMutilStreamTest) {
+  MultiThreadMultiStreamRun();
+  CheckResult();
+}
+
+#ifdef PADDLE_WITH_CUDA
+TEST_F(StreamSafeCUDAAllocTest, CUDAGraphTest) {
+  MultiStreamRun();
+  CUDAGraphRun();
+  CheckResult();
+}
+#endif
+
 }  // namespace memory
 }  // namespace paddle
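For context on the CUDAGraphRun() test above: platform::BeginCUDAGraphCapture, platform::EndCUDAGraphCapture, and CUDAGraph::Replay are Paddle wrappers. Against the raw CUDA runtime API, the same capture-instantiate-replay cycle looks roughly like this (a sketch using the CUDA 11-era five-argument cudaGraphInstantiate; error handling reduced to a macro):

#include <cstdio>
#include <cuda_runtime.h>

#define CUDA_CHECK(expr)                                          \
  do {                                                            \
    cudaError_t err = (expr);                                     \
    if (err != cudaSuccess) {                                     \
      std::printf("CUDA error: %s\n", cudaGetErrorString(err));   \
      return 1;                                                   \
    }                                                             \
  } while (0)

__global__ void inc(int* p) { ++(*p); }

int main() {
  int* d = nullptr;
  CUDA_CHECK(cudaMalloc(&d, sizeof(int)));
  CUDA_CHECK(cudaMemset(d, 0, sizeof(int)));

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  // Capture: work launched on the stream is recorded, not executed.
  cudaGraph_t graph;
  CUDA_CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
  inc<<<1, 1, 0, stream>>>(d);
  CUDA_CHECK(cudaStreamEndCapture(stream, &graph));

  // Instantiate once, replay many times.
  cudaGraphExec_t exec;
  CUDA_CHECK(cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0));
  const int replay_times = 10;
  for (int i = 0; i < replay_times; ++i) {
    CUDA_CHECK(cudaGraphLaunch(exec, stream));
  }
  CUDA_CHECK(cudaStreamSynchronize(stream));

  int h = 0;
  CUDA_CHECK(cudaMemcpy(&h, d, sizeof(int), cudaMemcpyDeviceToHost));
  std::printf("value after %d replays: %d\n", replay_times, h);  // prints 10
  return 0;
}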