未验证 提交 b4a67491 编写于 作者: F From00 提交者: GitHub

Fix CUDAGraphAllocator bug for StreamSafeCUDAAllocator (#37821)

* Fix CUDAGraph bug for StreamSafeCUDAAllocator

* Add CUDAGraphAllocator check in multi-stream interface

* Set FLAGS_use_stream_safe_cuda_allocator to false by default

* Fix environment error for cmake

* Fix cmake error

* Add UT of GetAllocatorInterfaceTest

* Add UT of CUDAGraphExceptionTest

* Enhance CUDAGraphExceptionTest
上级 2c02a580
...@@ -19,14 +19,13 @@ if (WITH_GPU) ...@@ -19,14 +19,13 @@ if (WITH_GPU)
DEPS device_context malloc) DEPS device_context malloc)
nv_test(stream_safe_cuda_alloc_test nv_test(stream_safe_cuda_alloc_test
SRCS stream_safe_cuda_alloc_test.cu SRCS stream_safe_cuda_alloc_test.cu
DEPS malloc) DEPS malloc cuda_graph_with_memory_pool)
if(WITH_TESTING AND TEST stream_safe_cuda_alloc_test) if(WITH_TESTING AND TEST stream_safe_cuda_alloc_test)
set_tests_properties(stream_safe_cuda_alloc_test PROPERTIES set_tests_properties(stream_safe_cuda_alloc_test PROPERTIES
ENVIRONMENT "FLAGS_use_system_allocator=false" ENVIRONMENT "FLAGS_use_stream_safe_cuda_allocator=true;
ENVIRONMENT "FLAGS_enable_stream_safe_cuda_allocator=true" FLAGS_allocator_strategy=auto_growth")
ENVIRONMENT "FLAGS_allocator_strategy=auto_growth") endif()
endif()
endif() endif()
if (WITH_ROCM) if (WITH_ROCM)
......
...@@ -71,7 +71,7 @@ PADDLE_DEFINE_EXPORTED_bool(use_virtual_memory_auto_growth, false, ...@@ -71,7 +71,7 @@ PADDLE_DEFINE_EXPORTED_bool(use_virtual_memory_auto_growth, false,
// NOTE(Ruibiao): This FLAGS is just to be compatibled with // NOTE(Ruibiao): This FLAGS is just to be compatibled with
// the old single-stream CUDA allocator. It will be removed // the old single-stream CUDA allocator. It will be removed
// after StreamSafeCudaAllocator has been fully tested. // after StreamSafeCudaAllocator has been fully tested.
PADDLE_DEFINE_EXPORTED_bool(use_stream_safe_cuda_allocator, true, PADDLE_DEFINE_EXPORTED_bool(use_stream_safe_cuda_allocator, false,
"Enable StreamSafeCUDAAllocator"); "Enable StreamSafeCUDAAllocator");
DECLARE_string(allocator_strategy); DECLARE_string(allocator_strategy);
...@@ -737,10 +737,18 @@ const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator( ...@@ -737,10 +737,18 @@ const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) && if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
FLAGS_use_system_allocator == false) { FLAGS_use_system_allocator == false) {
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
return m_->GetAllocator(place,
/* A non-zero num to choose allocator_ */ 1);
}
#endif
return m_->GetAllocator(BOOST_GET_CONST(platform::CUDAPlace, place), return m_->GetAllocator(BOOST_GET_CONST(platform::CUDAPlace, place),
m_->GetDefaultStream()); m_->GetDefaultStream());
} }
#endif #endif
return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1); return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
} }
...@@ -754,10 +762,17 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, ...@@ -754,10 +762,17 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) && if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
size > 0 && FLAGS_use_system_allocator == false) { size > 0 && FLAGS_use_system_allocator == false) {
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
return m_->GetAllocator(place, size)->Allocate(size);
}
#endif
return Alloc(BOOST_GET_CONST(platform::CUDAPlace, place), size, return Alloc(BOOST_GET_CONST(platform::CUDAPlace, place), size,
m_->GetDefaultStream()); m_->GetDefaultStream());
} }
#endif #endif
return m_->GetAllocator(place, size)->Allocate(size); return m_->GetAllocator(place, size)->Allocate(size);
} }
...@@ -765,6 +780,14 @@ uint64_t AllocatorFacade::Release(const platform::Place& place) { ...@@ -765,6 +780,14 @@ uint64_t AllocatorFacade::Release(const platform::Place& place) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) && if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
FLAGS_use_system_allocator == false) { FLAGS_use_system_allocator == false) {
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
return m_
->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1)
->Release(place);
}
#endif
return Release(BOOST_GET_CONST(platform::CUDAPlace, place), return Release(BOOST_GET_CONST(platform::CUDAPlace, place),
m_->GetDefaultStream()); m_->GetDefaultStream());
} }
...@@ -783,6 +806,14 @@ std::shared_ptr<Allocation> AllocatorFacade::AllocShared( ...@@ -783,6 +806,14 @@ std::shared_ptr<Allocation> AllocatorFacade::AllocShared(
"multi-stream 'AllocaShared' function. " "multi-stream 'AllocaShared' function. "
"To enable it, you can enter 'export " "To enable it, you can enter 'export "
"FLAGS_use_stream_safe_cuda_allocator=true' in the terminal.")); "FLAGS_use_stream_safe_cuda_allocator=true' in the terminal."));
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
PADDLE_THROW(platform::errors::Unavailable(
"Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
}
#endif
return std::shared_ptr<Allocation>(Alloc(place, size, stream)); return std::shared_ptr<Allocation>(Alloc(place, size, stream));
} }
...@@ -795,6 +826,14 @@ AllocationPtr AllocatorFacade::Alloc(const platform::CUDAPlace& place, ...@@ -795,6 +826,14 @@ AllocationPtr AllocatorFacade::Alloc(const platform::CUDAPlace& place,
"multi-stream 'Alloca' function. " "multi-stream 'Alloca' function. "
"To enable it, you can enter 'export " "To enable it, you can enter 'export "
"FLAGS_use_stream_safe_cuda_allocator=true' in the terminal.")); "FLAGS_use_stream_safe_cuda_allocator=true' in the terminal."));
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
PADDLE_THROW(platform::errors::Unavailable(
"Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
}
#endif
if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) { if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) {
return m_->GetAllocator(place, stream, /* creat_if_not_found = */ true) return m_->GetAllocator(place, stream, /* creat_if_not_found = */ true)
->Allocate(size); ->Allocate(size);
...@@ -812,6 +851,14 @@ uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place, ...@@ -812,6 +851,14 @@ uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place,
"multi-stream 'Release' function. " "multi-stream 'Release' function. "
"To enable it, you can enter 'export " "To enable it, you can enter 'export "
"FLAGS_use_stream_safe_cuda_allocator=true' in the terminal.")); "FLAGS_use_stream_safe_cuda_allocator=true' in the terminal."));
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
PADDLE_THROW(platform::errors::Unavailable(
"Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
}
#endif
return m_->GetAllocator(place, stream)->Release(place); return m_->GetAllocator(place, stream)->Release(place);
} }
...@@ -824,6 +871,14 @@ void AllocatorFacade::RecordStream(Allocation* allocation, ...@@ -824,6 +871,14 @@ void AllocatorFacade::RecordStream(Allocation* allocation,
"'RecordStream' function. " "'RecordStream' function. "
"To enable it, you can enter 'export " "To enable it, you can enter 'export "
"FLAGS_use_stream_safe_cuda_allocator=true' in the terminal.")); "FLAGS_use_stream_safe_cuda_allocator=true' in the terminal."));
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
PADDLE_THROW(platform::errors::Unavailable(
"Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
}
#endif
m_->RecordStream(allocation, stream); m_->RecordStream(allocation, stream);
} }
......
...@@ -103,6 +103,8 @@ uint64_t StreamSafeCUDAAllocator::ReleaseImpl(const platform::Place& place) { ...@@ -103,6 +103,8 @@ uint64_t StreamSafeCUDAAllocator::ReleaseImpl(const platform::Place& place) {
for (StreamSafeCUDAAllocator* allocator : allocators) { for (StreamSafeCUDAAllocator* allocator : allocators) {
release_size += allocator->ProcessEventsAndFreeWithRelease(); release_size += allocator->ProcessEventsAndFreeWithRelease();
} }
VLOG(8) << "Release " << release_size
<< " bytes memory from all stream for place " << place;
return release_size; return release_size;
} }
......
...@@ -25,7 +25,9 @@ ...@@ -25,7 +25,9 @@
#include <vector> #include <vector>
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h"
namespace paddle { namespace paddle {
...@@ -38,6 +40,14 @@ __global__ void add_kernel(int *x, int n) { ...@@ -38,6 +40,14 @@ __global__ void add_kernel(int *x, int n) {
} }
} }
void CheckMemLeak(const platform::CUDAPlace &place) {
uint64_t cuda_malloc_size =
platform::RecordedGpuMallocSize(place.GetDeviceId());
ASSERT_EQ(cuda_malloc_size, 0) << "Found " << cuda_malloc_size
<< " bytes memory that not released yet,"
<< " there may be a memory leak problem";
}
class StreamSafeCUDAAllocTest : public ::testing::Test { class StreamSafeCUDAAllocTest : public ::testing::Test {
protected: protected:
void SetUp() override { void SetUp() override {
...@@ -143,11 +153,7 @@ class StreamSafeCUDAAllocTest : public ::testing::Test { ...@@ -143,11 +153,7 @@ class StreamSafeCUDAAllocTest : public ::testing::Test {
#endif #endif
} }
uint64_t cuda_malloc_size = CheckMemLeak(place_);
platform::RecordedGpuMallocSize(place_.GetDeviceId());
ASSERT_EQ(cuda_malloc_size, 0) << "Found " << cuda_malloc_size
<< " bytes memory that not released yet,"
<< " there may be a memory leak problem";
} }
size_t stream_num_; size_t stream_num_;
...@@ -186,8 +192,61 @@ TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) { ...@@ -186,8 +192,61 @@ TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) {
Alloc(place, alloc_size, default_stream); Alloc(place, alloc_size, default_stream);
EXPECT_GE(allocation_unique->size(), alloc_size); EXPECT_GE(allocation_unique->size(), alloc_size);
EXPECT_EQ(allocation_unique->ptr(), address); EXPECT_EQ(allocation_unique->ptr(), address);
allocation_unique.reset();
Release(place);
CheckMemLeak(place);
} }
TEST(StreamSafeCUDAAllocInterfaceTest, GetAllocatorInterfaceTest) {
platform::CUDAPlace place = platform::CUDAPlace();
auto &instance = allocation::AllocatorFacade::Instance();
const std::shared_ptr<Allocator> &allocator = instance.GetAllocator(place);
size_t alloc_size = 256;
std::shared_ptr<Allocation> allocation_from_allocator =
allocator->Allocate(alloc_size);
EXPECT_GE(allocation_from_allocator->size(), alloc_size);
void *address = allocation_from_allocator->ptr();
allocation_from_allocator.reset();
std::shared_ptr<Allocation> allocation_implicit_stream =
AllocShared(place, alloc_size);
EXPECT_GE(allocation_implicit_stream->size(), alloc_size);
EXPECT_EQ(allocation_implicit_stream->ptr(), address);
allocation_implicit_stream.reset();
Release(place);
CheckMemLeak(place);
}
#ifdef PADDLE_WITH_CUDA
TEST(StreamSafeCUDAAllocInterfaceTest, CUDAGraphExceptionTest) {
platform::CUDAPlace place = platform::CUDAPlace();
size_t alloc_size = 1;
std::shared_ptr<Allocation> allocation = AllocShared(place, alloc_size);
platform::BeginCUDAGraphCapture(place, cudaStreamCaptureModeGlobal);
EXPECT_THROW(AllocShared(place, alloc_size), paddle::platform::EnforceNotMet);
EXPECT_THROW(Alloc(place, alloc_size), paddle::platform::EnforceNotMet);
EXPECT_THROW(Release(place), paddle::platform::EnforceNotMet);
EXPECT_THROW(allocation::AllocatorFacade::Instance().GetAllocator(place),
paddle::platform::EnforceNotMet);
EXPECT_THROW(AllocShared(place, alloc_size, nullptr),
paddle::platform::EnforceNotMet);
EXPECT_THROW(Alloc(place, alloc_size, nullptr),
paddle::platform::EnforceNotMet);
EXPECT_THROW(Release(place, nullptr), paddle::platform::EnforceNotMet);
EXPECT_THROW(RecordStream(allocation.get(), nullptr),
paddle::platform::EnforceNotMet);
platform::EndCUDAGraphCapture();
allocation.reset();
Release(place);
CheckMemLeak(place);
}
#endif
TEST(StreamSafeCUDAAllocRetryTest, RetryTest) { TEST(StreamSafeCUDAAllocRetryTest, RetryTest) {
platform::CUDAPlace place = platform::CUDAPlace(); platform::CUDAPlace place = platform::CUDAPlace();
gpuStream_t stream1, stream2; gpuStream_t stream1, stream2;
...@@ -223,6 +282,7 @@ TEST(StreamSafeCUDAAllocRetryTest, RetryTest) { ...@@ -223,6 +282,7 @@ TEST(StreamSafeCUDAAllocRetryTest, RetryTest) {
Release(place, stream1); Release(place, stream1);
Release(place, stream2); Release(place, stream2);
CheckMemLeak(place);
} }
} // namespace memory } // namespace memory
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册