diff --git a/paddle/fluid/memory/CMakeLists.txt b/paddle/fluid/memory/CMakeLists.txt index 69134e1c76bb74ede7562e0bec57a060fecab1ec..97952e4b71641e00e27592380a0fd88f2c17b1a0 100644 --- a/paddle/fluid/memory/CMakeLists.txt +++ b/paddle/fluid/memory/CMakeLists.txt @@ -19,14 +19,13 @@ if (WITH_GPU) DEPS device_context malloc) nv_test(stream_safe_cuda_alloc_test SRCS stream_safe_cuda_alloc_test.cu - DEPS malloc) + DEPS malloc cuda_graph_with_memory_pool) if(WITH_TESTING AND TEST stream_safe_cuda_alloc_test) set_tests_properties(stream_safe_cuda_alloc_test PROPERTIES - ENVIRONMENT "FLAGS_use_system_allocator=false" - ENVIRONMENT "FLAGS_enable_stream_safe_cuda_allocator=true" - ENVIRONMENT "FLAGS_allocator_strategy=auto_growth") - endif() + ENVIRONMENT "FLAGS_use_stream_safe_cuda_allocator=true; + FLAGS_allocator_strategy=auto_growth") + endif() endif() if (WITH_ROCM) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 13cd980881bd546243713309720db75acfce7f67..2aed7ec001d2a9e7738dad038f146ee076772da9 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -71,7 +71,7 @@ PADDLE_DEFINE_EXPORTED_bool(use_virtual_memory_auto_growth, false, // NOTE(Ruibiao): This FLAGS is just to be compatibled with // the old single-stream CUDA allocator. It will be removed // after StreamSafeCudaAllocator has been fully tested. -PADDLE_DEFINE_EXPORTED_bool(use_stream_safe_cuda_allocator, true, +PADDLE_DEFINE_EXPORTED_bool(use_stream_safe_cuda_allocator, false, "Enable StreamSafeCUDAAllocator"); DECLARE_string(allocator_strategy); @@ -737,10 +737,18 @@ const std::shared_ptr& AllocatorFacade::GetAllocator( #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) && FLAGS_use_system_allocator == false) { +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { + return m_->GetAllocator(place, + /* A non-zero num to choose allocator_ */ 1); + } +#endif + return m_->GetAllocator(BOOST_GET_CONST(platform::CUDAPlace, place), m_->GetDefaultStream()); } #endif + return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1); } @@ -754,10 +762,17 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) && size > 0 && FLAGS_use_system_allocator == false) { +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { + return m_->GetAllocator(place, size)->Allocate(size); + } +#endif + return Alloc(BOOST_GET_CONST(platform::CUDAPlace, place), size, m_->GetDefaultStream()); } #endif + return m_->GetAllocator(place, size)->Allocate(size); } @@ -765,6 +780,14 @@ uint64_t AllocatorFacade::Release(const platform::Place& place) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) && FLAGS_use_system_allocator == false) { +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { + return m_ + ->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1) + ->Release(place); + } +#endif + return Release(BOOST_GET_CONST(platform::CUDAPlace, place), m_->GetDefaultStream()); } @@ -783,6 +806,14 @@ std::shared_ptr AllocatorFacade::AllocShared( "multi-stream 'AllocaShared' function. " "To enable it, you can enter 'export " "FLAGS_use_stream_safe_cuda_allocator=true' in the terminal.")); + +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { + PADDLE_THROW(platform::errors::Unavailable( + "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); + } +#endif + return std::shared_ptr(Alloc(place, size, stream)); } @@ -795,6 +826,14 @@ AllocationPtr AllocatorFacade::Alloc(const platform::CUDAPlace& place, "multi-stream 'Alloca' function. " "To enable it, you can enter 'export " "FLAGS_use_stream_safe_cuda_allocator=true' in the terminal.")); + +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { + PADDLE_THROW(platform::errors::Unavailable( + "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); + } +#endif + if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) { return m_->GetAllocator(place, stream, /* creat_if_not_found = */ true) ->Allocate(size); @@ -812,6 +851,14 @@ uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place, "multi-stream 'Release' function. " "To enable it, you can enter 'export " "FLAGS_use_stream_safe_cuda_allocator=true' in the terminal.")); + +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { + PADDLE_THROW(platform::errors::Unavailable( + "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); + } +#endif + return m_->GetAllocator(place, stream)->Release(place); } @@ -824,6 +871,14 @@ void AllocatorFacade::RecordStream(Allocation* allocation, "'RecordStream' function. " "To enable it, you can enter 'export " "FLAGS_use_stream_safe_cuda_allocator=true' in the terminal.")); + +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { + PADDLE_THROW(platform::errors::Unavailable( + "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); + } +#endif + m_->RecordStream(allocation, stream); } diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc index d11240bc84487067a57d4a76baba1d597b3a6ca3..86f3135ee4d147028eb12057f9be7ae451f6860e 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc @@ -103,6 +103,8 @@ uint64_t StreamSafeCUDAAllocator::ReleaseImpl(const platform::Place& place) { for (StreamSafeCUDAAllocator* allocator : allocators) { release_size += allocator->ProcessEventsAndFreeWithRelease(); } + VLOG(8) << "Release " << release_size + << " bytes memory from all stream for place " << place; return release_size; } diff --git a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu index a0293e8410c5863d466c834ac1daccb7766e33e6..134c368d4340e379feb05172d0963049ec65e4d5 100644 --- a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu +++ b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu @@ -25,7 +25,9 @@ #include #include "gtest/gtest.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/memory/malloc.h" +#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" namespace paddle { @@ -38,6 +40,14 @@ __global__ void add_kernel(int *x, int n) { } } +void CheckMemLeak(const platform::CUDAPlace &place) { + uint64_t cuda_malloc_size = + platform::RecordedGpuMallocSize(place.GetDeviceId()); + ASSERT_EQ(cuda_malloc_size, 0) << "Found " << cuda_malloc_size + << " bytes memory that not released yet," + << " there may be a memory leak problem"; +} + class StreamSafeCUDAAllocTest : public ::testing::Test { protected: void SetUp() override { @@ -143,11 +153,7 @@ class StreamSafeCUDAAllocTest : public ::testing::Test { #endif } - uint64_t cuda_malloc_size = - platform::RecordedGpuMallocSize(place_.GetDeviceId()); - ASSERT_EQ(cuda_malloc_size, 0) << "Found " << cuda_malloc_size - << " bytes memory that not released yet," - << " there may be a memory leak problem"; + CheckMemLeak(place_); } size_t stream_num_; @@ -186,8 +192,61 @@ TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) { Alloc(place, alloc_size, default_stream); EXPECT_GE(allocation_unique->size(), alloc_size); EXPECT_EQ(allocation_unique->ptr(), address); + allocation_unique.reset(); + + Release(place); + CheckMemLeak(place); } +TEST(StreamSafeCUDAAllocInterfaceTest, GetAllocatorInterfaceTest) { + platform::CUDAPlace place = platform::CUDAPlace(); + auto &instance = allocation::AllocatorFacade::Instance(); + const std::shared_ptr &allocator = instance.GetAllocator(place); + + size_t alloc_size = 256; + std::shared_ptr allocation_from_allocator = + allocator->Allocate(alloc_size); + EXPECT_GE(allocation_from_allocator->size(), alloc_size); + void *address = allocation_from_allocator->ptr(); + allocation_from_allocator.reset(); + + std::shared_ptr allocation_implicit_stream = + AllocShared(place, alloc_size); + EXPECT_GE(allocation_implicit_stream->size(), alloc_size); + EXPECT_EQ(allocation_implicit_stream->ptr(), address); + allocation_implicit_stream.reset(); + + Release(place); + CheckMemLeak(place); +} + +#ifdef PADDLE_WITH_CUDA +TEST(StreamSafeCUDAAllocInterfaceTest, CUDAGraphExceptionTest) { + platform::CUDAPlace place = platform::CUDAPlace(); + size_t alloc_size = 1; + std::shared_ptr allocation = AllocShared(place, alloc_size); + + platform::BeginCUDAGraphCapture(place, cudaStreamCaptureModeGlobal); + EXPECT_THROW(AllocShared(place, alloc_size), paddle::platform::EnforceNotMet); + EXPECT_THROW(Alloc(place, alloc_size), paddle::platform::EnforceNotMet); + EXPECT_THROW(Release(place), paddle::platform::EnforceNotMet); + EXPECT_THROW(allocation::AllocatorFacade::Instance().GetAllocator(place), + paddle::platform::EnforceNotMet); + EXPECT_THROW(AllocShared(place, alloc_size, nullptr), + paddle::platform::EnforceNotMet); + EXPECT_THROW(Alloc(place, alloc_size, nullptr), + paddle::platform::EnforceNotMet); + EXPECT_THROW(Release(place, nullptr), paddle::platform::EnforceNotMet); + EXPECT_THROW(RecordStream(allocation.get(), nullptr), + paddle::platform::EnforceNotMet); + platform::EndCUDAGraphCapture(); + + allocation.reset(); + Release(place); + CheckMemLeak(place); +} +#endif + TEST(StreamSafeCUDAAllocRetryTest, RetryTest) { platform::CUDAPlace place = platform::CUDAPlace(); gpuStream_t stream1, stream2; @@ -223,6 +282,7 @@ TEST(StreamSafeCUDAAllocRetryTest, RetryTest) { Release(place, stream1); Release(place, stream2); + CheckMemLeak(place); } } // namespace memory