未验证 提交 b4a67491 编写于 作者: F From00 提交者: GitHub

Fix CUDAGraphAllocator bug for StreamSafeCUDAAllocator (#37821)

* Fix CUDAGraph bug for StreamSafeCUDAAllocator

* Add CUDAGraphAllocator check in multi-stream interface

* Set FLAGS_use_stream_safe_cuda_allocator to false by default

* Fix environment error for cmake

* Fix cmake error

* Add UT of GetAllocatorInterfaceTest

* Add UT of CUDAGraphExceptionTest

* Enhance CUDAGraphExceptionTest
上级 2c02a580
...@@ -19,14 +19,13 @@ if (WITH_GPU) ...@@ -19,14 +19,13 @@ if (WITH_GPU)
DEPS device_context malloc) DEPS device_context malloc)
nv_test(stream_safe_cuda_alloc_test nv_test(stream_safe_cuda_alloc_test
SRCS stream_safe_cuda_alloc_test.cu SRCS stream_safe_cuda_alloc_test.cu
DEPS malloc) DEPS malloc cuda_graph_with_memory_pool)
if(WITH_TESTING AND TEST stream_safe_cuda_alloc_test) if(WITH_TESTING AND TEST stream_safe_cuda_alloc_test)
set_tests_properties(stream_safe_cuda_alloc_test PROPERTIES set_tests_properties(stream_safe_cuda_alloc_test PROPERTIES
ENVIRONMENT "FLAGS_use_system_allocator=false" ENVIRONMENT "FLAGS_use_stream_safe_cuda_allocator=true;
ENVIRONMENT "FLAGS_enable_stream_safe_cuda_allocator=true" FLAGS_allocator_strategy=auto_growth")
ENVIRONMENT "FLAGS_allocator_strategy=auto_growth") endif()
endif()
endif() endif()
if (WITH_ROCM) if (WITH_ROCM)
......
...@@ -71,7 +71,7 @@ PADDLE_DEFINE_EXPORTED_bool(use_virtual_memory_auto_growth, false, ...@@ -71,7 +71,7 @@ PADDLE_DEFINE_EXPORTED_bool(use_virtual_memory_auto_growth, false,
// NOTE(Ruibiao): This FLAGS is just to be compatibled with // NOTE(Ruibiao): This FLAGS is just to be compatibled with
// the old single-stream CUDA allocator. It will be removed // the old single-stream CUDA allocator. It will be removed
// after StreamSafeCudaAllocator has been fully tested. // after StreamSafeCudaAllocator has been fully tested.
PADDLE_DEFINE_EXPORTED_bool(use_stream_safe_cuda_allocator, true, PADDLE_DEFINE_EXPORTED_bool(use_stream_safe_cuda_allocator, false,
"Enable StreamSafeCUDAAllocator"); "Enable StreamSafeCUDAAllocator");
DECLARE_string(allocator_strategy); DECLARE_string(allocator_strategy);
...@@ -737,10 +737,18 @@ const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator( ...@@ -737,10 +737,18 @@ const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) && if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
FLAGS_use_system_allocator == false) { FLAGS_use_system_allocator == false) {
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
return m_->GetAllocator(place,
/* A non-zero num to choose allocator_ */ 1);
}
#endif
return m_->GetAllocator(BOOST_GET_CONST(platform::CUDAPlace, place), return m_->GetAllocator(BOOST_GET_CONST(platform::CUDAPlace, place),
m_->GetDefaultStream()); m_->GetDefaultStream());
} }
#endif #endif
return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1); return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
} }
...@@ -754,10 +762,17 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, ...@@ -754,10 +762,17 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) && if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
size > 0 && FLAGS_use_system_allocator == false) { size > 0 && FLAGS_use_system_allocator == false) {
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
return m_->GetAllocator(place, size)->Allocate(size);
}
#endif
return Alloc(BOOST_GET_CONST(platform::CUDAPlace, place), size, return Alloc(BOOST_GET_CONST(platform::CUDAPlace, place), size,
m_->GetDefaultStream()); m_->GetDefaultStream());
} }
#endif #endif
return m_->GetAllocator(place, size)->Allocate(size); return m_->GetAllocator(place, size)->Allocate(size);
} }
...@@ -765,6 +780,14 @@ uint64_t AllocatorFacade::Release(const platform::Place& place) { ...@@ -765,6 +780,14 @@ uint64_t AllocatorFacade::Release(const platform::Place& place) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) && if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
FLAGS_use_system_allocator == false) { FLAGS_use_system_allocator == false) {
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
return m_
->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1)
->Release(place);
}
#endif
return Release(BOOST_GET_CONST(platform::CUDAPlace, place), return Release(BOOST_GET_CONST(platform::CUDAPlace, place),
m_->GetDefaultStream()); m_->GetDefaultStream());
} }
...@@ -783,6 +806,14 @@ std::shared_ptr<Allocation> AllocatorFacade::AllocShared( ...@@ -783,6 +806,14 @@ std::shared_ptr<Allocation> AllocatorFacade::AllocShared(
"multi-stream 'AllocaShared' function. " "multi-stream 'AllocaShared' function. "
"To enable it, you can enter 'export " "To enable it, you can enter 'export "
"FLAGS_use_stream_safe_cuda_allocator=true' in the terminal.")); "FLAGS_use_stream_safe_cuda_allocator=true' in the terminal."));
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
PADDLE_THROW(platform::errors::Unavailable(
"Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
}
#endif
return std::shared_ptr<Allocation>(Alloc(place, size, stream)); return std::shared_ptr<Allocation>(Alloc(place, size, stream));
} }
...@@ -795,6 +826,14 @@ AllocationPtr AllocatorFacade::Alloc(const platform::CUDAPlace& place, ...@@ -795,6 +826,14 @@ AllocationPtr AllocatorFacade::Alloc(const platform::CUDAPlace& place,
"multi-stream 'Alloca' function. " "multi-stream 'Alloca' function. "
"To enable it, you can enter 'export " "To enable it, you can enter 'export "
"FLAGS_use_stream_safe_cuda_allocator=true' in the terminal.")); "FLAGS_use_stream_safe_cuda_allocator=true' in the terminal."));
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
PADDLE_THROW(platform::errors::Unavailable(
"Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
}
#endif
if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) { if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) {
return m_->GetAllocator(place, stream, /* creat_if_not_found = */ true) return m_->GetAllocator(place, stream, /* creat_if_not_found = */ true)
->Allocate(size); ->Allocate(size);
...@@ -812,6 +851,14 @@ uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place, ...@@ -812,6 +851,14 @@ uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place,
"multi-stream 'Release' function. " "multi-stream 'Release' function. "
"To enable it, you can enter 'export " "To enable it, you can enter 'export "
"FLAGS_use_stream_safe_cuda_allocator=true' in the terminal.")); "FLAGS_use_stream_safe_cuda_allocator=true' in the terminal."));
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
PADDLE_THROW(platform::errors::Unavailable(
"Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
}
#endif
return m_->GetAllocator(place, stream)->Release(place); return m_->GetAllocator(place, stream)->Release(place);
} }
...@@ -824,6 +871,14 @@ void AllocatorFacade::RecordStream(Allocation* allocation, ...@@ -824,6 +871,14 @@ void AllocatorFacade::RecordStream(Allocation* allocation,
"'RecordStream' function. " "'RecordStream' function. "
"To enable it, you can enter 'export " "To enable it, you can enter 'export "
"FLAGS_use_stream_safe_cuda_allocator=true' in the terminal.")); "FLAGS_use_stream_safe_cuda_allocator=true' in the terminal."));
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
PADDLE_THROW(platform::errors::Unavailable(
"Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
}
#endif
m_->RecordStream(allocation, stream); m_->RecordStream(allocation, stream);
} }
......
...@@ -103,6 +103,8 @@ uint64_t StreamSafeCUDAAllocator::ReleaseImpl(const platform::Place& place) { ...@@ -103,6 +103,8 @@ uint64_t StreamSafeCUDAAllocator::ReleaseImpl(const platform::Place& place) {
for (StreamSafeCUDAAllocator* allocator : allocators) { for (StreamSafeCUDAAllocator* allocator : allocators) {
release_size += allocator->ProcessEventsAndFreeWithRelease(); release_size += allocator->ProcessEventsAndFreeWithRelease();
} }
VLOG(8) << "Release " << release_size
<< " bytes memory from all stream for place " << place;
return release_size; return release_size;
} }
......
...@@ -25,7 +25,9 @@ ...@@ -25,7 +25,9 @@
#include <vector> #include <vector>
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h"
namespace paddle { namespace paddle {
...@@ -38,6 +40,14 @@ __global__ void add_kernel(int *x, int n) { ...@@ -38,6 +40,14 @@ __global__ void add_kernel(int *x, int n) {
} }
} }
void CheckMemLeak(const platform::CUDAPlace &place) {
uint64_t cuda_malloc_size =
platform::RecordedGpuMallocSize(place.GetDeviceId());
ASSERT_EQ(cuda_malloc_size, 0) << "Found " << cuda_malloc_size
<< " bytes memory that not released yet,"
<< " there may be a memory leak problem";
}
class StreamSafeCUDAAllocTest : public ::testing::Test { class StreamSafeCUDAAllocTest : public ::testing::Test {
protected: protected:
void SetUp() override { void SetUp() override {
...@@ -143,11 +153,7 @@ class StreamSafeCUDAAllocTest : public ::testing::Test { ...@@ -143,11 +153,7 @@ class StreamSafeCUDAAllocTest : public ::testing::Test {
#endif #endif
} }
uint64_t cuda_malloc_size = CheckMemLeak(place_);
platform::RecordedGpuMallocSize(place_.GetDeviceId());
ASSERT_EQ(cuda_malloc_size, 0) << "Found " << cuda_malloc_size
<< " bytes memory that not released yet,"
<< " there may be a memory leak problem";
} }
size_t stream_num_; size_t stream_num_;
...@@ -186,8 +192,61 @@ TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) { ...@@ -186,8 +192,61 @@ TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) {
Alloc(place, alloc_size, default_stream); Alloc(place, alloc_size, default_stream);
EXPECT_GE(allocation_unique->size(), alloc_size); EXPECT_GE(allocation_unique->size(), alloc_size);
EXPECT_EQ(allocation_unique->ptr(), address); EXPECT_EQ(allocation_unique->ptr(), address);
allocation_unique.reset();
Release(place);
CheckMemLeak(place);
} }
TEST(StreamSafeCUDAAllocInterfaceTest, GetAllocatorInterfaceTest) {
platform::CUDAPlace place = platform::CUDAPlace();
auto &instance = allocation::AllocatorFacade::Instance();
const std::shared_ptr<Allocator> &allocator = instance.GetAllocator(place);
size_t alloc_size = 256;
std::shared_ptr<Allocation> allocation_from_allocator =
allocator->Allocate(alloc_size);
EXPECT_GE(allocation_from_allocator->size(), alloc_size);
void *address = allocation_from_allocator->ptr();
allocation_from_allocator.reset();
std::shared_ptr<Allocation> allocation_implicit_stream =
AllocShared(place, alloc_size);
EXPECT_GE(allocation_implicit_stream->size(), alloc_size);
EXPECT_EQ(allocation_implicit_stream->ptr(), address);
allocation_implicit_stream.reset();
Release(place);
CheckMemLeak(place);
}
#ifdef PADDLE_WITH_CUDA
TEST(StreamSafeCUDAAllocInterfaceTest, CUDAGraphExceptionTest) {
platform::CUDAPlace place = platform::CUDAPlace();
size_t alloc_size = 1;
std::shared_ptr<Allocation> allocation = AllocShared(place, alloc_size);
platform::BeginCUDAGraphCapture(place, cudaStreamCaptureModeGlobal);
EXPECT_THROW(AllocShared(place, alloc_size), paddle::platform::EnforceNotMet);
EXPECT_THROW(Alloc(place, alloc_size), paddle::platform::EnforceNotMet);
EXPECT_THROW(Release(place), paddle::platform::EnforceNotMet);
EXPECT_THROW(allocation::AllocatorFacade::Instance().GetAllocator(place),
paddle::platform::EnforceNotMet);
EXPECT_THROW(AllocShared(place, alloc_size, nullptr),
paddle::platform::EnforceNotMet);
EXPECT_THROW(Alloc(place, alloc_size, nullptr),
paddle::platform::EnforceNotMet);
EXPECT_THROW(Release(place, nullptr), paddle::platform::EnforceNotMet);
EXPECT_THROW(RecordStream(allocation.get(), nullptr),
paddle::platform::EnforceNotMet);
platform::EndCUDAGraphCapture();
allocation.reset();
Release(place);
CheckMemLeak(place);
}
#endif
TEST(StreamSafeCUDAAllocRetryTest, RetryTest) { TEST(StreamSafeCUDAAllocRetryTest, RetryTest) {
platform::CUDAPlace place = platform::CUDAPlace(); platform::CUDAPlace place = platform::CUDAPlace();
gpuStream_t stream1, stream2; gpuStream_t stream1, stream2;
...@@ -223,6 +282,7 @@ TEST(StreamSafeCUDAAllocRetryTest, RetryTest) { ...@@ -223,6 +282,7 @@ TEST(StreamSafeCUDAAllocRetryTest, RetryTest) {
Release(place, stream1); Release(place, stream1);
Release(place, stream2); Release(place, stream2);
CheckMemLeak(place);
} }
} // namespace memory } // namespace memory
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册