未验证 提交 b4a67491 编写于 作者: F From00 提交者: GitHub

Fix CUDAGraphAllocator bug for StreamSafeCUDAAllocator (#37821)

* Fix CUDAGraph bug for StreamSafeCUDAAllocator

* Add CUDAGraphAllocator check in multi-stream interface

* Set FLAGS_use_stream_safe_cuda_allocator defaulted to false

* Fix environment error for cmake

* Fix cmake error

* Add UT of GetAllocatorInterfaceTest

* Add UT of CUDAGraphExceptionTest

* Enhance CUDAGraphExceptionTest
上级 2c02a580
......@@ -19,13 +19,12 @@ if (WITH_GPU)
DEPS device_context malloc)
nv_test(stream_safe_cuda_alloc_test
SRCS stream_safe_cuda_alloc_test.cu
DEPS malloc)
DEPS malloc cuda_graph_with_memory_pool)
if(WITH_TESTING AND TEST stream_safe_cuda_alloc_test)
set_tests_properties(stream_safe_cuda_alloc_test PROPERTIES
ENVIRONMENT "FLAGS_use_system_allocator=false"
ENVIRONMENT "FLAGS_enable_stream_safe_cuda_allocator=true"
ENVIRONMENT "FLAGS_allocator_strategy=auto_growth")
ENVIRONMENT "FLAGS_use_stream_safe_cuda_allocator=true;
FLAGS_allocator_strategy=auto_growth")
endif()
endif()
......
......@@ -71,7 +71,7 @@ PADDLE_DEFINE_EXPORTED_bool(use_virtual_memory_auto_growth, false,
// NOTE(Ruibiao): This FLAGS is just to be compatibled with
// the old single-stream CUDA allocator. It will be removed
// after StreamSafeCudaAllocator has been fully tested.
PADDLE_DEFINE_EXPORTED_bool(use_stream_safe_cuda_allocator, true,
PADDLE_DEFINE_EXPORTED_bool(use_stream_safe_cuda_allocator, false,
"Enable StreamSafeCUDAAllocator");
DECLARE_string(allocator_strategy);
......@@ -737,10 +737,18 @@ const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
FLAGS_use_system_allocator == false) {
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
return m_->GetAllocator(place,
/* A non-zero num to choose allocator_ */ 1);
}
#endif
return m_->GetAllocator(BOOST_GET_CONST(platform::CUDAPlace, place),
m_->GetDefaultStream());
}
#endif
return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
}
......@@ -754,10 +762,17 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
size > 0 && FLAGS_use_system_allocator == false) {
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
return m_->GetAllocator(place, size)->Allocate(size);
}
#endif
return Alloc(BOOST_GET_CONST(platform::CUDAPlace, place), size,
m_->GetDefaultStream());
}
#endif
return m_->GetAllocator(place, size)->Allocate(size);
}
......@@ -765,6 +780,14 @@ uint64_t AllocatorFacade::Release(const platform::Place& place) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
FLAGS_use_system_allocator == false) {
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
return m_
->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1)
->Release(place);
}
#endif
return Release(BOOST_GET_CONST(platform::CUDAPlace, place),
m_->GetDefaultStream());
}
......@@ -783,6 +806,14 @@ std::shared_ptr<Allocation> AllocatorFacade::AllocShared(
"multi-stream 'AllocaShared' function. "
"To enable it, you can enter 'export "
"FLAGS_use_stream_safe_cuda_allocator=true' in the terminal."));
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
PADDLE_THROW(platform::errors::Unavailable(
"Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
}
#endif
return std::shared_ptr<Allocation>(Alloc(place, size, stream));
}
......@@ -795,6 +826,14 @@ AllocationPtr AllocatorFacade::Alloc(const platform::CUDAPlace& place,
"multi-stream 'Alloca' function. "
"To enable it, you can enter 'export "
"FLAGS_use_stream_safe_cuda_allocator=true' in the terminal."));
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
PADDLE_THROW(platform::errors::Unavailable(
"Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
}
#endif
if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) {
return m_->GetAllocator(place, stream, /* creat_if_not_found = */ true)
->Allocate(size);
......@@ -812,6 +851,14 @@ uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place,
"multi-stream 'Release' function. "
"To enable it, you can enter 'export "
"FLAGS_use_stream_safe_cuda_allocator=true' in the terminal."));
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
PADDLE_THROW(platform::errors::Unavailable(
"Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
}
#endif
return m_->GetAllocator(place, stream)->Release(place);
}
......@@ -824,6 +871,14 @@ void AllocatorFacade::RecordStream(Allocation* allocation,
"'RecordStream' function. "
"To enable it, you can enter 'export "
"FLAGS_use_stream_safe_cuda_allocator=true' in the terminal."));
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
PADDLE_THROW(platform::errors::Unavailable(
"Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
}
#endif
m_->RecordStream(allocation, stream);
}
......
......@@ -103,6 +103,8 @@ uint64_t StreamSafeCUDAAllocator::ReleaseImpl(const platform::Place& place) {
for (StreamSafeCUDAAllocator* allocator : allocators) {
release_size += allocator->ProcessEventsAndFreeWithRelease();
}
VLOG(8) << "Release " << release_size
<< " bytes memory from all stream for place " << place;
return release_size;
}
......
......@@ -25,7 +25,9 @@
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
namespace paddle {
......@@ -38,6 +40,14 @@ __global__ void add_kernel(int *x, int n) {
}
}
// Assert that every GPU allocation recorded for `place` has been released;
// any remaining bytes indicate a memory leak in the finished test.
void CheckMemLeak(const platform::CUDAPlace &place) {
  const uint64_t remaining_size =
      platform::RecordedGpuMallocSize(place.GetDeviceId());
  ASSERT_EQ(remaining_size, 0) << "Found " << remaining_size
                               << " bytes memory that not released yet,"
                               << " there may be a memory leak problem";
}
class StreamSafeCUDAAllocTest : public ::testing::Test {
protected:
void SetUp() override {
......@@ -143,11 +153,7 @@ class StreamSafeCUDAAllocTest : public ::testing::Test {
#endif
}
uint64_t cuda_malloc_size =
platform::RecordedGpuMallocSize(place_.GetDeviceId());
ASSERT_EQ(cuda_malloc_size, 0) << "Found " << cuda_malloc_size
<< " bytes memory that not released yet,"
<< " there may be a memory leak problem";
CheckMemLeak(place_);
}
size_t stream_num_;
......@@ -186,8 +192,61 @@ TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) {
Alloc(place, alloc_size, default_stream);
EXPECT_GE(allocation_unique->size(), alloc_size);
EXPECT_EQ(allocation_unique->ptr(), address);
allocation_unique.reset();
Release(place);
CheckMemLeak(place);
}
// Verify that the allocator handle obtained via GetAllocator() draws from the
// same pool as the implicit-stream AllocShared() interface: a block freed
// through the former is reused to satisfy a request through the latter.
TEST(StreamSafeCUDAAllocInterfaceTest, GetAllocatorInterfaceTest) {
  const size_t alloc_size = 256;
  platform::CUDAPlace place = platform::CUDAPlace();

  const std::shared_ptr<Allocator> &allocator =
      allocation::AllocatorFacade::Instance().GetAllocator(place);
  std::shared_ptr<Allocation> direct_allocation =
      allocator->Allocate(alloc_size);
  EXPECT_GE(direct_allocation->size(), alloc_size);
  void *recycled_address = direct_allocation->ptr();
  direct_allocation.reset();  // hand the block back to the pool

  std::shared_ptr<Allocation> implicit_stream_allocation =
      AllocShared(place, alloc_size);
  EXPECT_GE(implicit_stream_allocation->size(), alloc_size);
  // The just-freed block should back the new allocation.
  EXPECT_EQ(implicit_stream_allocation->ptr(), recycled_address);
  implicit_stream_allocation.reset();

  Release(place);
  CheckMemLeak(place);
}
#ifdef PADDLE_WITH_CUDA
// While a CUDA graph capture is in progress, every multi-stream /
// stream-safe allocator facade entry point is expected to reject the call by
// throwing EnforceNotMet instead of mixing StreamSafeCUDAAllocator with the
// CUDAGraphAllocator.
TEST(StreamSafeCUDAAllocInterfaceTest, CUDAGraphExceptionTest) {
  platform::CUDAPlace place = platform::CUDAPlace();
  size_t alloc_size = 1;
  // Allocate BEFORE capture starts so RecordStream below has a live target.
  std::shared_ptr<Allocation> allocation = AllocShared(place, alloc_size);

  platform::BeginCUDAGraphCapture(place, cudaStreamCaptureModeGlobal);
  // Implicit-stream interfaces must throw during capture.
  EXPECT_THROW(AllocShared(place, alloc_size), paddle::platform::EnforceNotMet);
  EXPECT_THROW(Alloc(place, alloc_size), paddle::platform::EnforceNotMet);
  EXPECT_THROW(Release(place), paddle::platform::EnforceNotMet);
  EXPECT_THROW(allocation::AllocatorFacade::Instance().GetAllocator(place),
               paddle::platform::EnforceNotMet);
  // Explicit-stream interfaces must throw during capture as well.
  EXPECT_THROW(AllocShared(place, alloc_size, nullptr),
               paddle::platform::EnforceNotMet);
  EXPECT_THROW(Alloc(place, alloc_size, nullptr),
               paddle::platform::EnforceNotMet);
  EXPECT_THROW(Release(place, nullptr), paddle::platform::EnforceNotMet);
  EXPECT_THROW(RecordStream(allocation.get(), nullptr),
               paddle::platform::EnforceNotMet);
  platform::EndCUDAGraphCapture();

  // After capture ends the normal release path must work and leave no leak.
  allocation.reset();
  Release(place);
  CheckMemLeak(place);
}
#endif
TEST(StreamSafeCUDAAllocRetryTest, RetryTest) {
platform::CUDAPlace place = platform::CUDAPlace();
gpuStream_t stream1, stream2;
......@@ -223,6 +282,7 @@ TEST(StreamSafeCUDAAllocRetryTest, RetryTest) {
Release(place, stream1);
Release(place, stream2);
CheckMemLeak(place);
}
} // namespace memory
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册