From ea6ee76fa9f3971ed92cc7a3f62326500c37fb43 Mon Sep 17 00:00:00 2001
From: Huihuang Zheng
Date: Wed, 31 Jul 2019 17:09:32 +0800
Subject: [PATCH] GPU allocation uses fraction of available memory (#18896)

GPU allocation now uses a fraction of the currently available memory;
also fix GetUsed being called without holding the lock.
---
 paddle/fluid/memory/detail/buddy_allocator.cc |   9 +-
 .../memory/detail/buddy_allocator_test.cc     | 134 ++++++++++++++----
 paddle/fluid/platform/gpu_info.cc             |  84 +++++------
 paddle/fluid/platform/gpu_info.h              |   4 +
 4 files changed, 153 insertions(+), 78 deletions(-)

diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc
index edd6ea4adec..8fce86eeec8 100644
--- a/paddle/fluid/memory/detail/buddy_allocator.cc
+++ b/paddle/fluid/memory/detail/buddy_allocator.cc
@@ -23,6 +23,10 @@ DEFINE_bool(free_idle_memory, false,
             "If it is true, Paddle will try to free idle memory trunks during "
             "running time.");
 
+#ifdef PADDLE_WITH_CUDA
+DECLARE_uint64(reallocate_gpu_memory_in_mb);
+#endif
+
 namespace paddle {
 namespace memory {
 namespace detail {
@@ -200,8 +204,9 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool(
     // Compute the allocation size for gpu for the first allocation.
     allocate_bytes = std::max(platform::GpuInitAllocSize(), request_bytes);
   } else {
-    // Reallocation size
-    if (realloc_size_ == 0) {
+    // Compute the re-allocation size. The cached size is reused only when
+    // the user sets FLAGS_reallocate_gpu_memory_in_mb to a fixed value.
+    if (realloc_size_ == 0 || FLAGS_reallocate_gpu_memory_in_mb == 0ul) {
       realloc_size_ = platform::GpuReallocSize();
     }
     allocate_bytes = std::max(realloc_size_, request_bytes);
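For reviewers skimming the hunk above: with FLAGS_reallocate_gpu_memory_in_mb left at 0, the refill size is now recomputed from the memory that is free at refill time instead of being cached once. Below is a stand-alone sketch of that decision, where init_alloc and realloc_now stand in for platform::GpuInitAllocSize() and platform::GpuReallocSize(); it is a hypothetical model, not Paddle code:

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>

    // Hypothetical stand-alone model of the refill-size decision; the real
    // logic lives in BuddyAllocator::RefillPool above.
    std::size_t ChooseRefillBytes(std::size_t request_bytes, bool first_alloc,
                                  std::uint64_t flag_reallocate_mb,
                                  std::size_t init_alloc,
                                  std::size_t realloc_now,
                                  std::size_t* cached_realloc) {
      if (first_alloc) {
        // First refill: start from the configured initial pool size.
        return std::max(init_alloc, request_bytes);
      }
      // With the flag at 0 the size is recomputed from the currently free
      // memory on every refill; a non-zero flag pins it, so the cached value
      // is reused.
      if (*cached_realloc == 0 || flag_reallocate_mb == 0) {
        *cached_realloc = realloc_now;
      }
      return std::max(*cached_realloc, request_bytes);
    }

Caching only the flag-pinned value keeps repeated refills cheap without freezing the fraction-based size at its first, possibly stale, measurement.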
diff --git a/paddle/fluid/memory/detail/buddy_allocator_test.cc b/paddle/fluid/memory/detail/buddy_allocator_test.cc
index 1edc9f2034c..24154ad9ac3 100644
--- a/paddle/fluid/memory/detail/buddy_allocator_test.cc
+++ b/paddle/fluid/memory/detail/buddy_allocator_test.cc
@@ -22,6 +22,8 @@ limitations under the License. */
 #include "paddle/fluid/platform/gpu_info.h"
 
 #ifdef PADDLE_WITH_CUDA
+#include <cuda_runtime.h>
+
 DECLARE_double(fraction_of_gpu_memory_to_use);
 DECLARE_uint64(initial_gpu_memory_in_mb);
 DECLARE_uint64(reallocate_gpu_memory_in_mb);
@@ -31,9 +33,11 @@ namespace paddle {
 namespace memory {
 namespace detail {
 
-constexpr static int test_gpu_id = 0;
+constexpr static int TEST_GPU_ID = 0;
 
-void TestBuddyAllocator(BuddyAllocator* allocator, size_t size_bytes) {
+int* TestBuddyAllocator(BuddyAllocator* allocator, size_t size_bytes,
+                        bool use_system_allocator = false,
+                        bool free_ptr = true) {
   bool freed = false;
   size_t used_bytes = allocator->Used();
 
@@ -41,19 +45,25 @@ void TestBuddyAllocator(BuddyAllocator* allocator, size_t size_bytes) {
   void* p = allocator->Alloc(size_bytes);
   EXPECT_NE(p, nullptr);
+
 #ifdef PADDLE_WITH_CUDA
-  if (size_bytes < platform::GpuMaxChunkSize()) {
+  if (size_bytes < allocator->GetMaxChunkSize()) {
 #else
-  if (size_bytes < platform::CpuMaxChunkSize()) {
+  if (size_bytes < allocator->GetMaxChunkSize()) {
 #endif
     // Not allocate from SystemAllocator
+    EXPECT_FALSE(use_system_allocator);
     EXPECT_GE(allocator->Used(), used_bytes + size_bytes);
   } else {
     // Allocate from SystemAllocator doesn't count in Used()
+    EXPECT_TRUE(use_system_allocator);
     EXPECT_EQ(allocator->Used(), used_bytes);
   }
 
   int* intp = static_cast<int*>(p);
+  if (!free_ptr) {
+    return intp;
+  }
   std::shared_ptr<int> ptr(intp, [&](void* p) {
     allocator->Free(intp);
     freed = true;
   });
@@ -64,20 +74,30 @@ void TestBuddyAllocator(BuddyAllocator* allocator, size_t size_bytes) {
 
   EXPECT_EQ(used_bytes, allocator->Used());
   EXPECT_TRUE(freed);
+  return nullptr;
 }
 
 #ifdef PADDLE_WITH_CUDA
 TEST(BuddyAllocator, GpuFraction) {
+  // On a 16 GB machine, the pool size will be about 160 MB
   FLAGS_fraction_of_gpu_memory_to_use = 0.01;
+  FLAGS_initial_gpu_memory_in_mb = 0;
+  FLAGS_reallocate_gpu_memory_in_mb = 0;
 
   BuddyAllocator buddy_allocator(
-      std::unique_ptr<SystemAllocator>(new GPUAllocator(test_gpu_id)),
+      std::unique_ptr<SystemAllocator>(new GPUAllocator(TEST_GPU_ID)),
       platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
 
+  // Less than pool size
   TestBuddyAllocator(&buddy_allocator, 10);
   TestBuddyAllocator(&buddy_allocator, 10 << 10);
   TestBuddyAllocator(&buddy_allocator, 10 << 20);
-  TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30));
+
+  // Greater than max chunk size
+  TestBuddyAllocator(&buddy_allocator, 499 << 20,
+                     /* use_system_allocator = */ true);
+  TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30),
+                     /* use_system_allocator = */ true);
 }
 
 TEST(BuddyAllocator, InitRealloc) {
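Why the sizes in InitRealloc below split where they do: with both flags set, GpuMaxChunkSize() now reduces to max(initial, reallocate), so with 100 MB / 50 MB the pool chunk ceiling is 100 MB and a 101 MB request must come from the system allocator. A self-contained check of that arithmetic (plain C++ with assumed values mirroring the test; not Paddle code):

    #include <algorithm>
    #include <cassert>
    #include <cstddef>

    int main() {
      // Flag settings mirroring TEST(BuddyAllocator, InitRealloc).
      const std::size_t initial_mb = 100, reallocate_mb = 50;
      // After this patch, GpuMaxChunkSize() == GpuMaxAllocSize()
      //                == max(initial, reallocate) when both flags are set.
      const std::size_t max_chunk = std::max(initial_mb, reallocate_mb) << 20;
      assert((std::size_t{99} << 20) < max_chunk);    // stays in the buddy pool
      assert((std::size_t{101} << 20) >= max_chunk);  // handed to SystemAllocator
      return 0;
    }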
@@ -87,19 +107,19 @@ TEST(BuddyAllocator, InitRealloc) {
   EXPECT_EQ(platform::GpuMaxChunkSize(), static_cast<size_t>(100 << 20));
 
   BuddyAllocator buddy_allocator(
-      std::unique_ptr<SystemAllocator>(new GPUAllocator(test_gpu_id)),
+      std::unique_ptr<SystemAllocator>(new GPUAllocator(TEST_GPU_ID)),
       platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
 
   // Less then initial size and reallocate size
   TestBuddyAllocator(&buddy_allocator, 10 << 20);
   // Between initial size and reallocate size and not exceed pool
   TestBuddyAllocator(&buddy_allocator, 80 << 20);
-  // Less then reallocate size and exceed pool
-  TestBuddyAllocator(&buddy_allocator, 40 << 20);
-  // Greater then reallocate size and exceed pool
-  TestBuddyAllocator(&buddy_allocator, 80 << 20);
-  // Greater then initial size and reallocate size
-  TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30));
+  TestBuddyAllocator(&buddy_allocator, 99 << 20);
+  // Greater than max chunk size
+  TestBuddyAllocator(&buddy_allocator, 101 << 20,
+                     /* use_system_allocator = */ true);
+  TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30),
+                     /* use_system_allocator = */ true);
 }
 
 TEST(BuddyAllocator, ReallocSizeGreaterThanInit) {
@@ -109,23 +129,87 @@ TEST(BuddyAllocator, ReallocSizeGreaterThanInit) {
   EXPECT_EQ(platform::GpuMaxChunkSize(), static_cast<size_t>(10 << 20));
 
   BuddyAllocator buddy_allocator(
-      std::unique_ptr<SystemAllocator>(new GPUAllocator(test_gpu_id)),
+      std::unique_ptr<SystemAllocator>(new GPUAllocator(TEST_GPU_ID)),
       platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
 
-  // Less then initial size and reallocate size
+  // Less than initial size and reallocate size
   TestBuddyAllocator(&buddy_allocator, 1 << 20);
-  // Between initial size and reallocate size and not exceed pool
-  TestBuddyAllocator(&buddy_allocator, 3 << 20);
-  // Less then initial size and exceed pool
-  TestBuddyAllocator(&buddy_allocator, 3 << 20);
-  // Less then reallocate size and not exceed pool (now pool is 15 MB, used 7
-  // MB)
-  TestBuddyAllocator(&buddy_allocator, 7 << 20);
-  // Less then reallocate size and exceed pool
+  // Between initial size and reallocate size and exceeds pool
+  TestBuddyAllocator(&buddy_allocator, 6 << 20);
   TestBuddyAllocator(&buddy_allocator, 8 << 20);
-  // Greater then initial size and reallocate size
-  TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30));
+  TestBuddyAllocator(&buddy_allocator, 9 << 20);
+  // Greater than max chunk size
+  TestBuddyAllocator(&buddy_allocator, 11 << 20,
+                     /* use_system_allocator = */ true);
+  TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30),
+                     /* use_system_allocator = */ true);
+}
+
+TEST(BuddyAllocator, FractionRefillPool) {
+  FLAGS_fraction_of_gpu_memory_to_use = 0.6;
+  FLAGS_initial_gpu_memory_in_mb = 0;
+  FLAGS_reallocate_gpu_memory_in_mb = 0;
+
+  size_t max_chunk_size = platform::GpuMaxChunkSize();
+  BuddyAllocator buddy_allocator(
+      std::unique_ptr<SystemAllocator>(new GPUAllocator(TEST_GPU_ID)),
+      platform::GpuMinChunkSize(), max_chunk_size);
+
+  // Less than pool size
+  int* p0 = TestBuddyAllocator(&buddy_allocator, max_chunk_size - 1000,
+                               /* use_system_allocator = */ false,
+                               /* free_ptr = */ false);
+  // Max chunk size should stay the same during allocation
+  EXPECT_EQ(max_chunk_size, buddy_allocator.GetMaxChunkSize());
+
+  size_t alloc =
+      platform::GpuAvailableMemToAlloc() * FLAGS_fraction_of_gpu_memory_to_use;
+  // Exceeding the pool triggers a refill sized as a fraction of the available
+  // GPU memory, so we should be able to alloc 60% of the remaining GPU memory
+  int* p1 = TestBuddyAllocator(&buddy_allocator, alloc,
+                               /* use_system_allocator = */ false,
+                               /* free_ptr = */ false);
+  // Max chunk size should stay the same during allocation
+  EXPECT_EQ(max_chunk_size, buddy_allocator.GetMaxChunkSize());
+
+  alloc =
+      platform::GpuAvailableMemToAlloc() * FLAGS_fraction_of_gpu_memory_to_use;
+  // Exceeding the pool triggers a refill sized as a fraction of the available
+  // GPU memory, so we should be able to alloc 60% of the remaining GPU memory
+  TestBuddyAllocator(&buddy_allocator, alloc,
+                     /* use_system_allocator = */ false);
+  // Max chunk size should stay the same during allocation
+  EXPECT_EQ(max_chunk_size, buddy_allocator.GetMaxChunkSize());
+
+  buddy_allocator.Free(p0);
+  buddy_allocator.Free(p1);
+}
+
+TEST(BuddyAllocator, AllocFromAvailable) {
+  FLAGS_fraction_of_gpu_memory_to_use = 0.7;
+  FLAGS_initial_gpu_memory_in_mb = 0;
+  FLAGS_reallocate_gpu_memory_in_mb = 0;
+
+  size_t total = 0, available = 0;
+  platform::SetDeviceId(TEST_GPU_ID);
+  platform::GpuMemoryUsage(&available, &total);
+
+  // Take up half of the available GPU memory
+  void* p;
+  cudaError_t result = cudaMalloc(&p, available >> 1);
+  EXPECT_TRUE(result == cudaSuccess);
+
+  // BuddyAllocator should be able to alloc the remaining GPU memory
+  BuddyAllocator buddy_allocator(
+      std::unique_ptr<SystemAllocator>(new GPUAllocator(TEST_GPU_ID)),
+      platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
+
+  TestBuddyAllocator(&buddy_allocator, 10);
+  TestBuddyAllocator(&buddy_allocator, 10 << 10);
+  TestBuddyAllocator(&buddy_allocator, 10 << 20);
+  TestBuddyAllocator(&buddy_allocator, static_cast<size_t>(1 << 30));
+}
+
 #endif
 
 }  // namespace detail
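A quick numeric model of what FractionRefillPool above exercises (assumed starting numbers; not Paddle code): with the fraction flag at 0.6, every refill takes 60% of whatever is still free, so successive refills shrink geometrically rather than failing once a fixed quota runs out.

    #include <cstdio>

    int main() {
      double free_mb = 16000.0;     // assume ~16 GB free at start
      const double fraction = 0.6;  // FLAGS_fraction_of_gpu_memory_to_use
      for (int refill = 1; refill <= 3; ++refill) {
        double chunk = free_mb * fraction;  // what one pool refill grabs
        free_mb -= chunk;
        std::printf("refill %d: take %.0f MB, %.0f MB still free\n", refill,
                    chunk, free_mb);
      }
      return 0;  // prints 9600, 3840, 1536 MB: each refill succeeds but shrinks
    }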
diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc
index 2420966288d..09df33e4530 100644
--- a/paddle/fluid/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
@@ -227,47 +227,44 @@ void GpuMemoryUsage(size_t *available, size_t *total) {
                  error_code, CudaErrorWebsite());
 }
 
-size_t GpuMaxAllocSize() {
-  return std::max(GpuInitAllocSize(), GpuReallocSize());
-}
-
-size_t GpuInitAllocSize() {
-  if (FLAGS_initial_gpu_memory_in_mb > 0ul) {
-    // Initial memory will be allocated by FLAGS_initial_gpu_memory_in_mb
-    return static_cast<size_t>(FLAGS_initial_gpu_memory_in_mb << 20);
-  }
-
-  // FLAGS_initial_gpu_memory_in_mb is 0, initial memory will be allocated by
-  // fraction
+size_t GpuAvailableMemToAlloc() {
   size_t total = 0;
   size_t available = 0;
-
-  GpuMemoryUsage(&available, &total);
-  size_t reserving = static_cast<size_t>(fraction_reserve_gpu_memory * total);
-
-  return static_cast<size_t>((total - reserving) *
-                             FLAGS_fraction_of_gpu_memory_to_use);
+  GpuMemoryUsage(&available, &total);
+  size_t reserving = static_cast<size_t>(fraction_reserve_gpu_memory * total);
+  // If the available size is less than the minimum chunk size, no usable
+  // memory exists
+  size_t min_chunk_size = GpuMinChunkSize();
+  size_t available_to_alloc =
+      std::min(available > min_chunk_size ? available : 0, total - reserving);
+  VLOG(10) << "GPU usage " << (available >> 20) << "M/" << (total >> 20)
+           << "M, " << (available_to_alloc >> 20) << "M available to allocate";
+  return available_to_alloc;
 }
 
-size_t GpuReallocSize() {
-  if (FLAGS_reallocate_gpu_memory_in_mb > 0ul) {
-    // Additional memory will be allocated by
-    // FLAGS_reallocate_gpu_memory_in_mb
-    return static_cast<size_t>(FLAGS_reallocate_gpu_memory_in_mb << 20);
-  }
+size_t GpuMaxAllocSize() {
+  return std::max(GpuInitAllocSize(), GpuReallocSize());
+}
 
-  // FLAGS_reallocate_gpu_memory_in_mb is 0, additional memory will be
-  // allocated
-  // by fraction
-  size_t total = 0;
-  size_t available = 0;
+static size_t GpuAllocSize(bool realloc) {
+  size_t available_to_alloc = GpuAvailableMemToAlloc();
+  PADDLE_ENFORCE_GT(available_to_alloc, 0, "Not enough available GPU memory");
+  // If FLAGS_initial_gpu_memory_in_mb is 0, the initial memory is sized by
+  // fraction
+  size_t flag_mb = realloc ? FLAGS_reallocate_gpu_memory_in_mb
+                           : FLAGS_initial_gpu_memory_in_mb;
+  size_t alloc_bytes =
+      (flag_mb > 0ul ? flag_mb << 20
+                     : available_to_alloc * FLAGS_fraction_of_gpu_memory_to_use);
+  PADDLE_ENFORCE_GT(available_to_alloc, alloc_bytes,
+                    "Not enough available GPU memory");
+  VLOG(10) << "Alloc size is " << (alloc_bytes >> 20)
+           << " MiB, re-alloc: " << realloc;
+  return alloc_bytes;
+}
 
-  GpuMemoryUsage(&available, &total);
-  size_t reserving = static_cast<size_t>(fraction_reserve_gpu_memory * total);
+size_t GpuInitAllocSize() { return GpuAllocSize(/* realloc = */ false); }
 
-  return static_cast<size_t>((total - reserving) *
-                             FLAGS_fraction_of_gpu_memory_to_use);
-}
+size_t GpuReallocSize() { return GpuAllocSize(/* realloc = */ true); }
 
 size_t GpuMinChunkSize() {
   // Allow to allocate the minimum chunk size is 256 bytes.
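To make the new sizing concrete, here is a self-contained walk-through of GpuAvailableMemToAlloc() and the fraction branch of GpuAllocSize() with assumed numbers: a 16 GiB card with 10 GiB currently free, a hypothetical 5% reserve standing in for fraction_reserve_gpu_memory, and the fraction flag at 0.5. Plain C++, not Paddle code:

    #include <algorithm>
    #include <cassert>
    #include <cstddef>

    int main() {
      const std::size_t GiB = std::size_t(1) << 30;
      std::size_t total = 16 * GiB, available = 10 * GiB;
      std::size_t reserving = static_cast<std::size_t>(0.05 * total);  // ~0.8 GiB
      std::size_t min_chunk = 1 << 8;  // GpuMinChunkSize() is 256 bytes
      // GpuAvailableMemToAlloc(): free memory, clamped by (total - reserved).
      std::size_t available_to_alloc =
          std::min(available > min_chunk ? available : 0, total - reserving);
      assert(available_to_alloc == 10 * GiB);  // the clamp does not bite here
      // GpuAllocSize() in fraction mode: half of that, i.e. 5 GiB per chunk.
      std::size_t alloc_bytes =
          static_cast<std::size_t>(available_to_alloc * 0.5);
      assert(alloc_bytes == 5 * GiB);
      return 0;
    }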
@@ -275,24 +272,9 @@ size_t GpuMinChunkSize() {
 }
 
 size_t GpuMaxChunkSize() {
-  size_t total = 0;
-  size_t available = 0;
-
-  GpuMemoryUsage(&available, &total);
-  VLOG(10) << "GPU Usage " << available / 1024 / 1024 << "M/"
-           << total / 1024 / 1024 << "M";
-  size_t reserving = static_cast<size_t>(fraction_reserve_gpu_memory * total);
-  // If available less than minimum chunk size, no usable memory exists.
-  available =
-      std::min(std::max(available, GpuMinChunkSize()) - GpuMinChunkSize(),
-               total - reserving);
-
-  size_t allocating = GpuMaxAllocSize();
-
-  PADDLE_ENFORCE_LE(allocating, available,
-                    "Insufficient GPU memory to allocation.");
-
-  return allocating;
+  size_t max_chunk_size = GpuMaxAllocSize();
+  VLOG(10) << "Max chunk size " << (max_chunk_size >> 20) << "M";
+  return max_chunk_size;
 }
 
 void GpuMemcpyAsync(void *dst, const void *src, size_t count,
diff --git a/paddle/fluid/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h
index d4be7ac97b2..e468c4aab0b 100644
--- a/paddle/fluid/platform/gpu_info.h
+++ b/paddle/fluid/platform/gpu_info.h
@@ -57,6 +57,10 @@ void SetDeviceId(int device_id);
 //! Get the memory usage of current GPU device.
 void GpuMemoryUsage(size_t *available, size_t *total);
 
+//! Get the available memory to allocate, which is the available memory of
+//! the GPU minus the reserved size.
+size_t GpuAvailableMemToAlloc();
+
 //! Get the maximum allocation size of current GPU device.
 size_t GpuMaxAllocSize();
 
--
GitLab
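A closing note on the new header API: GpuAvailableMemToAlloc() is usable outside the allocator as well. A hypothetical caller, shown as a sketch (the in-tree callers are GpuAllocSize() and the tests above; linking against the platform library is assumed):

    #include <cstddef>

    // Declaration copied from gpu_info.h above.
    namespace paddle {
    namespace platform {
    std::size_t GpuAvailableMemToAlloc();
    }  // namespace platform
    }  // namespace paddle

    // Hypothetical caller: size a scratch buffer from what the device can
    // still provide after the reserved slice is excluded.
    std::size_t ScratchBytes(double share) {
      return static_cast<std::size_t>(
          paddle::platform::GpuAvailableMemToAlloc() * share);
    }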