Unverified commit 737f21b8, authored by Zeng Jinle, committed by GitHub

cherry-pick refine default gpu memory, test=release/1.5 (#19281)

Parent 71168dad
@@ -23,6 +23,10 @@
 DEFINE_bool(free_idle_memory, false,
             "If it is true, Paddle will try to free idle memory trunks during "
             "running time.");
+#ifdef PADDLE_WITH_CUDA
+DECLARE_uint64(reallocate_gpu_memory_in_mb);
+#endif
+
 namespace paddle {
 namespace memory {
 namespace detail {
@@ -200,8 +204,9 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool(
     // Compute the allocation size for gpu for the first allocation.
     allocate_bytes = std::max(platform::GpuInitAllocSize(), request_bytes);
   } else {
-    // Reallocation size
-    if (realloc_size_ == 0) {
+    // Compute the re-allocation size; we cache the re-allocation size only
+    // when the user sets FLAGS_reallocate_gpu_memory_in_mb to a fixed value.
+    if (realloc_size_ == 0 || FLAGS_reallocate_gpu_memory_in_mb == 0ul) {
       realloc_size_ = platform::GpuReallocSize();
     }
     allocate_bytes = std::max(realloc_size_, request_bytes);
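The condition added above encodes a subtle caching rule: when FLAGS_reallocate_gpu_memory_in_mb is 0, the refill size is derived from the memory fraction, which shrinks as the device fills up, so it must be recomputed on every refill; only a nonzero flag value may be cached. A minimal self-contained sketch of that rule (ChooseAllocateBytes and GpuReallocSizeStub are hypothetical names, not Paddle symbols):

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>

// Stand-in for platform::GpuReallocSize(); the real function returns either
// the flag value or a fraction of the memory currently available to allocate.
static size_t GpuReallocSizeStub() { return size_t{100} << 20; }

// Sketch of the refill-size decision in BuddyAllocator::RefillPool above.
size_t ChooseAllocateBytes(size_t request_bytes, size_t* realloc_size,
                           uint64_t reallocate_gpu_memory_in_mb) {
  // A zero flag means "size by fraction": recompute every time, since the
  // available memory changes. A nonzero flag is fixed, so cache it once.
  if (*realloc_size == 0 || reallocate_gpu_memory_in_mb == 0) {
    *realloc_size = GpuReallocSizeStub();
  }
  return std::max(*realloc_size, request_bytes);
}
```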
@@ -22,6 +22,8 @@ limitations under the License. */
 #include "paddle/fluid/platform/gpu_info.h"
 
 #ifdef PADDLE_WITH_CUDA
+#include <cuda_runtime.h>
+
 DECLARE_double(fraction_of_gpu_memory_to_use);
 DECLARE_uint64(initial_gpu_memory_in_mb);
 DECLARE_uint64(reallocate_gpu_memory_in_mb);
@@ -31,9 +33,11 @@
 namespace paddle {
 namespace memory {
 namespace detail {
 
-constexpr static int test_gpu_id = 0;
+constexpr static int TEST_GPU_ID = 0;
 
-void TestBuddyAllocator(BuddyAllocator* allocator, size_t size_bytes) {
+int* TestBuddyAllocator(BuddyAllocator* allocator, size_t size_bytes,
+                        bool use_system_allocator = false,
+                        bool free_ptr = true) {
   bool freed = false;
   size_t used_bytes = allocator->Used();
@@ -41,19 +45,25 @@ void TestBuddyAllocator(BuddyAllocator* allocator, size_t size_bytes) {
   void* p = allocator->Alloc(size_bytes);
   EXPECT_NE(p, nullptr);
 
 #ifdef PADDLE_WITH_CUDA
-  if (size_bytes < platform::GpuMaxChunkSize()) {
+  if (size_bytes < allocator->GetMaxChunkSize()) {
 #else
-  if (size_bytes < platform::CpuMaxChunkSize()) {
+  if (size_bytes < allocator->GetMaxChunkSize()) {
 #endif
     // Not allocate from SystemAllocator
+    EXPECT_FALSE(use_system_allocator);
     EXPECT_GE(allocator->Used(), used_bytes + size_bytes);
   } else {
     // Allocate from SystemAllocator doesn't count in Used()
+    EXPECT_TRUE(use_system_allocator);
     EXPECT_EQ(allocator->Used(), used_bytes);
   }
 
   int* intp = static_cast<int*>(p);
+  if (!free_ptr) {
+    return intp;
+  }
+
   std::shared_ptr<int> ptr(intp, [&](void* p) {
     allocator->Free(intp);
     freed = true;
@@ -64,20 +74,30 @@ void TestBuddyAllocator(BuddyAllocator* allocator, size_t size_bytes) {
   EXPECT_EQ(used_bytes, allocator->Used());
   EXPECT_TRUE(freed);
+  return nullptr;
 }
 
 #ifdef PADDLE_WITH_CUDA
 TEST(BuddyAllocator, GpuFraction) {
+  // In a 16 GB machine, the pool size will be about 160 MB
   FLAGS_fraction_of_gpu_memory_to_use = 0.01;
+  FLAGS_initial_gpu_memory_in_mb = 0;
+  FLAGS_reallocate_gpu_memory_in_mb = 0;
 
   BuddyAllocator buddy_allocator(
-      std::unique_ptr<SystemAllocator>(new GPUAllocator(test_gpu_id)),
+      std::unique_ptr<SystemAllocator>(new GPUAllocator(TEST_GPU_ID)),
       platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
 
+  // Less than pool size
   TestBuddyAllocator(&buddy_allocator, 10);
   TestBuddyAllocator(&buddy_allocator, 10 << 10);
   TestBuddyAllocator(&buddy_allocator, 10 << 20);
-  TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30));
+
+  // Greater than max chunk size
+  TestBuddyAllocator(&buddy_allocator, 499 << 20,
+                     /* use_system_allocator = */ true);
+  TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30),
+                     /* use_system_allocator = */ true);
 }
@@ -87,19 +107,19 @@ TEST(BuddyAllocator, InitRealloc) {
   EXPECT_EQ(platform::GpuMaxChunkSize(), static_cast<size_t>(100 << 20));
 
   BuddyAllocator buddy_allocator(
-      std::unique_ptr<SystemAllocator>(new GPUAllocator(test_gpu_id)),
+      std::unique_ptr<SystemAllocator>(new GPUAllocator(TEST_GPU_ID)),
       platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
 
   // Less then initial size and reallocate size
   TestBuddyAllocator(&buddy_allocator, 10 << 20);
   // Between initial size and reallocate size and not exceed pool
   TestBuddyAllocator(&buddy_allocator, 80 << 20);
-  // Less then reallocate size and exceed pool
-  TestBuddyAllocator(&buddy_allocator, 40 << 20);
-  // Greater then reallocate size and exceed pool
-  TestBuddyAllocator(&buddy_allocator, 80 << 20);
-  // Greater then initial size and reallocate size
-  TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30));
+  TestBuddyAllocator(&buddy_allocator, 99 << 20);
+  // Greater than max chunk size
+  TestBuddyAllocator(&buddy_allocator, 101 << 20,
+                     /* use_system_allocator = */ true);
+  TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30),
+                     /* use_system_allocator = */ true);
 }
@@ -109,23 +129,112 @@ TEST(BuddyAllocator, ReallocSizeGreaterThanInit) {
   EXPECT_EQ(platform::GpuMaxChunkSize(), static_cast<size_t>(10 << 20));
 
   BuddyAllocator buddy_allocator(
-      std::unique_ptr<SystemAllocator>(new GPUAllocator(test_gpu_id)),
+      std::unique_ptr<SystemAllocator>(new GPUAllocator(TEST_GPU_ID)),
       platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
 
-  // Less then initial size and reallocate size
+  // Less than initial size and reallocate size
   TestBuddyAllocator(&buddy_allocator, 1 << 20);
-  // Between initial size and reallocate size and not exceed pool
-  TestBuddyAllocator(&buddy_allocator, 3 << 20);
-  // Less then initial size and exceed pool
-  TestBuddyAllocator(&buddy_allocator, 3 << 20);
-  // Less then reallocate size and not exceed pool (now pool is 15 MB, used 7
-  // MB)
-  TestBuddyAllocator(&buddy_allocator, 7 << 20);
-  // Less then reallocate size and exceed pool
+  // Between initial size and reallocate size and exceed pool
+  TestBuddyAllocator(&buddy_allocator, 6 << 20);
   TestBuddyAllocator(&buddy_allocator, 8 << 20);
-  // Greater then initial size and reallocate size
-  TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30));
+  TestBuddyAllocator(&buddy_allocator, 9 << 20);
+  // Greater than max chunk size
+  TestBuddyAllocator(&buddy_allocator, 11 << 20,
+                     /* use_system_allocator = */ true);
+  TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30),
+                     /* use_system_allocator = */ true);
 }
+
+TEST(BuddyAllocator, FractionRefillPool) {
+  FLAGS_fraction_of_gpu_memory_to_use = 0.6;
+  FLAGS_initial_gpu_memory_in_mb = 0;
+  FLAGS_reallocate_gpu_memory_in_mb = 0;
+
+  size_t max_chunk_size = platform::GpuMaxChunkSize();
+  BuddyAllocator buddy_allocator(
+      std::unique_ptr<SystemAllocator>(new GPUAllocator(TEST_GPU_ID)),
+      platform::GpuMinChunkSize(), max_chunk_size);
+
+  // Less than pool size
+  int* p0 = TestBuddyAllocator(&buddy_allocator, max_chunk_size - 1000,
+                               /* use_system_allocator = */ false,
+                               /* free_ptr = */ false);
+  // Max chunk size should stay the same during allocation
+  EXPECT_EQ(max_chunk_size, buddy_allocator.GetMaxChunkSize());
+
+  size_t alloc =
+      platform::GpuAvailableMemToAlloc() * FLAGS_fraction_of_gpu_memory_to_use;
+  // Exceeding the pool triggers a refill sized as a fraction of the available
+  // GPU memory, so it should be possible to alloc 60% of the remaining GPU
+  int* p1 = TestBuddyAllocator(&buddy_allocator, alloc,
+                               /* use_system_allocator = */ false,
+                               /* free_ptr = */ false);
+  // Max chunk size should stay the same during allocation
+  EXPECT_EQ(max_chunk_size, buddy_allocator.GetMaxChunkSize());
+
+  alloc =
+      platform::GpuAvailableMemToAlloc() * FLAGS_fraction_of_gpu_memory_to_use;
+  // Exceeding the pool triggers a refill sized as a fraction of the available
+  // GPU memory, so it should be possible to alloc 60% of the remaining GPU
+  TestBuddyAllocator(&buddy_allocator, alloc,
+                     /* use_system_allocator = */ false);
+  // Max chunk size should stay the same during allocation
+  EXPECT_EQ(max_chunk_size, buddy_allocator.GetMaxChunkSize());
+
+  buddy_allocator.Free(p0);
+  buddy_allocator.Free(p1);
+}
+
+TEST(BuddyAllocator, AllocFromAvailable) {
+  FLAGS_fraction_of_gpu_memory_to_use = 0.7;
+  FLAGS_initial_gpu_memory_in_mb = 0;
+  FLAGS_reallocate_gpu_memory_in_mb = 0;
+
+  size_t total = 0, available = 0;
+  platform::SetDeviceId(TEST_GPU_ID);
+  platform::GpuMemoryUsage(&available, &total);
+
+  // Take half of the available GPU memory
+  void* p;
+  cudaError_t result = cudaMalloc(&p, available >> 1);
+  EXPECT_TRUE(result == cudaSuccess);
+
+  // BuddyAllocator should be able to alloc the remaining GPU memory
+  BuddyAllocator buddy_allocator(
+      std::unique_ptr<SystemAllocator>(new GPUAllocator(TEST_GPU_ID)),
+      platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
+
+  TestBuddyAllocator(&buddy_allocator, 10);
+  TestBuddyAllocator(&buddy_allocator, 10 << 10);
+  TestBuddyAllocator(&buddy_allocator, 10 << 20);
+  TestBuddyAllocator(&buddy_allocator, static_cast<size_t>(1 << 30));
+
+  if (p) {
+    EXPECT_TRUE(cudaFree(p) == cudaSuccess);
+  }
+}
+
+TEST(BuddyAllocator, AllocFromAvailableWhenFractionIsOne) {
+  FLAGS_fraction_of_gpu_memory_to_use = 1.0;
+  FLAGS_initial_gpu_memory_in_mb = 0;
+  FLAGS_reallocate_gpu_memory_in_mb = 0;
+
+  void* p = nullptr;
+  EXPECT_TRUE(cudaMalloc(&p, static_cast<size_t>(4) << 30) == cudaSuccess);
+
+  // BuddyAllocator should be able to alloc the remaining GPU memory
+  BuddyAllocator buddy_allocator(
+      std::unique_ptr<SystemAllocator>(new GPUAllocator(TEST_GPU_ID)),
+      platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
+
+  TestBuddyAllocator(&buddy_allocator, static_cast<size_t>(1) << 30);
+  TestBuddyAllocator(&buddy_allocator, static_cast<size_t>(5) << 30);
+
+  if (p) {
+    EXPECT_TRUE(cudaFree(p) == cudaSuccess);
+  }
+}
 
 #endif
 }  // namespace detail
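The FractionRefillPool test above relies on the fact that each refill requests FLAGS_fraction_of_gpu_memory_to_use of whatever is still free, so successive refills shrink geometrically instead of failing outright. A self-contained sketch of that arithmetic, assuming a hypothetical card with 16 GiB free and ignoring the reserved slice:

```cpp
#include <cstdio>

int main() {
  double available_gib = 16.0;  // assumed free device memory
  const double fraction = 0.6;  // FLAGS_fraction_of_gpu_memory_to_use
  for (int refill = 0; refill < 3; ++refill) {
    double chunk = available_gib * fraction;  // what one refill would grab
    std::printf("refill %d: %.2f GiB (%.2f GiB still free)\n", refill, chunk,
                available_gib - chunk);
    available_gib -= chunk;  // the pool now owns this memory
  }
  return 0;  // prints 9.60, 3.84 and 1.54 GiB: 60% of the remainder each time
}
```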
@@ -215,45 +215,47 @@ void GpuMemoryUsage(size_t *available, size_t *total) {
       "cudaMemGetInfo failed in paddle::platform::GetMemoryUsage");
 }
 
-size_t GpuMaxAllocSize() {
-  return std::max(GpuInitAllocSize(), GpuReallocSize());
-}
-
-size_t GpuInitAllocSize() {
-  if (FLAGS_initial_gpu_memory_in_mb > 0ul) {
-    // Initial memory will be allocated by FLAGS_initial_gpu_memory_in_mb
-    return static_cast<size_t>(FLAGS_initial_gpu_memory_in_mb << 20);
-  }
-
-  // FLAGS_initial_gpu_memory_in_mb is 0, initial memory will be allocated by
-  // fraction
-  size_t total = 0;
-  size_t available = 0;
-
-  GpuMemoryUsage(&available, &total);
-  size_t reserving = static_cast<size_t>(fraction_reserve_gpu_memory * total);
-
-  return static_cast<size_t>((total - reserving) *
-                             FLAGS_fraction_of_gpu_memory_to_use);
-}
-
-size_t GpuReallocSize() {
-  if (FLAGS_reallocate_gpu_memory_in_mb > 0ul) {
-    // Additional memory will be allocated by FLAGS_reallocate_gpu_memory_in_mb
-    return static_cast<size_t>(FLAGS_reallocate_gpu_memory_in_mb << 20);
-  }
-
-  // FLAGS_reallocate_gpu_memory_in_mb is 0, additional memory will be
-  // allocated by fraction
-  size_t total = 0;
-  size_t available = 0;
-
-  GpuMemoryUsage(&available, &total);
-  size_t reserving = static_cast<size_t>(fraction_reserve_gpu_memory * total);
-
-  return static_cast<size_t>((total - reserving) *
-                             FLAGS_fraction_of_gpu_memory_to_use);
-}
+size_t GpuAvailableMemToAlloc() {
+  size_t total = 0;
+  size_t available = 0;
+  GpuMemoryUsage(&available, &total);
+  size_t reserving =
+      static_cast<size_t>(fraction_reserve_gpu_memory * available);
+  // If the available size is less than the minimum chunk size, no usable
+  // memory exists
+  size_t available_to_alloc = available - reserving;
+  size_t min_chunk_size = GpuMinChunkSize();
+  if (available_to_alloc < min_chunk_size) {
+    available_to_alloc = 0;
+  }
+  VLOG(10) << "GPU usage " << (available >> 20) << "M/" << (total >> 20)
+           << "M, " << (available_to_alloc >> 20) << "M available to allocate";
+  return available_to_alloc;
+}
+
+size_t GpuMaxAllocSize() {
+  return std::max(GpuInitAllocSize(), GpuReallocSize());
+}
+
+static size_t GpuAllocSize(bool realloc) {
+  size_t available_to_alloc = GpuAvailableMemToAlloc();
+  PADDLE_ENFORCE_GT(available_to_alloc, 0, "Not enough available GPU memory");
+  // If the flag is 0, the (re-)allocation size is determined by the fraction
+  size_t flag_mb = realloc ? FLAGS_reallocate_gpu_memory_in_mb
+                           : FLAGS_initial_gpu_memory_in_mb;
+  size_t alloc_bytes =
+      (flag_mb > 0ul ? flag_mb << 20
+                     : available_to_alloc * FLAGS_fraction_of_gpu_memory_to_use);
+  PADDLE_ENFORCE_GE(available_to_alloc, alloc_bytes,
+                    "Not enough available GPU memory");
+  VLOG(10) << "Alloc size is " << (alloc_bytes >> 20)
+           << " MiB, is it Re-alloc: " << realloc;
+  return alloc_bytes;
+}
+
+size_t GpuInitAllocSize() { return GpuAllocSize(/* realloc = */ false); }
+
+size_t GpuReallocSize() { return GpuAllocSize(/* realloc = */ true); }
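A worked example makes the new GpuAvailableMemToAlloc() concrete. The numbers are assumed for illustration: a 5% reserve fraction (fraction_reserve_gpu_memory is a constant defined elsewhere in this file) and 8 GiB reported as free. Note the reserve is now computed from the available memory rather than from the total:

```cpp
#include <cstddef>
#include <cstdio>

int main() {
  const double kReserveFraction = 0.05;  // assumption for illustration
  size_t available = size_t{8} << 30;    // assume 8 GiB reported as free
  size_t reserving = static_cast<size_t>(kReserveFraction * available);
  size_t min_chunk_size = 1 << 8;        // 256 bytes, per GpuMinChunkSize()
  size_t available_to_alloc = available - reserving;
  if (available_to_alloc < min_chunk_size) available_to_alloc = 0;
  std::printf("%zu MiB available to allocate\n", available_to_alloc >> 20);
  return 0;  // ~7782 MiB: 8192 MiB minus the ~410 MiB reserve
}
```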
@@ -261,24 +263,9 @@ size_t GpuMinChunkSize() {
   // Allow to allocate the minimum chunk size is 256 bytes.
   return 1 << 8;
 }
 
 size_t GpuMaxChunkSize() {
-  size_t total = 0;
-  size_t available = 0;
-
-  GpuMemoryUsage(&available, &total);
-  VLOG(10) << "GPU Usage " << available / 1024 / 1024 << "M/"
-           << total / 1024 / 1024 << "M";
-  size_t reserving = static_cast<size_t>(fraction_reserve_gpu_memory * total);
-  // If available less than minimum chunk size, no usable memory exists.
-  available =
-      std::min(std::max(available, GpuMinChunkSize()) - GpuMinChunkSize(),
-               total - reserving);
-
-  size_t allocating = GpuMaxAllocSize();
-
-  PADDLE_ENFORCE_LE(allocating, available,
-                    "Insufficient GPU memory to allocation.");
-
-  return allocating;
+  size_t max_chunk_size = GpuMaxAllocSize();
+  VLOG(10) << "Max chunk size " << (max_chunk_size >> 20) << "M";
+  return max_chunk_size;
 }
 
 void GpuMemcpyAsync(void *dst, const void *src, size_t count,
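The refactored GpuAllocSize() reduces the two old near-duplicate functions to one selection: a nonzero size flag wins outright, otherwise a fraction of the currently allocatable memory is used. A sketch of that selection with two assumed settings (SelectAllocBytes is a name local to this example, not a Paddle symbol):

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>

static size_t SelectAllocBytes(uint64_t flag_mb, double fraction,
                               size_t available_to_alloc) {
  // Mirrors GpuAllocSize above: flag first, fraction as the fallback.
  return flag_mb > 0 ? static_cast<size_t>(flag_mb) << 20
                     : static_cast<size_t>(available_to_alloc * fraction);
}

int main() {
  const size_t kAvail = size_t{4} << 30;  // assume 4 GiB allocatable
  // Flag set to 500 MB: the fraction flag is ignored, the pool grows 500 MiB.
  std::printf("%zu MiB\n", SelectAllocBytes(500, 0.92, kAvail) >> 20);
  // Flag left at 0 with fraction 0.5: half the allocatable memory, 2048 MiB.
  std::printf("%zu MiB\n", SelectAllocBytes(0, 0.5, kAvail) >> 20);
  return 0;
}
```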
@@ -57,6 +57,10 @@ void SetDeviceId(int device_id);
 //! Get the memory usage of current GPU device.
 void GpuMemoryUsage(size_t *available, size_t *total);
 
+//! Get the available memory to allocate, which is the available GPU memory
+//! minus the reserved size.
+size_t GpuAvailableMemToAlloc();
+
 //! Get the maximum allocation size of current GPU device.
 size_t GpuMaxAllocSize();
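For callers of the header, the new declaration slots in next to the existing queries; a minimal usage sketch (not part of the commit), assuming a device has already been selected with platform::SetDeviceId:

```cpp
#include <cstdio>

#include "paddle/fluid/platform/gpu_info.h"

// Report how much memory the allocator could still claim on the current
// device; a hypothetical helper built on the declaration added above.
void ReportAllocBudget() {
  size_t budget = paddle::platform::GpuAvailableMemToAlloc();
  std::printf("Allocatable GPU memory: %zu MiB\n", budget >> 20);
}
```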