Unverified commit 737f21b8, authored by Zeng Jinle, committed by GitHub

cherry-pick refine default gpu memory, test=release/1.5 (#19281)

Parent 71168dad
@@ -23,6 +23,10 @@
 DEFINE_bool(free_idle_memory, false,
             "If it is true, Paddle will try to free idle memory trunks during "
             "running time.");
+#ifdef PADDLE_WITH_CUDA
+DECLARE_uint64(reallocate_gpu_memory_in_mb);
+#endif
+
 namespace paddle {
 namespace memory {
 namespace detail {
@@ -200,8 +204,9 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool(
     // Compute the allocation size for gpu for the first allocation.
     allocate_bytes = std::max(platform::GpuInitAllocSize(), request_bytes);
   } else {
-    // Reallocation size
-    if (realloc_size_ == 0) {
+    // Compute the re-allocation size; we cache the re-allocation size only
+    // when the user sets FLAGS_reallocate_gpu_memory_in_mb to a fixed value.
+    if (realloc_size_ == 0 || FLAGS_reallocate_gpu_memory_in_mb == 0ul) {
       realloc_size_ = platform::GpuReallocSize();
     }
     allocate_bytes = std::max(realloc_size_, request_bytes);
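The condition added above encodes a subtle caching rule: when FLAGS_reallocate_gpu_memory_in_mb is 0, the refill size is derived from the memory fraction, which shrinks as the device fills up, so it must be recomputed on every refill; only a nonzero flag value may be cached. A minimal self-contained sketch of that rule (ChooseAllocateBytes and GpuReallocSizeStub are hypothetical names, not Paddle symbols):

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>

// Stand-in for platform::GpuReallocSize(); the real function returns either
// the flag value or a fraction of the memory currently available to allocate.
static size_t GpuReallocSizeStub() { return size_t{100} << 20; }

// Sketch of the refill-size decision in BuddyAllocator::RefillPool above.
size_t ChooseAllocateBytes(size_t request_bytes, size_t* realloc_size,
                           uint64_t reallocate_gpu_memory_in_mb) {
  // A zero flag means "size by fraction": recompute every time, since the
  // available memory changes. A nonzero flag is fixed, so cache it once.
  if (*realloc_size == 0 || reallocate_gpu_memory_in_mb == 0) {
    *realloc_size = GpuReallocSizeStub();
  }
  return std::max(*realloc_size, request_bytes);
}
```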
@@ -22,6 +22,8 @@ limitations under the License. */
 #include "paddle/fluid/platform/gpu_info.h"
 
 #ifdef PADDLE_WITH_CUDA
+#include <cuda_runtime.h>
+
 DECLARE_double(fraction_of_gpu_memory_to_use);
 DECLARE_uint64(initial_gpu_memory_in_mb);
 DECLARE_uint64(reallocate_gpu_memory_in_mb);
@@ -31,9 +33,11 @@
 namespace paddle {
 namespace memory {
 namespace detail {
 
-constexpr static int test_gpu_id = 0;
+constexpr static int TEST_GPU_ID = 0;
 
-void TestBuddyAllocator(BuddyAllocator* allocator, size_t size_bytes) {
+int* TestBuddyAllocator(BuddyAllocator* allocator, size_t size_bytes,
+                        bool use_system_allocator = false,
+                        bool free_ptr = true) {
   bool freed = false;
   size_t used_bytes = allocator->Used();
@@ -41,19 +45,25 @@ void TestBuddyAllocator(BuddyAllocator* allocator, size_t size_bytes) {
   void* p = allocator->Alloc(size_bytes);
   EXPECT_NE(p, nullptr);
 
 #ifdef PADDLE_WITH_CUDA
-  if (size_bytes < platform::GpuMaxChunkSize()) {
+  if (size_bytes < allocator->GetMaxChunkSize()) {
 #else
-  if (size_bytes < platform::CpuMaxChunkSize()) {
+  if (size_bytes < allocator->GetMaxChunkSize()) {
 #endif
     // Not allocate from SystemAllocator
+    EXPECT_FALSE(use_system_allocator);
     EXPECT_GE(allocator->Used(), used_bytes + size_bytes);
   } else {
     // Allocate from SystemAllocator doesn't count in Used()
+    EXPECT_TRUE(use_system_allocator);
     EXPECT_EQ(allocator->Used(), used_bytes);
   }
 
   int* intp = static_cast<int*>(p);
+  if (!free_ptr) {
+    return intp;
+  }
+
   std::shared_ptr<int> ptr(intp, [&](void* p) {
     allocator->Free(intp);
     freed = true;
@@ -64,20 +74,30 @@ void TestBuddyAllocator(BuddyAllocator* allocator, size_t size_bytes) {
   EXPECT_EQ(used_bytes, allocator->Used());
   EXPECT_TRUE(freed);
+  return nullptr;
 }
 
 #ifdef PADDLE_WITH_CUDA
 TEST(BuddyAllocator, GpuFraction) {
+  // In a 16 GB machine, the pool size will be about 160 MB
   FLAGS_fraction_of_gpu_memory_to_use = 0.01;
+  FLAGS_initial_gpu_memory_in_mb = 0;
+  FLAGS_reallocate_gpu_memory_in_mb = 0;
 
   BuddyAllocator buddy_allocator(
-      std::unique_ptr<SystemAllocator>(new GPUAllocator(test_gpu_id)),
+      std::unique_ptr<SystemAllocator>(new GPUAllocator(TEST_GPU_ID)),
       platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
 
+  // Less than pool size
   TestBuddyAllocator(&buddy_allocator, 10);
   TestBuddyAllocator(&buddy_allocator, 10 << 10);
   TestBuddyAllocator(&buddy_allocator, 10 << 20);
-  TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30));
+
+  // Greater than max chunk size
+  TestBuddyAllocator(&buddy_allocator, 499 << 20,
+                     /* use_system_allocator = */ true);
+  TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30),
+                     /* use_system_allocator = */ true);
 }
@@ -87,19 +107,19 @@ TEST(BuddyAllocator, InitRealloc) {
   EXPECT_EQ(platform::GpuMaxChunkSize(), static_cast<size_t>(100 << 20));
 
   BuddyAllocator buddy_allocator(
-      std::unique_ptr<SystemAllocator>(new GPUAllocator(test_gpu_id)),
+      std::unique_ptr<SystemAllocator>(new GPUAllocator(TEST_GPU_ID)),
       platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
 
   // Less then initial size and reallocate size
   TestBuddyAllocator(&buddy_allocator, 10 << 20);
   // Between initial size and reallocate size and not exceed pool
   TestBuddyAllocator(&buddy_allocator, 80 << 20);
-  // Less then reallocate size and exceed pool
-  TestBuddyAllocator(&buddy_allocator, 40 << 20);
-  // Greater then reallocate size and exceed pool
-  TestBuddyAllocator(&buddy_allocator, 80 << 20);
-  // Greater then initial size and reallocate size
-  TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30));
+  TestBuddyAllocator(&buddy_allocator, 99 << 20);
+  // Greater than max chunk size
+  TestBuddyAllocator(&buddy_allocator, 101 << 20,
+                     /* use_system_allocator = */ true);
+  TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30),
+                     /* use_system_allocator = */ true);
 }
@@ -109,23 +129,112 @@ TEST(BuddyAllocator, ReallocSizeGreaterThanInit) {
   EXPECT_EQ(platform::GpuMaxChunkSize(), static_cast<size_t>(10 << 20));
 
   BuddyAllocator buddy_allocator(
-      std::unique_ptr<SystemAllocator>(new GPUAllocator(test_gpu_id)),
+      std::unique_ptr<SystemAllocator>(new GPUAllocator(TEST_GPU_ID)),
       platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
 
-  // Less then initial size and reallocate size
+  // Less than initial size and reallocate size
   TestBuddyAllocator(&buddy_allocator, 1 << 20);
-  // Between initial size and reallocate size and not exceed pool
-  TestBuddyAllocator(&buddy_allocator, 3 << 20);
-  // Less then initial size and exceed pool
-  TestBuddyAllocator(&buddy_allocator, 3 << 20);
-  // Less then reallocate size and not exceed pool (now pool is 15 MB, used 7
-  // MB)
-  TestBuddyAllocator(&buddy_allocator, 7 << 20);
-  // Less then reallocate size and exceed pool
+  // Between initial size and reallocate size and exceed pool
+  TestBuddyAllocator(&buddy_allocator, 6 << 20);
   TestBuddyAllocator(&buddy_allocator, 8 << 20);
-  // Greater then initial size and reallocate size
-  TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30));
+  TestBuddyAllocator(&buddy_allocator, 9 << 20);
+  // Greater than max chunk size
+  TestBuddyAllocator(&buddy_allocator, 11 << 20,
+                     /* use_system_allocator = */ true);
+  TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30),
+                     /* use_system_allocator = */ true);
 }
+
+TEST(BuddyAllocator, FractionRefillPool) {
+  FLAGS_fraction_of_gpu_memory_to_use = 0.6;
+  FLAGS_initial_gpu_memory_in_mb = 0;
+  FLAGS_reallocate_gpu_memory_in_mb = 0;
+
+  size_t max_chunk_size = platform::GpuMaxChunkSize();
+  BuddyAllocator buddy_allocator(
+      std::unique_ptr<SystemAllocator>(new GPUAllocator(TEST_GPU_ID)),
+      platform::GpuMinChunkSize(), max_chunk_size);
+
+  // Less than pool size
+  int* p0 = TestBuddyAllocator(&buddy_allocator, max_chunk_size - 1000,
+                               /* use_system_allocator = */ false,
+                               /* free_ptr = */ false);
+  // Max chunk size should stay the same during allocation
+  EXPECT_EQ(max_chunk_size, buddy_allocator.GetMaxChunkSize());
+
+  size_t alloc =
+      platform::GpuAvailableMemToAlloc() * FLAGS_fraction_of_gpu_memory_to_use;
+  // Exceeding the pool triggers a refill sized as a fraction of the available
+  // GPU memory, so it should be possible to alloc 60% of the remaining GPU
+  int* p1 = TestBuddyAllocator(&buddy_allocator, alloc,
+                               /* use_system_allocator = */ false,
+                               /* free_ptr = */ false);
+  // Max chunk size should stay the same during allocation
+  EXPECT_EQ(max_chunk_size, buddy_allocator.GetMaxChunkSize());
+
+  alloc =
+      platform::GpuAvailableMemToAlloc() * FLAGS_fraction_of_gpu_memory_to_use;
+  // Exceeding the pool triggers a refill sized as a fraction of the available
+  // GPU memory, so it should be possible to alloc 60% of the remaining GPU
+  TestBuddyAllocator(&buddy_allocator, alloc,
+                     /* use_system_allocator = */ false);
+  // Max chunk size should stay the same during allocation
+  EXPECT_EQ(max_chunk_size, buddy_allocator.GetMaxChunkSize());
+
+  buddy_allocator.Free(p0);
+  buddy_allocator.Free(p1);
+}
+
+TEST(BuddyAllocator, AllocFromAvailable) {
+  FLAGS_fraction_of_gpu_memory_to_use = 0.7;
+  FLAGS_initial_gpu_memory_in_mb = 0;
+  FLAGS_reallocate_gpu_memory_in_mb = 0;
+
+  size_t total = 0, available = 0;
+  platform::SetDeviceId(TEST_GPU_ID);
+  platform::GpuMemoryUsage(&available, &total);
+
+  // Take half of the available GPU memory
+  void* p;
+  cudaError_t result = cudaMalloc(&p, available >> 1);
+  EXPECT_TRUE(result == cudaSuccess);
+
+  // BuddyAllocator should be able to alloc the remaining GPU memory
+  BuddyAllocator buddy_allocator(
+      std::unique_ptr<SystemAllocator>(new GPUAllocator(TEST_GPU_ID)),
+      platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
+
+  TestBuddyAllocator(&buddy_allocator, 10);
+  TestBuddyAllocator(&buddy_allocator, 10 << 10);
+  TestBuddyAllocator(&buddy_allocator, 10 << 20);
+  TestBuddyAllocator(&buddy_allocator, static_cast<size_t>(1 << 30));
+
+  if (p) {
+    EXPECT_TRUE(cudaFree(p) == cudaSuccess);
+  }
+}
+
+TEST(BuddyAllocator, AllocFromAvailableWhenFractionIsOne) {
+  FLAGS_fraction_of_gpu_memory_to_use = 1.0;
+  FLAGS_initial_gpu_memory_in_mb = 0;
+  FLAGS_reallocate_gpu_memory_in_mb = 0;
+
+  void* p = nullptr;
+  EXPECT_TRUE(cudaMalloc(&p, static_cast<size_t>(4) << 30) == cudaSuccess);
+
+  // BuddyAllocator should be able to alloc the remaining GPU memory
+  BuddyAllocator buddy_allocator(
+      std::unique_ptr<SystemAllocator>(new GPUAllocator(TEST_GPU_ID)),
+      platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
+
+  TestBuddyAllocator(&buddy_allocator, static_cast<size_t>(1) << 30);
+  TestBuddyAllocator(&buddy_allocator, static_cast<size_t>(5) << 30);
+
+  if (p) {
+    EXPECT_TRUE(cudaFree(p) == cudaSuccess);
+  }
+}
 
 #endif
 }  // namespace detail
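The FractionRefillPool test above relies on the fact that each refill requests FLAGS_fraction_of_gpu_memory_to_use of whatever is still free, so successive refills shrink geometrically instead of failing outright. A self-contained sketch of that arithmetic, assuming a hypothetical card with 16 GiB free and ignoring the reserved slice:

```cpp
#include <cstdio>

int main() {
  double available_gib = 16.0;  // assumed free device memory
  const double fraction = 0.6;  // FLAGS_fraction_of_gpu_memory_to_use
  for (int refill = 0; refill < 3; ++refill) {
    double chunk = available_gib * fraction;  // what one refill would grab
    std::printf("refill %d: %.2f GiB (%.2f GiB still free)\n", refill, chunk,
                available_gib - chunk);
    available_gib -= chunk;  // the pool now owns this memory
  }
  return 0;  // prints 9.60, 3.84 and 1.54 GiB: 60% of the remainder each time
}
```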
@@ -215,45 +215,47 @@ void GpuMemoryUsage(size_t *available, size_t *total) {
       "cudaMemGetInfo failed in paddle::platform::GetMemoryUsage");
 }
 
-size_t GpuMaxAllocSize() {
-  return std::max(GpuInitAllocSize(), GpuReallocSize());
-}
-
-size_t GpuInitAllocSize() {
-  if (FLAGS_initial_gpu_memory_in_mb > 0ul) {
-    // Initial memory will be allocated by FLAGS_initial_gpu_memory_in_mb
-    return static_cast<size_t>(FLAGS_initial_gpu_memory_in_mb << 20);
-  }
-
-  // FLAGS_initial_gpu_memory_in_mb is 0, initial memory will be allocated by
-  // fraction
-  size_t total = 0;
-  size_t available = 0;
-
-  GpuMemoryUsage(&available, &total);
-  size_t reserving = static_cast<size_t>(fraction_reserve_gpu_memory * total);
-
-  return static_cast<size_t>((total - reserving) *
-                             FLAGS_fraction_of_gpu_memory_to_use);
-}
-
-size_t GpuReallocSize() {
-  if (FLAGS_reallocate_gpu_memory_in_mb > 0ul) {
-    // Additional memory will be allocated by FLAGS_reallocate_gpu_memory_in_mb
-    return static_cast<size_t>(FLAGS_reallocate_gpu_memory_in_mb << 20);
-  }
-
-  // FLAGS_reallocate_gpu_memory_in_mb is 0, additional memory will be
-  // allocated by fraction
-  size_t total = 0;
-  size_t available = 0;
-
-  GpuMemoryUsage(&available, &total);
-  size_t reserving = static_cast<size_t>(fraction_reserve_gpu_memory * total);
-
-  return static_cast<size_t>((total - reserving) *
-                             FLAGS_fraction_of_gpu_memory_to_use);
-}
+size_t GpuAvailableMemToAlloc() {
+  size_t total = 0;
+  size_t available = 0;
+  GpuMemoryUsage(&available, &total);
+  size_t reserving =
+      static_cast<size_t>(fraction_reserve_gpu_memory * available);
+  // If the available size is less than the minimum chunk size, no usable
+  // memory exists
+  size_t available_to_alloc = available - reserving;
+  size_t min_chunk_size = GpuMinChunkSize();
+  if (available_to_alloc < min_chunk_size) {
+    available_to_alloc = 0;
+  }
+  VLOG(10) << "GPU usage " << (available >> 20) << "M/" << (total >> 20)
+           << "M, " << (available_to_alloc >> 20) << "M available to allocate";
+  return available_to_alloc;
+}
+
+size_t GpuMaxAllocSize() {
+  return std::max(GpuInitAllocSize(), GpuReallocSize());
+}
+
+static size_t GpuAllocSize(bool realloc) {
+  size_t available_to_alloc = GpuAvailableMemToAlloc();
+  PADDLE_ENFORCE_GT(available_to_alloc, 0, "Not enough available GPU memory");
+  // If the flag is 0, the (re-)allocation size is determined by the fraction
+  size_t flag_mb = realloc ? FLAGS_reallocate_gpu_memory_in_mb
+                           : FLAGS_initial_gpu_memory_in_mb;
+  size_t alloc_bytes =
+      (flag_mb > 0ul ? flag_mb << 20
+                     : available_to_alloc * FLAGS_fraction_of_gpu_memory_to_use);
+  PADDLE_ENFORCE_GE(available_to_alloc, alloc_bytes,
+                    "Not enough available GPU memory");
+  VLOG(10) << "Alloc size is " << (alloc_bytes >> 20)
+           << " MiB, is it Re-alloc: " << realloc;
+  return alloc_bytes;
+}
+
+size_t GpuInitAllocSize() { return GpuAllocSize(/* realloc = */ false); }
+
+size_t GpuReallocSize() { return GpuAllocSize(/* realloc = */ true); }
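A worked example makes the new GpuAvailableMemToAlloc() concrete. The numbers are assumed for illustration: a 5% reserve fraction (fraction_reserve_gpu_memory is a constant defined elsewhere in this file) and 8 GiB reported as free. Note the reserve is now computed from the available memory rather than from the total:

```cpp
#include <cstddef>
#include <cstdio>

int main() {
  const double kReserveFraction = 0.05;  // assumption for illustration
  size_t available = size_t{8} << 30;    // assume 8 GiB reported as free
  size_t reserving = static_cast<size_t>(kReserveFraction * available);
  size_t min_chunk_size = 1 << 8;        // 256 bytes, per GpuMinChunkSize()
  size_t available_to_alloc = available - reserving;
  if (available_to_alloc < min_chunk_size) available_to_alloc = 0;
  std::printf("%zu MiB available to allocate\n", available_to_alloc >> 20);
  return 0;  // ~7782 MiB: 8192 MiB minus the ~410 MiB reserve
}
```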
@@ -261,24 +263,9 @@ size_t GpuMinChunkSize() {
   // Allow to allocate the minimum chunk size is 256 bytes.
   return 1 << 8;
 }
 
 size_t GpuMaxChunkSize() {
-  size_t total = 0;
-  size_t available = 0;
-
-  GpuMemoryUsage(&available, &total);
-  VLOG(10) << "GPU Usage " << available / 1024 / 1024 << "M/"
-           << total / 1024 / 1024 << "M";
-  size_t reserving = static_cast<size_t>(fraction_reserve_gpu_memory * total);
-  // If available less than minimum chunk size, no usable memory exists.
-  available =
-      std::min(std::max(available, GpuMinChunkSize()) - GpuMinChunkSize(),
-               total - reserving);
-
-  size_t allocating = GpuMaxAllocSize();
-
-  PADDLE_ENFORCE_LE(allocating, available,
-                    "Insufficient GPU memory to allocation.");
-
-  return allocating;
+  size_t max_chunk_size = GpuMaxAllocSize();
+  VLOG(10) << "Max chunk size " << (max_chunk_size >> 20) << "M";
+  return max_chunk_size;
 }
 
 void GpuMemcpyAsync(void *dst, const void *src, size_t count,
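The refactored GpuAllocSize() reduces the two old near-duplicate functions to one selection: a nonzero size flag wins outright, otherwise a fraction of the currently allocatable memory is used. A sketch of that selection with two assumed settings (SelectAllocBytes is a name local to this example, not a Paddle symbol):

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>

static size_t SelectAllocBytes(uint64_t flag_mb, double fraction,
                               size_t available_to_alloc) {
  // Mirrors GpuAllocSize above: flag first, fraction as the fallback.
  return flag_mb > 0 ? static_cast<size_t>(flag_mb) << 20
                     : static_cast<size_t>(available_to_alloc * fraction);
}

int main() {
  const size_t kAvail = size_t{4} << 30;  // assume 4 GiB allocatable
  // Flag set to 500 MB: the fraction flag is ignored, the pool grows 500 MiB.
  std::printf("%zu MiB\n", SelectAllocBytes(500, 0.92, kAvail) >> 20);
  // Flag left at 0 with fraction 0.5: half the allocatable memory, 2048 MiB.
  std::printf("%zu MiB\n", SelectAllocBytes(0, 0.5, kAvail) >> 20);
  return 0;
}
```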
@@ -57,6 +57,10 @@ void SetDeviceId(int device_id);
 //! Get the memory usage of current GPU device.
 void GpuMemoryUsage(size_t *available, size_t *total);
 
+//! Get the available memory to allocate, which is the available GPU memory
+//! minus the reserved size.
+size_t GpuAvailableMemToAlloc();
+
 //! Get the maximum allocation size of current GPU device.
 size_t GpuMaxAllocSize();
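For callers of the header, the new declaration slots in next to the existing queries; a minimal usage sketch (not part of the commit), assuming a device has already been selected with platform::SetDeviceId:

```cpp
#include <cstdio>

#include "paddle/fluid/platform/gpu_info.h"

// Report how much memory the allocator could still claim on the current
// device; a hypothetical helper built on the declaration added above.
void ReportAllocBudget() {
  size_t budget = paddle::platform::GpuAvailableMemToAlloc();
  std::printf("Allocatable GPU memory: %zu MiB\n", budget >> 20);
}
```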