From b3efc92397292097f13e05ef1d9666a87c0e71ce Mon Sep 17 00:00:00 2001
From: Leo Chen
Date: Tue, 28 Mar 2023 17:05:46 +0800
Subject: [PATCH] add support to set chunk size of auto_growth_allocator
 (#52204)

* add flag to set chunk size

* use the flag

* add vlog

* add ut

* rename ut
---
 .../memory/allocation/allocator_facade.cc     | 25 ++++++++++-----
 .../auto_growth_best_fit_allocator.cc         |  1 +
 paddle/phi/core/flags.cc                      | 17 ++++++++++
 ...t.py => test_auto_growth_allocator_gpu.py} | 31 ++++++++++++++++---
 4 files changed, 62 insertions(+), 12 deletions(-)
 rename python/paddle/fluid/tests/unittests/{test_auto_growth_gpu_memory_limit.py => test_auto_growth_allocator_gpu.py} (65%)

diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index 7f7336a24c3..029288f1539 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -101,6 +101,7 @@ PADDLE_DEFINE_EXPORTED_bool(use_cuda_managed_memory,
                             "strategy");
 
 DECLARE_string(allocator_strategy);
+DECLARE_uint64(auto_growth_chunk_size_in_mb);
 
 namespace paddle {
 namespace memory {
@@ -563,10 +564,16 @@ class AllocatorFacadePrivate {
   }
 
   void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) {
+    auto chunk_size = FLAGS_auto_growth_chunk_size_in_mb << 20;
+    VLOG(4) << "FLAGS_auto_growth_chunk_size_in_mb is "
+            << FLAGS_auto_growth_chunk_size_in_mb;
 #if defined(PADDLE_WITH_HIP)
     auto cuda_allocator = CreateCUDAAllocator(p);
     cuda_allocators_[p][stream] = std::make_shared<AutoGrowthBestFitAllocator>(
-        cuda_allocator, platform::GpuMinChunkSize(), 0, allow_free_idle_chunk_);
+        cuda_allocator,
+        platform::GpuMinChunkSize(),
+        chunk_size,
+        allow_free_idle_chunk_);
 #endif
 
 #if defined(PADDLE_WITH_CUDA)
@@ -597,7 +604,7 @@ class AllocatorFacadePrivate {
           std::make_shared<AutoGrowthBestFitAllocator>(
               cuda_allocator,
               platform::GpuMinChunkSize(),
-              /*chunk_size=*/0,
+              /*chunk_size=*/chunk_size,
               allow_free_idle_chunk_);
     }
 #else
@@ -635,7 +642,7 @@ class AllocatorFacadePrivate {
     }
     cuda_allocators_[p][stream] =
         std::make_shared<AutoGrowthBestFitAllocator>(
-            underlying_allocator, alignment, 0, allow_free_idle_chunk_);
+            underlying_allocator, alignment, chunk_size, allow_free_idle_chunk_);
 #endif
 #endif
   }
@@ -643,12 +650,15 @@ class AllocatorFacadePrivate {
   // NOTE(Ruibiao): Old single-stream version, will be removed later
   void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p,
                                    bool allow_free_idle_chunk) {
+    auto chunk_size = FLAGS_auto_growth_chunk_size_in_mb << 20;
+    VLOG(4) << "FLAGS_auto_growth_chunk_size_in_mb is "
+            << FLAGS_auto_growth_chunk_size_in_mb;
 #if defined(PADDLE_WITH_HIP)
     auto cuda_allocator = CreateCUDAAllocator(p);
     allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
         cuda_allocator,
         platform::GpuMinChunkSize(),
-        /*chunk_size=*/0,
+        /*chunk_size=*/chunk_size,
         allow_free_idle_chunk);
 #endif
 
@@ -679,7 +689,7 @@ class AllocatorFacadePrivate {
       allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
           cuda_allocator,
           platform::GpuMinChunkSize(),
-          /*chunk_size=*/0,
+          /*chunk_size=*/chunk_size,
           allow_free_idle_chunk);
     }
 
@@ -717,7 +727,7 @@ class AllocatorFacadePrivate {
       underlying_allocator = cuda_allocator;
     }
     allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
-        underlying_allocator, alignment, 0, allow_free_idle_chunk);
+        underlying_allocator, alignment, chunk_size, allow_free_idle_chunk);
 #endif
 #endif
   }
@@ -831,12 +841,13 @@ class AllocatorFacadePrivate {
 
   void InitAutoGrowthCustomDeviceAllocator(platform::CustomPlace p,
                                            bool allow_free_idle_chunk) {
+    auto chunk_size = FLAGS_auto_growth_chunk_size_in_mb << 20;
     auto custom_allocator =
         std::make_shared<paddle::memory::allocation::CustomAllocator>(p);
     allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
         custom_allocator,
         phi::DeviceManager::GetMinChunkSize(p),
-        /*chunk_size=*/0,
+        /*chunk_size=*/chunk_size,
         allow_free_idle_chunk);
   }
 #endif
diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc
index 26f988f43af..cf3bb15fdd8 100644
--- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc
@@ -54,6 +54,7 @@ AutoGrowthBestFitAllocator::AutoGrowthBestFitAllocator(
   total_alloc_size_ = 0;
   total_free_times_ = 0;
   total_free_size_ = 0;
+  VLOG(4) << "chunk_size_:" << chunk_size_;
 }
 
 phi::Allocation *AutoGrowthBestFitAllocator::AllocateImpl(
diff --git a/paddle/phi/core/flags.cc b/paddle/phi/core/flags.cc
index 1b3cf23f3fc..b384bed077b 100644
--- a/paddle/phi/core/flags.cc
+++ b/paddle/phi/core/flags.cc
@@ -648,6 +648,23 @@ PADDLE_DEFINE_EXPORTED_uint64(
     "memory exceeds the limit even though there is available "
     "memory on the gpu card. The unit is MB and default value is 0.");
 
+/**
+ * Memory related FLAG
+ * Name: FLAGS_auto_growth_chunk_size_in_mb
+ * Since Version: 2.5.0
+ * Value Range: uint64, default=0 (MB)
+ * Example:
+ * Note: The minimal chunk size of GPU memory block in auto_growth allocator.
+ *       The real chunk size is max(request_size,
+ *       FLAGS_auto_growth_chunk_size_in_mb).
+ */
+PADDLE_DEFINE_EXPORTED_uint64(
+    auto_growth_chunk_size_in_mb,
+    0ul,
+    "The minimal chunk size of GPU memory block in auto_growth allocator. "
+    "The real chunk size is max(request_size, "
+    "FLAGS_auto_growth_chunk_size_in_mb).");
+
 #endif
 
 /**
diff --git a/python/paddle/fluid/tests/unittests/test_auto_growth_gpu_memory_limit.py b/python/paddle/fluid/tests/unittests/test_auto_growth_allocator_gpu.py
similarity index 65%
rename from python/paddle/fluid/tests/unittests/test_auto_growth_gpu_memory_limit.py
rename to python/paddle/fluid/tests/unittests/test_auto_growth_allocator_gpu.py
index c3432fd430d..f2edff5eac4 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_growth_gpu_memory_limit.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_growth_allocator_gpu.py
@@ -16,18 +16,24 @@ import unittest
 
 import numpy as np
 
+import paddle
 from paddle import fluid
 
-fluid.core.globals()['FLAGS_allocator_strategy'] = 'auto_growth'
-
+# it should be set at the beginning
 if fluid.is_compiled_with_cuda():
-    fluid.core.globals()['FLAGS_gpu_memory_limit_mb'] = 10
+    paddle.set_flags(
+        {
+            'FLAGS_allocator_strategy': 'auto_growth',
+            'FLAGS_auto_growth_chunk_size_in_mb': 10,
+        }
+    )
 
 
-class TestBase(unittest.TestCase):
+class TestMemoryLimit(unittest.TestCase):
     def setUp(self):
+        self._limit = 10
         if fluid.is_compiled_with_cuda():
-            self._limit = fluid.core.globals()['FLAGS_gpu_memory_limit_mb']
+            paddle.set_flags({'FLAGS_gpu_memory_limit_mb': 10})
 
     def test_allocate(self):
         if not fluid.is_compiled_with_cuda():
@@ -53,5 +59,20 @@ class TestBase(unittest.TestCase):
         self.assertTrue(True)
 
 
+class TestChunkSize(unittest.TestCase):
+    def test_allocate(self):
+        if not fluid.is_compiled_with_cuda():
+            return
+
+        paddle.rand([1024])
+        reserved, allocated = (
+            paddle.device.cuda.max_memory_reserved(),
+            paddle.device.cuda.max_memory_allocated(),
+        )
+
+        self.assertEqual(reserved, 1024 * 1024 * 10)
+        self.assertEqual(allocated, 1024 * 4)
+
+
 if __name__ == '__main__':
     unittest.main()
-- 
GitLab
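
Usage sketch (illustrative only, not part of the patch): the snippet below shows one way the new flag could be exercised from user code, mirroring the TestChunkSize unit test added above. The 64 MB value is an arbitrary example rather than a recommended default, and, as with other exported Paddle flags, the same setting can usually also be supplied through the FLAGS_auto_growth_chunk_size_in_mb environment variable.

    import paddle

    if paddle.is_compiled_with_cuda():
        # Flags take effect only if set before the first GPU allocation.
        paddle.set_flags(
            {
                'FLAGS_allocator_strategy': 'auto_growth',
                'FLAGS_auto_growth_chunk_size_in_mb': 64,
            }
        )

        x = paddle.rand([1024])  # 4 KB request, served from one new chunk
        # The allocator grows by max(request_size, 64 MB), so roughly 64 MB
        # is reserved while only 4 KB is handed out to the tensor.
        print(paddle.device.cuda.max_memory_reserved())   # about 64 << 20
        print(paddle.device.cuda.max_memory_allocated())  # 1024 * 4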