From b3efc92397292097f13e05ef1d9666a87c0e71ce Mon Sep 17 00:00:00 2001
From: Leo Chen
Date: Tue, 28 Mar 2023 17:05:46 +0800
Subject: [PATCH] add support to set chunk size of auto_growth_allocator
 (#52204)

* add flag to set chunk size

* use the flag

* add vlog

* add ut

* rename ut
---
 .../memory/allocation/allocator_facade.cc     | 25 ++++++++++-----
 .../auto_growth_best_fit_allocator.cc         |  1 +
 paddle/phi/core/flags.cc                      | 17 ++++++++++
 ...t.py => test_auto_growth_allocator_gpu.py} | 31 ++++++++++++++++---
 4 files changed, 62 insertions(+), 12 deletions(-)
 rename python/paddle/fluid/tests/unittests/{test_auto_growth_gpu_memory_limit.py => test_auto_growth_allocator_gpu.py} (65%)

diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index 7f7336a24c3..029288f1539 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -101,6 +101,7 @@ PADDLE_DEFINE_EXPORTED_bool(use_cuda_managed_memory,
                             "strategy");
 
 DECLARE_string(allocator_strategy);
+DECLARE_uint64(auto_growth_chunk_size_in_mb);
 
 namespace paddle {
 namespace memory {
@@ -563,10 +564,16 @@ class AllocatorFacadePrivate {
   }
 
   void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) {
+    auto chunk_size = FLAGS_auto_growth_chunk_size_in_mb << 20;
+    VLOG(4) << "FLAGS_auto_growth_chunk_size_in_mb is "
+            << FLAGS_auto_growth_chunk_size_in_mb;
 #if defined(PADDLE_WITH_HIP)
     auto cuda_allocator = CreateCUDAAllocator(p);
     cuda_allocators_[p][stream] = std::make_shared<AutoGrowthBestFitAllocator>(
-        cuda_allocator, platform::GpuMinChunkSize(), 0, allow_free_idle_chunk_);
+        cuda_allocator,
+        platform::GpuMinChunkSize(),
+        chunk_size,
+        allow_free_idle_chunk_);
 #endif
 
 #if defined(PADDLE_WITH_CUDA)
@@ -597,7 +604,7 @@ class AllocatorFacadePrivate {
           std::make_shared<AutoGrowthBestFitAllocator>(
               cuda_allocator,
               platform::GpuMinChunkSize(),
-              /*chunk_size=*/0,
+              /*chunk_size=*/chunk_size,
               allow_free_idle_chunk_);
     }
 #else
@@ -635,7 +642,7 @@ class AllocatorFacadePrivate {
     }
     cuda_allocators_[p][stream] =
         std::make_shared<AutoGrowthBestFitAllocator>(
-            underlying_allocator, alignment, 0, allow_free_idle_chunk_);
+            underlying_allocator, alignment, chunk_size, allow_free_idle_chunk_);
 #endif
 #endif
   }
@@ -643,12 +650,15 @@ class AllocatorFacadePrivate {
   // NOTE(Ruibiao): Old single-stream version, will be removed later
   void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p,
                                    bool allow_free_idle_chunk) {
+    auto chunk_size = FLAGS_auto_growth_chunk_size_in_mb << 20;
+    VLOG(4) << "FLAGS_auto_growth_chunk_size_in_mb is "
+            << FLAGS_auto_growth_chunk_size_in_mb;
 #if defined(PADDLE_WITH_HIP)
     auto cuda_allocator = CreateCUDAAllocator(p);
     allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
         cuda_allocator,
         platform::GpuMinChunkSize(),
-        /*chunk_size=*/0,
+        /*chunk_size=*/chunk_size,
         allow_free_idle_chunk);
 #endif
 
@@ -679,7 +689,7 @@ class AllocatorFacadePrivate {
       allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
           cuda_allocator,
           platform::GpuMinChunkSize(),
-          /*chunk_size=*/0,
+          /*chunk_size=*/chunk_size,
           allow_free_idle_chunk);
     }
 
@@ -717,7 +727,7 @@ class AllocatorFacadePrivate {
       underlying_allocator = cuda_allocator;
     }
     allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
-        underlying_allocator, alignment, 0, allow_free_idle_chunk);
+        underlying_allocator, alignment, chunk_size, allow_free_idle_chunk);
 #endif
 #endif
   }
@@ -831,12 +841,13 @@ class AllocatorFacadePrivate {
 
   void InitAutoGrowthCustomDeviceAllocator(platform::CustomPlace p,
                                            bool allow_free_idle_chunk) {
+    auto chunk_size = FLAGS_auto_growth_chunk_size_in_mb << 20;
     auto custom_allocator =
         std::make_shared<paddle::memory::allocation::CustomAllocator>(p);
     allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
         custom_allocator,
         phi::DeviceManager::GetMinChunkSize(p),
-        /*chunk_size=*/0,
+        /*chunk_size=*/chunk_size,
         allow_free_idle_chunk);
   }
 #endif
diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc
index 26f988f43af..cf3bb15fdd8 100644
--- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc
@@ -54,6 +54,7 @@ AutoGrowthBestFitAllocator::AutoGrowthBestFitAllocator(
   total_alloc_size_ = 0;
   total_free_times_ = 0;
   total_free_size_ = 0;
+  VLOG(4) << "chunk_size_:" << chunk_size_;
 }
 
 phi::Allocation *AutoGrowthBestFitAllocator::AllocateImpl(
diff --git a/paddle/phi/core/flags.cc b/paddle/phi/core/flags.cc
index 1b3cf23f3fc..b384bed077b 100644
--- a/paddle/phi/core/flags.cc
+++ b/paddle/phi/core/flags.cc
@@ -648,6 +648,23 @@ PADDLE_DEFINE_EXPORTED_uint64(
     "memory exceeds the limit even though there is available "
     "memory on the gpu card. The unit is MB and default value is 0.");
 
+/**
+ * Memory related FLAG
+ * Name: FLAGS_auto_growth_chunk_size_in_mb
+ * Since Version: 2.5.0
+ * Value Range: uint64, default=0 (MB)
+ * Example:
+ * Note: The minimal chunk size of GPU memory block in auto_growth allocator.
+ *       The real chunk size is max(request_size,
+ *       FLAGS_auto_growth_chunk_size_in_mb).
+ */
+PADDLE_DEFINE_EXPORTED_uint64(
+    auto_growth_chunk_size_in_mb,
+    0ul,
+    "The minimal chunk size of GPU memory block in auto_growth allocator. "
+    "The real chunk size is max(request_size, "
+    "FLAGS_auto_growth_chunk_size_in_mb).");
+
 #endif
 
 /**
diff --git a/python/paddle/fluid/tests/unittests/test_auto_growth_gpu_memory_limit.py b/python/paddle/fluid/tests/unittests/test_auto_growth_allocator_gpu.py
similarity index 65%
rename from python/paddle/fluid/tests/unittests/test_auto_growth_gpu_memory_limit.py
rename to python/paddle/fluid/tests/unittests/test_auto_growth_allocator_gpu.py
index c3432fd430d..f2edff5eac4 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_growth_gpu_memory_limit.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_growth_allocator_gpu.py
@@ -16,18 +16,24 @@ import unittest
 
 import numpy as np
 
+import paddle
 from paddle import fluid
 
-fluid.core.globals()['FLAGS_allocator_strategy'] = 'auto_growth'
-
+# it should be set at the beginning
 if fluid.is_compiled_with_cuda():
-    fluid.core.globals()['FLAGS_gpu_memory_limit_mb'] = 10
+    paddle.set_flags(
+        {
+            'FLAGS_allocator_strategy': 'auto_growth',
+            'FLAGS_auto_growth_chunk_size_in_mb': 10,
+        }
+    )
 
 
-class TestBase(unittest.TestCase):
+class TestMemoryLimit(unittest.TestCase):
     def setUp(self):
+        self._limit = 10
         if fluid.is_compiled_with_cuda():
-            self._limit = fluid.core.globals()['FLAGS_gpu_memory_limit_mb']
+            paddle.set_flags({'FLAGS_gpu_memory_limit_mb': 10})
 
     def test_allocate(self):
         if not fluid.is_compiled_with_cuda():
@@ -53,5 +59,20 @@ class TestBase(unittest.TestCase):
         self.assertTrue(True)
 
 
+class TestChunkSize(unittest.TestCase):
+    def test_allocate(self):
+        if not fluid.is_compiled_with_cuda():
+            return
+
+        paddle.rand([1024])
+        reserved, allocated = (
+            paddle.device.cuda.max_memory_reserved(),
+            paddle.device.cuda.max_memory_allocated(),
+        )
+
+        self.assertEqual(reserved, 1024 * 1024 * 10)
+        self.assertEqual(allocated, 1024 * 4)
+
+
 if __name__ == '__main__':
     unittest.main()
-- 
GitLab
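
Usage sketch (illustrative only, not part of the patch): the snippet below shows one way the new flag could be exercised from user code, mirroring the TestChunkSize unit test added above. The 64 MB value is an arbitrary example rather than a recommended default, and, as with other exported Paddle flags, the same setting can usually also be supplied through the FLAGS_auto_growth_chunk_size_in_mb environment variable.

    import paddle

    if paddle.is_compiled_with_cuda():
        # Flags take effect only if set before the first GPU allocation.
        paddle.set_flags(
            {
                'FLAGS_allocator_strategy': 'auto_growth',
                'FLAGS_auto_growth_chunk_size_in_mb': 64,
            }
        )

        x = paddle.rand([1024])  # 4 KB request, served from one new chunk
        # The allocator grows by max(request_size, 64 MB), so roughly 64 MB
        # is reserved while only 4 KB is handed out to the tensor.
        print(paddle.device.cuda.max_memory_reserved())   # about 64 << 20
        print(paddle.device.cuda.max_memory_allocated())  # 1024 * 4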