Unverified commit b3efc923, authored by Leo Chen, committed by GitHub

add support to set chunk size of auto_growth_allocator (#52204)

* add flag to set chunk size

* use the flag

* add vlog

* add ut

* rename ut
Parent ecff3864
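For orientation, the flag introduced by this commit is an ordinary Paddle runtime flag; a minimal usage sketch, assuming the same 10 MB value as the unit test added at the end of this diff:

import paddle

# The flags must be set before the first GPU allocation so that the
# auto_growth allocator is constructed with the requested chunk size.
paddle.set_flags(
    {
        'FLAGS_allocator_strategy': 'auto_growth',
        'FLAGS_auto_growth_chunk_size_in_mb': 10,
    }
)

x = paddle.rand([1024])  # 4 KB request, served from a 10 MB chunk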
@@ -101,6 +101,7 @@ PADDLE_DEFINE_EXPORTED_bool(use_cuda_managed_memory,
"strategy");
DECLARE_string(allocator_strategy);
DECLARE_uint64(auto_growth_chunk_size_in_mb);
namespace paddle {
namespace memory {
@@ -563,10 +564,16 @@ class AllocatorFacadePrivate {
}
void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) {
auto chunk_size = FLAGS_auto_growth_chunk_size_in_mb << 20;
VLOG(4) << "FLAGS_auto_growth_chunk_size_in_mb is "
<< FLAGS_auto_growth_chunk_size_in_mb;
#if defined(PADDLE_WITH_HIP)
auto cuda_allocator = CreateCUDAAllocator(p);
cuda_allocators_[p][stream] = std::make_shared<AutoGrowthBestFitAllocator>(
cuda_allocator, platform::GpuMinChunkSize(), 0, allow_free_idle_chunk_);
cuda_allocator,
platform::GpuMinChunkSize(),
chunk_size,
allow_free_idle_chunk_);
#endif
#if defined(PADDLE_WITH_CUDA)
@@ -597,7 +604,7 @@ class AllocatorFacadePrivate {
std::make_shared<AutoGrowthBestFitAllocator>(
cuda_allocator,
platform::GpuMinChunkSize(),
/*chunk_size=*/0,
/*chunk_size=*/chunk_size,
allow_free_idle_chunk_);
}
#else
@@ -635,7 +642,7 @@ class AllocatorFacadePrivate {
}
cuda_allocators_[p][stream] = std::make_shared<AutoGrowthBestFitAllocator>(
underlying_allocator, alignment, 0, allow_free_idle_chunk_);
underlying_allocator, alignment, chunk_size, allow_free_idle_chunk_);
#endif
#endif
}
@@ -643,12 +650,15 @@ class AllocatorFacadePrivate {
// NOTE(Ruibiao): Old single-stream version, will be removed later
void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p,
bool allow_free_idle_chunk) {
auto chunk_size = FLAGS_auto_growth_chunk_size_in_mb << 20;
VLOG(4) << "FLAGS_auto_growth_chunk_size_in_mb is "
<< FLAGS_auto_growth_chunk_size_in_mb;
#if defined(PADDLE_WITH_HIP)
auto cuda_allocator = CreateCUDAAllocator(p);
allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
cuda_allocator,
platform::GpuMinChunkSize(),
/*chunk_size=*/0,
/*chunk_size=*/chunk_size,
allow_free_idle_chunk);
#endif
@@ -679,7 +689,7 @@ class AllocatorFacadePrivate {
allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
cuda_allocator,
platform::GpuMinChunkSize(),
/*chunk_size=*/0,
/*chunk_size=*/chunk_size,
allow_free_idle_chunk);
}
@@ -717,7 +727,7 @@ class AllocatorFacadePrivate {
underlying_allocator = cuda_allocator;
}
allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
underlying_allocator, alignment, 0, allow_free_idle_chunk);
underlying_allocator, alignment, chunk_size, allow_free_idle_chunk);
#endif
#endif
}
@@ -831,12 +841,13 @@ class AllocatorFacadePrivate {
void InitAutoGrowthCustomDeviceAllocator(platform::CustomPlace p,
bool allow_free_idle_chunk) {
auto chunk_size = FLAGS_auto_growth_chunk_size_in_mb << 20;
auto custom_allocator =
std::make_shared<paddle::memory::allocation::CustomAllocator>(p);
allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
custom_allocator,
phi::DeviceManager::GetMinChunkSize(p),
/*chunk_size=*/0,
/*chunk_size=*/chunk_size,
allow_free_idle_chunk);
}
#endif
......
@@ -54,6 +54,7 @@ AutoGrowthBestFitAllocator::AutoGrowthBestFitAllocator(
total_alloc_size_ = 0;
total_free_times_ = 0;
total_free_size_ = 0;
VLOG(4) << "chunk_size_:" << chunk_size_;
}
phi::Allocation *AutoGrowthBestFitAllocator::AllocateImpl(
......
@@ -648,6 +648,23 @@ PADDLE_DEFINE_EXPORTED_uint64(
"memory exceeds the limit even though there is available "
"memory on the gpu card. The unit is MB and default value is 0.");
/**
* Memory related FLAG
* Name: FLAGS_auto_growth_chunk_size_in_mb
* Since Version: 2.5.0
* Value Range: uint64, default=0 (MB)
* Example:
* Note: The minimal chunk size of GPU memory block in auto_growth allocator.
* The real chunk size is max(request_size,
* FLAGS_auto_growth_chunk_size_in_mb).
*/
PADDLE_DEFINE_EXPORTED_uint64(
auto_growth_chunk_size_in_mb,
0ul,
"The minimal chunk size of GPU memory block in auto_growth allocator. "
"The real chunk size is max(request_size, "
"FLAGS_auto_growth_chunk_size_in_mb).");
#endif
/**
......
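As the flag comment above states, the value is only a lower bound: each new chunk requested from the device is max(request_size, chunk_size), with the MB value converted to bytes by the << 20 shift in InitAutoGrowthCUDAAllocator. A small illustrative model of that rule (the helper name is made up for this sketch):

def effective_chunk_size(request_size_bytes, flag_mb):
    chunk_size_bytes = flag_mb << 20  # MB -> bytes, as in InitAutoGrowthCUDAAllocator
    # The allocator grows by at least chunk_size_bytes per new chunk.
    return max(request_size_bytes, chunk_size_bytes)

assert effective_chunk_size(4 * 1024, 10) == 10 << 20   # small request -> 10 MB chunk
assert effective_chunk_size(64 << 20, 10) == 64 << 20   # large request keeps its own size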
@@ -16,18 +16,24 @@ import unittest
import numpy as np
import paddle
from paddle import fluid
fluid.core.globals()['FLAGS_allocator_strategy'] = 'auto_growth'
# it should be set at the beginning
if fluid.is_compiled_with_cuda():
fluid.core.globals()['FLAGS_gpu_memory_limit_mb'] = 10
paddle.set_flags(
{
'FLAGS_allocator_strategy': 'auto_growth',
'FLAGS_auto_growth_chunk_size_in_mb': 10,
}
)
class TestBase(unittest.TestCase):
class TestMemoryLimit(unittest.TestCase):
def setUp(self):
self._limit = 10
if fluid.is_compiled_with_cuda():
self._limit = fluid.core.globals()['FLAGS_gpu_memory_limit_mb']
paddle.set_flags({'FLAGS_gpu_memory_limit_mb': 10})
def test_allocate(self):
if not fluid.is_compiled_with_cuda():
@@ -53,5 +59,20 @@ class TestBase(unittest.TestCase):
self.assertTrue(True)
class TestChunkSize(unittest.TestCase):
def test_allocate(self):
if not fluid.is_compiled_with_cuda():
return
paddle.rand([1024])
reserved, allocated = (
paddle.device.cuda.max_memory_reserved(),
paddle.device.cuda.max_memory_allocated(),
)
self.assertEqual(reserved, 1024 * 1024 * 10)
self.assertEqual(allocated, 1024 * 4)
if __name__ == '__main__':
unittest.main()
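The numbers asserted in TestChunkSize follow directly from the flag, assuming paddle.rand produces float32 values (4 bytes each):

# 1024 float32 elements are actually handed out ...
allocated = 1024 * 4           # bytes reported by max_memory_allocated
# ... but a whole chunk of FLAGS_auto_growth_chunk_size_in_mb is reserved.
reserved = 1024 * 1024 * 10    # bytes reported by max_memory_reserved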