未验证 提交 8fe1cb72 编写于 作者: Y Yang 提交者: GitHub

optimize buddy_allocator (#38312)

上级 64e2f670
......@@ -13,18 +13,18 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/memory/detail/buddy_allocator.h"
#include <algorithm>
#include "gflags/gflags.h"
#include "glog/logging.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_MLU)
DECLARE_uint64(reallocate_gpu_memory_in_mb);
#endif
#ifdef PADDLE_WITH_ASCEND_CL
defined(PADDLE_WITH_MLU) || defined(PADDLE_WITH_ASCEND_CL)
#define USE_DEVICE
DECLARE_uint64(reallocate_gpu_memory_in_mb);
#endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
......@@ -180,33 +180,24 @@ uint64_t BuddyAllocator::Release() {
std::lock_guard<std::mutex> lock(mutex_);
int num = 0;
uint64_t bytes = 0;
bool del_flag = false;
for (auto iter = pool_.begin(); iter != pool_.end();) {
auto remain_size = std::get<1>(*iter);
auto remain_ptr = std::get<2>(*iter);
for (auto& chunk : chunks_) {
auto init_size = std::get<1>(chunk);
auto init_ptr = std::get<2>(chunk);
if (init_size == remain_size && init_ptr == remain_ptr) {
++num;
bytes += init_size;
total_free_ -= init_size;
auto block = static_cast<MemoryBlock*>(std::get<2>(chunk));
system_allocator_->Free(init_ptr, init_size, std::get<0>(chunk));
cache_.Invalidate(block);
del_flag = true;
break;
}
}
if (del_flag) {
auto found = chunks_.find({remain_size, remain_ptr});
if (found != chunks_.end()) {
size_t index = found->second;
++num;
bytes += remain_size;
total_free_ -= remain_size;
auto block = static_cast<MemoryBlock*>(remain_ptr);
system_allocator_->Free(remain_ptr, remain_size, index);
cache_.Invalidate(block);
iter = pool_.erase(iter);
} else {
iter++;
}
}
VLOG(10) << "Release " << num << " chunk, Free " << bytes << " bytes.";
VLOG(10) << "Release " << num << " chunks, Free " << bytes << " bytes.";
return bytes;
}
......@@ -234,49 +225,15 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool(
size_t index = 0;
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (system_allocator_->UseGpu()) {
if ((total_used_ + total_free_) == 0) {
// Compute the allocation size for gpu for the first allocation.
allocate_bytes = std::max(platform::GpuInitAllocSize(), request_bytes);
} else {
// Compute the re-allocation size, we store the re-allocation size when
// user set FLAGS_reallocate_gpu_memory_in_mb to fix value.
if (realloc_size_ == 0 || FLAGS_reallocate_gpu_memory_in_mb == 0ul) {
realloc_size_ = platform::GpuReallocSize();
}
allocate_bytes = std::max(realloc_size_, request_bytes);
}
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
if (system_allocator_->UseGpu()) {
if ((total_used_ + total_free_) == 0) {
// Compute the allocation size for gpu for the first allocation.
allocate_bytes = std::max(platform::NPUInitAllocSize(), request_bytes);
} else {
// Compute the re-allocation size, we store the re-allocation size when
// user set FLAGS_reallocate_gpu_memory_in_mb to fix value.
if (realloc_size_ == 0 || FLAGS_reallocate_gpu_memory_in_mb == 0ul) {
realloc_size_ = platform::NPUReallocSize();
}
allocate_bytes = std::max(realloc_size_, request_bytes);
}
}
#endif
#ifdef PADDLE_WITH_MLU
if (system_allocator_->UseGpu()) {
if ((total_used_ + total_free_) == 0) {
// Compute the allocation size for mlu for the first allocation.
allocate_bytes = std::max(platform::MLUInitAllocSize(), request_bytes);
} else {
// Compute the re-allocation size, we store the re-allocation size when
// user set FLAGS_reallocate_gpu_memory_in_mb to fix value.
if (realloc_size_ == 0 || FLAGS_reallocate_gpu_memory_in_mb == 0ul) {
realloc_size_ = platform::MLUReallocSize();
}
allocate_bytes = std::max(realloc_size_, request_bytes);
}
}
allocate_bytes = DeviceAllocateSize(&platform::GpuInitAllocSize,
&platform::GpuReallocSize, request_bytes);
#elif defined(PADDLE_WITH_ASCEND_CL)
allocate_bytes = DeviceAllocateSize(&platform::NPUInitAllocSize,
&platform::NPUReallocSize, request_bytes);
#elif defined(PADDLE_WITH_MLU)
allocate_bytes =
    DeviceAllocateSize(&platform::MLUInitAllocSize,
                       &platform::MLUReallocSize, request_bytes);
#endif
// Allocate a new block
......@@ -293,7 +250,7 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool(
total_free_ += allocate_bytes;
// record the chunk.
chunks_.insert(IndexSizeAddress(index, allocate_bytes, p));
chunks_.insert({{allocate_bytes, p}, index});
// dump the block into pool
return pool_.insert(IndexSizeAddress(index, allocate_bytes, p)).first;
......@@ -350,6 +307,31 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it,
return block;
}
/// Decides how many bytes to request from the system allocator when the
/// pool is refilled on a device build (CUDA/HIP/NPU/MLU).
///
/// \param init_allocate_size_func  yields the size to use for the very first
///        device allocation (when nothing has been allocated yet).
/// \param re_allocate_size_func    yields the growth size for every later
///        refill; cached in realloc_size_ once resolved.
/// \param request_bytes            the size the caller actually asked for.
/// \return the number of bytes to allocate — never less than request_bytes
///         on the device path; plain max_chunk_size_ otherwise.
size_t BuddyAllocator::DeviceAllocateSize(
    std::function<size_t()> init_allocate_size_func,
    std::function<size_t()> re_allocate_size_func, size_t request_bytes) {
#if defined(USE_DEVICE)
  const bool on_device = system_allocator_->UseGpu();
  VLOG(10) << "use_gpu " << on_device << ", total_used " << total_used_
           << ", total_free " << total_free_;
  if (on_device) {
    if (total_used_ == 0 && total_free_ == 0) {
      // Nothing allocated yet: size the first chunk from the initial-alloc
      // policy, but never below what was requested.
      return std::max(init_allocate_size_func(), request_bytes);
    }
    // Re-resolve the growth size while it is unset, or whenever the user
    // left FLAGS_reallocate_gpu_memory_in_mb at its 0 default.
    if (realloc_size_ == 0 || FLAGS_reallocate_gpu_memory_in_mb == 0ul) {
      realloc_size_ = re_allocate_size_func();
    }
    return std::max(realloc_size_, request_bytes);
  }
#endif
  // Non-device allocators fall back to the fixed maximum chunk size.
  return max_chunk_size_;
}
} // namespace detail
} // namespace memory
} // namespace paddle
......@@ -15,11 +15,14 @@ limitations under the License. */
#pragma once
#include <stdint.h>
#include <functional>
#include <map>
#include <memory>
#include <mutex> // NOLINT
#include <set>
#include <tuple>
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/memory/detail/memory_block.h"
......@@ -59,6 +62,9 @@ class BuddyAllocator {
using IndexSizeAddress = std::tuple<size_t, size_t, void*>;
// Each element in PoolSet is a free allocation
using PoolSet = std::set<IndexSizeAddress>;
// Each element in PoolMap is an allocation record
// key: <size, ptr>, value: index
using PoolMap = std::map<std::pair<size_t, void*>, size_t>;
/*! \brief Allocate fixed-size memory from system */
void* SystemAlloc(size_t size);
......@@ -80,6 +86,11 @@ class BuddyAllocator {
/*! \brief Find the existing chunk which used to allocation */
PoolSet::iterator FindExistChunk(size_t size);
/*! \brief Allocate bytes from the device */
size_t DeviceAllocateSize(std::function<size_t()> init_allocate_size_func,
std::function<size_t()> re_allocate_size_func,
size_t request_bytes);
private:
size_t total_used_ = 0; // the total size of used memory
size_t total_free_ = 0; // the total size of free memory
......@@ -102,7 +113,7 @@ class BuddyAllocator {
/**
* \brief Record the allocated chunks when Refill pool.
*/
PoolSet chunks_;
PoolMap chunks_;
private:
/*! Unify the metadata format between GPU and CPU allocations */
......
......@@ -189,6 +189,35 @@ TEST(BuddyAllocator, FractionRefillPool) {
buddy_allocator.Free(p1);
}
// When both FLAGS_initial_gpu_memory_in_mb and
// FLAGS_reallocate_gpu_memory_in_mb are pinned to a fixed size, every pool
// refill should request exactly that many MB and the allocator's max chunk
// size must stay stable across successive allocations.
TEST(BuddyAllocator, DeviceRefillPool) {
  const size_t chunk_mb = 10;
  const size_t chunk_bytes = chunk_mb << 20;
  FLAGS_initial_gpu_memory_in_mb = chunk_mb;
  FLAGS_reallocate_gpu_memory_in_mb = chunk_mb;
  EXPECT_EQ(platform::GpuMaxChunkSize(), chunk_bytes);

  size_t max_chunk_size = platform::GpuMaxChunkSize();
  BuddyAllocator buddy_allocator(
      std::unique_ptr<SystemAllocator>(new GPUAllocator(TEST_GPU_ID)),
      platform::GpuMinChunkSize(), max_chunk_size);

  int* first_ptr = TestBuddyAllocator(&buddy_allocator, chunk_bytes - 1000,
                                      /* use_system_allocator = */ false,
                                      /* free_ptr = */ false);
  // Max chunk size must not drift after the first allocation.
  EXPECT_EQ(max_chunk_size, buddy_allocator.GetMaxChunkSize());

  int* second_ptr = TestBuddyAllocator(&buddy_allocator, chunk_bytes - 1000,
                                       /* use_system_allocator = */ false,
                                       /* free_ptr = */ false);
  // ... nor after a second refill of the pool.
  EXPECT_EQ(max_chunk_size, buddy_allocator.GetMaxChunkSize());

  buddy_allocator.Free(first_ptr);
  buddy_allocator.Free(second_ptr);
}
TEST(BuddyAllocator, AllocFromAvailable) {
FLAGS_fraction_of_gpu_memory_to_use = 0.7;
FLAGS_initial_gpu_memory_in_mb = 0;
......@@ -350,7 +379,6 @@ TEST(BuddyAllocator, Release) {
#ifdef PADDLE_WITH_ASCEND_CL
TEST(BuddyAllocator, NpuFraction) {
// In a 16 GB machine, the pool size will be about 160 MB
FLAGS_fraction_of_gpu_memory_to_use = 0.005;
FLAGS_fraction_of_gpu_memory_to_use = 0.92;
FLAGS_initial_gpu_memory_in_mb = 0;
FLAGS_reallocate_gpu_memory_in_mb = 0;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册