diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc index ebe680f5eea4948339fb8c5584a5b9f5d71c752e..2462ba084b996b33cf95925ea6d2c476b53f4182 100644 --- a/paddle/memory/detail/buddy_allocator.cc +++ b/paddle/memory/detail/buddy_allocator.cc @@ -12,22 +12,161 @@ See the License for the specific language governing permissions and limitations under the License. */ -#pragma once - #include "paddle/memory/detail/buddy_allocator.h" +#include "glog/logging.h" namespace paddle { namespace memory { namespace detail { -BuddyAllocator::BuddyAllocator(size_t pool_size, size_t max_pools, - SystemAllocator* system_allocator) - : pool_size_(pool_size), - max_pools_(max_pools), - system_allocator_(system_allocator) { - PADDLE_ASSERT(pool_size > 0); - PADDLE_ASSERT(max_pools > 0); +BuddyAllocator::BuddyAllocator(SystemAllocator* system_allocator, + size_t min_chunk_size, size_t max_chunk_size) { + PADDLE_ASSERT(min_chunk_size > 0); + PADDLE_ASSERT(max_chunk_size > 0); PADDLE_ASSERT(system_allocator != nullptr); + + system_allocator_ = std::move(system_allocator); + min_chunk_size_ = min_chunk_size; + max_chunk_size_ = max_chunk_size; +} + +inline size_t align(size_t size, size_t alignment) { + size_t remaining = size % alignment; + return remaining == 0 ? size : size + (alignment - remaining); +} + +void* BuddyAllocator::Alloc(size_t unaligned_size) { + // adjust allocation alignment + size_t size = align(unaligned_size + sizeof(Metadata), min_chunk_size_); + + // acquire the allocator lock + std::lock_guard lock(mutex_); + + DLOG(INFO) << "Allocate " << unaligned_size << " bytes from chunk size " + << size; + + // if the allocation is huge, send directly to the system allocator + if (size > max_chunk_size_) { + DLOG(INFO) << "Allocate from system allocator."; + + return SystemAlloc(size); + } + + // query and allocate from the existing chunk + auto it = FindExistChunk(size); + + // refill the pool if failure + if (it == pool_.end()) { + it = RefillPool(); + } else { + DLOG(INFO) << " Allocation from existing memory block " << std::get<2>(*it) + << " at address " + << reinterpret_cast(std::get<2>(*it))->data(); + } + + // if still failure, fail fatally + if (it == pool_.end()) { + return nullptr; + } + + total_used_ += size; + total_free_ -= size; + + // split the allocation and return data for use + return reinterpret_cast(SplitToAlloc(it, size))->data(); +} + +void* BuddyAllocator::SystemAlloc(size_t size) { + size_t index = 0; + void* p = system_allocator_->Alloc(index, size); + + DLOG(INFO) << "Allocated " << p << " from system allocator."; + + if (p == nullptr) return nullptr; + + static_cast(p)->init(cache_, MemoryBlock::HUGE_CHUNK, index, + size, nullptr, nullptr); + + return static_cast(p)->data(); +} + +BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() { +#ifndef PADDLE_ONLY_CPU + if (system_allocator_->UseGpu()) { + if ((total_used_ + total_free_) == 0) { + // Compute the maximum allocation size for the first allocation. + max_chunk_size_ = platform::GpuMaxChunkSize(); + } + } +#endif // PADDLE_ONLY_CPU + + // Allocate a new maximum sized block + size_t index = 0; + void* p = system_allocator_->Alloc(index, max_chunk_size_); + + if (p == nullptr) return pool_.end(); + + DLOG(INFO) << " Creating and inserting new block " << p + << " from system allocator"; + + static_cast(p)->init(cache_, MemoryBlock::FREE_CHUNK, index, + max_chunk_size_, nullptr, nullptr); + + total_free_ += max_chunk_size_; + + // dump the block into pool + return pool_.insert({index, max_chunk_size_, p}).first; +} + +BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) { + size_t index = 0; + + while (1) { + auto it = pool_.lower_bound({index, size, nullptr}); + if (it == pool_.end()) return it; + + if (std::get<0>(*it) > index) { + if (std::get<1>(*it) >= size) { + return it; + } + + index = std::get<0>(*it); + continue; + } + return it; + } +} + +void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it, + size_t size) { + auto block = static_cast(std::get<2>(*it)); + + pool_.erase(it); + + DLOG(INFO) << " Split block (" << block << ", " << block->total_size(cache_) + << ") into"; + + block->split(cache_, size); + + DLOG(INFO) << " Left block (" << block << ", " << block->total_size(cache_) + << ")"; + + block->set_type(cache_, MemoryBlock::ARENA_CHUNK); + + // the rest of memory if exist + if (block->has_right_buddy(cache_)) { + if (block->right_buddy(cache_)->type(cache_) == MemoryBlock::FREE_CHUNK) { + DLOG(INFO) << " Insert right block (" << block->right_buddy(cache_) + << ", " << block->right_buddy(cache_)->total_size(cache_) + << ")"; + + pool_.insert({block->right_buddy(cache_)->index(cache_), + block->right_buddy(cache_)->total_size(cache_), + block->right_buddy(cache_)}); + } + } + + return block; } } // namespace detail diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h index 82e6aaedc719966b4074449ce1ef7193c73dc265..38bedc9a18366c328e17da744dbb16b0940bbeef 100644 --- a/paddle/memory/detail/buddy_allocator.h +++ b/paddle/memory/detail/buddy_allocator.h @@ -15,9 +15,15 @@ #pragma once #include "paddle/memory/detail/system_allocator.h" +#include "paddle/memory/detail/metadata.h" +#include "paddle/platform/assert.h" +#include "paddle/platform/cpu_info.h" +#include "paddle/platform/gpu_info.h" +#include #include #include +#include namespace paddle { namespace memory { @@ -25,55 +31,83 @@ namespace detail { class BuddyAllocator { public: - BuddyAllocator(size_t pool_size, size_t max_pools, - SystemAllocator* system_allocator); + BuddyAllocator(SystemAllocator* system_allocator, size_t min_chunk_size, + size_t max_chunk_size); + ~BuddyAllocator(); - void* Alloc(size_t size); + public: + void* Alloc(size_t unaligned_size); void Free(void*); size_t Used(); + public: + // Disable copy and assignment. + BuddyAllocator(const BuddyAllocator&) = delete; + BuddyAllocator& operator=(const BuddyAllocator&) = delete; + private: - struct Block { - size_t size_; - Block* left_; // left buddy - Block* right_; // right buddy - }; + // Tuple type: allocator index, memory size, memory address + using IndexSizeAddress = std::tuple; + using PoolSet = std::set; - // Initially, there is only one pool. If a Alloc founds not enough - // memory from that pool, and there has not been max_num_pools_, - // create a new pool by calling system_allocator_.Alloc(pool_size_). - std::vector pools_; + /*! \brief Allocate fixed-size memory from system */ + void* SystemAlloc(size_t size); - size_t pool_size_; // the size of each pool; - size_t max_num_pools_; // the size of all pools; + /*! \brief If existing chunks are not suitable, refill pool */ + PoolSet::iterator RefillPool(); - SystemAllocator* system_allocator_; + /** + * \brief Find the suitable chunk from existing pool + * + * \param it pool iterator which contains suitable block. + * \param size the size of allocation. + */ + void* SplitToAlloc(PoolSet::iterator it, size_t size); - std::mutex mutex_; + /*! \brief Find the existing chunk which used to allocation */ + PoolSet::iterator FindExistChunk(size_t size); - // Disable copy and assignment. - BuddyAllocator(const BuddyAllocator&) = delete; - BuddyAllocator& operator=(const BuddyAllocator&) = delete; + private: + size_t total_used_ = 0; // the total size of used memory + size_t total_free_ = 0; // the total size of free memory + + size_t min_chunk_size_; // the minimum size of each chunk + size_t max_chunk_size_; // the maximum size of each chunk + + private: + PoolSet pool_; + + private: + // Unify the metadata format between GPU and CPU allocations + using MetadataCache = std::unordered_map; + MetadataCache cache_; + + private: + SystemAllocator* system_allocator_; + std::mutex mutex_; }; -BuddyAllocator* GetCPUBuddyAllocator() { - static BuddyAllocator* a = nullptr; +BuddyAllocator* GetCPUBuddyAllocator() { + static BuddyAllocator* a = nullptr; if (a == nullptr) { - a = new BuddyAllocator(); + a = new BuddyAllocator(new CPUAllocator, platform::CpuMinChunkSize(), + platform::CpuMaxChunkSize()); } return a; } #ifndef PADDLE_ONLY_CPU // The following code are for CUDA. -BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { - static BuddyAllocator** as = NULL; +BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { + static BuddyAllocator** as = NULL; if (as == NULL) { - int gpu_num = platform::GetDeviceCount(); - as = new BuddyAllocator*[gpu_num]; + int gpu_num = platform::GpuDeviceCount(); + as = new BuddyAllocator*[gpu_num]; for (int gpu = 0; gpu < gpu_num; gpu++) { - as[gpu] = new BuddyAllocator(); + as[gpu] = + new BuddyAllocator(new GPUAllocator, platform::GpuMinChunkSize(), + platform::GpuMaxChunkSize()); } } return as[gpu_id];