diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt
index a95ea3964791c623dedb87ce483a90c7ced37c00..26ae89fe2869cbb0486daa6ce37c9bb86f043d7e 100644
--- a/paddle/fluid/memory/allocation/CMakeLists.txt
+++ b/paddle/fluid/memory/allocation/CMakeLists.txt
@@ -8,6 +8,9 @@ cc_library(legacy_allocator SRCS legacy_allocator.cc DEPS allocator buddy_alloca
 cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator buffered_allocator cpu_allocator)
 cc_test(multi_bin_buffered_allocator_test SRCS multi_bin_buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator multi_bin_buffered_allocator cpu_allocator)
 
+cc_library(auto_increment_best_fit_allocator SRCS auto_increment_best_fit_allocator.cc DEPS allocator)
+cc_test(auto_increment_best_fit_allocator_test SRCS auto_increment_best_fit_allocator_test.cc DEPS cpu_allocator auto_increment_best_fit_allocator)
+
 if (WITH_GPU)
   nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
 endif()
@@ -56,6 +59,7 @@ cc_library(allocator_facade SRCS allocator_facade.cc DEPS
   retry_allocator
   buffered_allocator
   multi_bin_buffered_allocator
+  auto_increment_best_fit_allocator
   allocator_strategy
   legacy_allocator
   )
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index 1a9f5e8f7f018029128b7dc9ddd0cb07549cc0b8..b35032fb3c1d4436e94851b7731572834ce0cf37 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -22,6 +22,7 @@
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/fluid/memory/allocation/allocator_strategy.h"
 #include "paddle/fluid/memory/allocation/auto_increment_allocator.h"
+#include "paddle/fluid/memory/allocation/auto_increment_best_fit_allocator.h"
 #include "paddle/fluid/memory/allocation/best_fit_allocator.h"
 #include "paddle/fluid/memory/allocation/conditional_allocator.h"
 #include "paddle/fluid/memory/allocation/cpu_allocator.h"
@@ -195,17 +196,57 @@ class AllocatorFacadePrivate {
   ~AllocatorFacadePrivate() = default;
 
   AllocatorFacadePrivate() {
-    if (GetAllocatorStrategy() == AllocatorStrategy::kLegacy) {
-      InitLegacyAllocator();
-    } else {
-      InitCPUAllocator();
-      InitCUDAAllocator();
-      InitCUDAPinnedAllocator();
-      WrapZeroSizeAllocator();
+    auto strategy = GetAllocatorStrategy();
+    switch (strategy) {
+      case AllocatorStrategy::kLegacy: {
+        InitLegacyAllocator();
+        break;
+      }
+      case AllocatorStrategy::kNaiveBestFit: {
+        InitCPUAllocator();
+        InitCUDAAllocator();
+        InitCUDAPinnedAllocator();
+        WrapZeroSizeAllocator();
+        break;
+      }
+      case AllocatorStrategy::kAutoGrowthBestFit: {
+        InitCPUAllocator();
+        InitAutoGrowthCUDAAllocator();
+        InitAutoGrowthCUDAPinnedAllocator();
+        WrapZeroSizeAllocator();
+        break;
+      }
+      default: {
+        PADDLE_THROW("Unsupported allocator strategy: %d",
+                     static_cast<int>(strategy));
+      }
     }
   }
 
  private:
+  void InitAutoGrowthCUDAAllocator() {
+#ifdef PADDLE_WITH_CUDA
+    int dev_cnt = platform::GetCUDADeviceCount();
+    for (int dev_id = 0; dev_id < dev_cnt; ++dev_id) {
+      auto cuda_allocator = std::make_shared<AlignedAllocator<4096>>(
+          std::make_shared<CUDAAllocator>(platform::CUDAPlace(dev_id)));
+      allocators_[platform::CUDAPlace(dev_id)] =
+          std::make_shared<AutoIncrementBestFitAllocator>(
+              cuda_allocator, platform::GpuMaxChunkSize(), 4096);
+    }
+#endif
+  }
+
+  void InitAutoGrowthCUDAPinnedAllocator() {
+#ifdef PADDLE_WITH_CUDA
+    auto cuda_pinned_allocator = std::make_shared<AlignedAllocator<4096>>(
+        std::make_shared<CPUPinnedAllocator>());
+    allocators_[platform::CUDAPinnedPlace()] =
+        std::make_shared<AutoIncrementBestFitAllocator>(
+            cuda_pinned_allocator, platform::CUDAPinnedMaxChunkSize(), 4096);
+#endif
+  }
+
   void InitLegacyAllocator() {
     std::vector<platform::Place> places{platform::CPUPlace()};
 #ifdef PADDLE_WITH_CUDA
diff --git a/paddle/fluid/memory/allocation/allocator_strategy.cc b/paddle/fluid/memory/allocation/allocator_strategy.cc
index b46b1e9ae206b82f5810b4ba7345ebc60fb84285..d96fe0851d0ddeba2b982899c13a0d55677a179d 100644
--- a/paddle/fluid/memory/allocation/allocator_strategy.cc
+++ b/paddle/fluid/memory/allocation/allocator_strategy.cc
@@ -14,6 +14,7 @@
 
 #include "paddle/fluid/memory/allocation/allocator_strategy.h"
 #include "gflags/gflags.h"
+#include "paddle/fluid/platform/enforce.h"
 
 DEFINE_string(
     allocator_strategy, "legacy",
@@ -25,9 +26,16 @@ namespace memory {
 namespace allocation {
 
 static AllocatorStrategy GetStrategyFromFlag() {
-  return FLAGS_allocator_strategy == "legacy"
-             ? AllocatorStrategy::kLegacy
-             : AllocatorStrategy::kNaiveBestFit;
+  if (FLAGS_allocator_strategy == "legacy") {
+    return AllocatorStrategy::kLegacy;
+  } else if (FLAGS_allocator_strategy == "naive_best_fit") {
+    return AllocatorStrategy::kNaiveBestFit;
+  } else if (FLAGS_allocator_strategy == "auto_growth_best_fit") {
+    return AllocatorStrategy::kAutoGrowthBestFit;
+  } else {
+    PADDLE_THROW("Unsupported allocator strategy: %s",
+                 FLAGS_allocator_strategy);
+  }
 }
 
 AllocatorStrategy GetAllocatorStrategy() {
diff --git a/paddle/fluid/memory/allocation/allocator_strategy.h b/paddle/fluid/memory/allocation/allocator_strategy.h
index 9adbd879939c562cf84579a92f21d3b82e69a7e5..9dad9c01901b8baf06679a52361c12d98f8cf8ea 100644
--- a/paddle/fluid/memory/allocation/allocator_strategy.h
+++ b/paddle/fluid/memory/allocation/allocator_strategy.h
@@ -18,7 +18,7 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 
-enum class AllocatorStrategy { kLegacy, kNaiveBestFit };
+enum class AllocatorStrategy { kLegacy, kNaiveBestFit, kAutoGrowthBestFit };
 
 extern AllocatorStrategy GetAllocatorStrategy();
diff --git a/paddle/fluid/memory/allocation/auto_increment_best_fit_allocator.cc b/paddle/fluid/memory/allocation/auto_increment_best_fit_allocator.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ee52b10aa613e61544935a485b6d53fed3c903c9
--- /dev/null
+++ b/paddle/fluid/memory/allocation/auto_increment_best_fit_allocator.cc
@@ -0,0 +1,136 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/memory/allocation/auto_increment_best_fit_allocator.h"
+#include <algorithm>
+#include <list>
+#include <map>
+#include <memory>
+#include <mutex>  // NOLINT
+#include <utility>
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+// Round size up to a multiple of alignment.
+static size_t align(size_t size, size_t alignment) {
+  auto remaining = size % alignment;
+  return remaining == 0 ? size : size + alignment - remaining;
+}
+
+AutoIncrementBestFitAllocator::AutoIncrementBestFitAllocator(
+    const std::shared_ptr<Allocator> &underlying_allocator, size_t chunk_size,
+    size_t alignment)
+    : underlying_allocator_(underlying_allocator),
+      chunk_size_(align(chunk_size, alignment)),
+      alignment_(alignment) {}
+
+Allocation *AutoIncrementBestFitAllocator::AllocateImpl(size_t size,
+                                                        Attr attr) {
+  if (size == 0) return nullptr;
+  size = align(size, alignment_);
+  std::lock_guard<std::mutex> guard(mtx_);
+  // Best fit: the smallest free block whose size is >= the requested size.
+  auto iter = free_blocks_.lower_bound(std::make_pair(size, nullptr));
+  BlockIt block_it;
+  if (iter != free_blocks_.end()) {
+    VLOG(2) << "Found " << iter->second->size_ << " for " << size;
+    block_it = iter->second;
+    free_blocks_.erase(iter);
+    auto *chunk = block_it->chunk_;
+    size_t remaining_size = block_it->size_ - size;
+    if (remaining_size == 0) {
+      block_it->is_free_ = false;
+      VLOG(2) << "Found and no remaining";
+    } else {
+      // Split the block: the head stays free, the tail is handed out.
+      auto remaining_free_block = chunk->blocks_.insert(
+          block_it, Chunk::Block(block_it->ptr_, remaining_size, true, chunk));
+      free_blocks_.emplace(std::make_pair(remaining_size, block_it->ptr_),
+                           remaining_free_block);
+      block_it->ptr_ =
+          reinterpret_cast<uint8_t *>(block_it->ptr_) + remaining_size;
+      block_it->size_ = size;
+      block_it->is_free_ = false;
+      VLOG(2) << "Found and remaining " << remaining_size;
+    }
+  } else {
+    // No free block fits: grow by a new chunk from the underlying allocator.
+    size_t alloc_size = size;
+    if (!underlying_allocator_exhaustive_ && chunk_size_ > size) {
+      alloc_size = chunk_size_;
+    }
+
+    try {
+      chunks_.emplace_back(underlying_allocator_->Allocate(alloc_size, attr));
+    } catch (BadAlloc &ex) {
+      // A whole chunk no longer fits; fall back to the exact size once.
+      if (size == alloc_size) throw;
+      underlying_allocator_exhaustive_ = true;
+      alloc_size = size;
+      chunks_.emplace_back(underlying_allocator_->Allocate(alloc_size, attr));
+    }
+    auto *chunk = &(*chunks_.rbegin());
+    uint8_t *p = reinterpret_cast<uint8_t *>(chunk->allocation_->ptr());
+    auto &blocks = chunk->blocks_;
+
+    size_t remaining_size = alloc_size - size;
+    if (remaining_size > 0) {
+      blocks.emplace_back(p, remaining_size, true, chunk);
+      free_blocks_.emplace(std::make_pair(remaining_size, p), --(blocks.end()));
+    }
+    blocks.emplace_back(p + remaining_size, size, false, chunk);
+    block_it = --(blocks.end());
+    VLOG(2) << "Not found and allocate " << alloc_size << ", and remaining "
+            << remaining_size;
+  }
+  VLOG(2) << "After allocate, free blocks " << free_blocks_.size();
+  return new Chunk::BlockAllocation(block_it);
+}
+
+void AutoIncrementBestFitAllocator::FreeImpl(Allocation *allocation) {
+  auto &block_it =
+      static_cast<Chunk::BlockAllocation *>(allocation)->block_it_;
+  auto &blocks = block_it->chunk_->blocks_;
+
+  std::lock_guard<std::mutex> guard(mtx_);
+  block_it->is_free_ = true;
+
+  if (block_it != blocks.begin()) {
+    auto prev_it = block_it;
+    --prev_it;
+
+    if (prev_it->is_free_) {
+      // Coalesce with the preceding free block.
+      free_blocks_.erase(std::make_pair(prev_it->size_, prev_it->ptr_));
+      prev_it->size_ += block_it->size_;
+      blocks.erase(block_it);
+      block_it = prev_it;
+    }
+  }
+
+  auto next_it = block_it;
+  ++next_it;
+
+  if (next_it != blocks.end() && next_it->is_free_) {
+    // Coalesce with the following free block.
+    free_blocks_.erase(std::make_pair(next_it->size_, next_it->ptr_));
+    block_it->size_ += next_it->size_;
+    blocks.erase(next_it);
+  }
+
+  free_blocks_.emplace(std::make_pair(block_it->size_, block_it->ptr_),
+                       block_it);
+
+  VLOG(2) << "Combine " << block_it->size_ << ", " << blocks.size() << ", "
+          << free_blocks_.size();
+  delete allocation;
+}
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/auto_increment_best_fit_allocator.h b/paddle/fluid/memory/allocation/auto_increment_best_fit_allocator.h
new file mode 100644
index 0000000000000000000000000000000000000000..6e569c2627cf752d89b9d37c3c546215ec3df223
--- /dev/null
+++ b/paddle/fluid/memory/allocation/auto_increment_best_fit_allocator.h
@@ -0,0 +1,87 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <list>
+#include <map>
+#include <memory>
+#include <mutex>  // NOLINT
+#include <utility>
+#include "paddle/fluid/memory/allocation/allocator.h"
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+// A best-fit allocator that grows on demand: memory is requested from the
+// underlying allocator chunk by chunk, and each chunk is split into blocks
+// that are reused through a size-ordered free list.
+class AutoIncrementBestFitAllocator : public Allocator {
+ public:
+  explicit AutoIncrementBestFitAllocator(
+      const std::shared_ptr<Allocator> &underlying_allocator,
+      size_t chunk_size, size_t alignment);
+
+  bool IsAllocThreadSafe() const override { return true; }
+
+  using AllocationList = std::list<AllocationPtr>;
+  using AllocationListIt = AllocationList::iterator;
+
+  struct Chunk {
+    struct Block {
+      Block(void *ptr, size_t size, bool is_free, Chunk *chunk)
+          : ptr_(ptr), size_(size), is_free_(is_free), chunk_(chunk) {}
+
+      void *ptr_;
+      size_t size_;
+      bool is_free_;
+      Chunk *chunk_;  // which chunk it is from
+    };
+
+    explicit Chunk(AllocationPtr allocation)
+        : allocation_(std::move(allocation)) {}
+
+    AllocationPtr allocation_;
+    std::list<Block> blocks_;
+
+    struct BlockAllocation : public Allocation {
+      explicit BlockAllocation(const std::list<Block>::iterator &it)
+          : Allocation(it->ptr_, it->size_, it->chunk_->allocation_->place()),
+            block_it_(it) {}
+
+      std::list<Block>::iterator block_it_;
+    };
+  };
+
+ protected:
+  Allocation *AllocateImpl(size_t size, Attr attr) override;
+
+  void FreeImpl(Allocation *allocation) override;
+
+ private:
+  using BlockIt = std::list<Chunk::Block>::iterator;
+
+  std::shared_ptr<Allocator> underlying_allocator_;
+  std::list<Chunk> chunks_;
+  // Free blocks keyed by (size, ptr), so lower_bound() yields the best fit.
+  std::map<std::pair<size_t, void *>, BlockIt> free_blocks_;
+  size_t chunk_size_;
+  size_t alignment_;
+
+  bool underlying_allocator_exhaustive_{false};
+
+  mutable std::mutex mtx_;
+};
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/auto_increment_best_fit_allocator_test.cc b/paddle/fluid/memory/allocation/auto_increment_best_fit_allocator_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c5fb209279652f20f4f72cbf168dd09862ffc55b
--- /dev/null
+++ b/paddle/fluid/memory/allocation/auto_increment_best_fit_allocator_test.cc
@@ -0,0 +1,74 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gflags/gflags.h>
+#include <gtest/gtest.h>
+
+#include <condition_variable>  // NOLINT
+#include <mutex>               // NOLINT
+#include <thread>              // NOLINT
+#include <vector>
+
+#include <iostream>
+
+#include "paddle/fluid/memory/allocation/auto_increment_best_fit_allocator.h"
+#include "paddle/fluid/memory/allocation/cpu_allocator.h"
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+TEST(allocator, auto_increment_best_fit_allocator) {
+  auto cpu_allocator = std::make_shared<CPUAllocator>();
+
+  // chunk_size == 0: every request goes to the underlying allocator with
+  // its exact (aligned) size.
+  auto allocator = std::make_shared<AutoIncrementBestFitAllocator>(
+      cpu_allocator, 0, 4096);
+
+  std::mutex mtx;
+  std::condition_variable cv;
+  bool flag = false;
+
+  auto thread_main = [&] {
+    {
+      std::unique_lock<std::mutex> lock(mtx);
+      cv.wait(lock, [&] { return flag; });
+    }
+    for (size_t i = 10; i > 0; --i) {
+      allocator->Allocate((i + 1) * 1000);
+    }
+  };
+
+  std::vector<std::thread> ths;
+  for (size_t i = 0; i < 10; ++i) {
+    ths.emplace_back(thread_main);
+  }
+
+  {
+    std::lock_guard<std::mutex> lock(mtx);
+    flag = true;
+  }
+  cv.notify_all();
+
+  thread_main();
+
+  for (auto &th : ths) {
+    th.join();
+  }
+
+  std::cout << "test ends" << std::endl;
+}
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
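
For anyone trying the new allocator outside the unit test, here is a minimal usage sketch. It is illustrative only and not part of the diff: it assumes the classes introduced above, Paddle's `Allocation::ptr()`/`size()` accessors, and that `Allocator::Allocate` returns an `AllocationPtr` whose deleter routes back to `FreeImpl`; the chunk size and request sizes are arbitrary.

```cpp
#include <iostream>
#include <memory>

#include "paddle/fluid/memory/allocation/auto_increment_best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/cpu_allocator.h"

using paddle::memory::allocation::AutoIncrementBestFitAllocator;
using paddle::memory::allocation::CPUAllocator;

int main() {
  // Grow in 1 MB chunks; align every request to 4096 bytes.
  auto underlying = std::make_shared<CPUAllocator>();
  auto allocator = std::make_shared<AutoIncrementBestFitAllocator>(
      underlying, 1 << 20, 4096);

  // First request: no free block exists yet, so a whole 1 MB chunk is taken
  // from the underlying allocator and split; the unused head stays free.
  auto a = allocator->Allocate(100 * 1024);
  std::cout << "got " << a->size() << " bytes at " << a->ptr() << std::endl;

  // Releasing the allocation returns its block to the free list and
  // coalesces it with the adjacent free head of the chunk.
  a.reset();

  // Second request is served from the recycled chunk; no new chunk needed.
  auto b = allocator->Allocate(512 * 1024);
  std::cout << "got " << b->size() << " bytes at " << b->ptr() << std::endl;
  return 0;
}
```

The `(size, ptr)` key of `free_blocks_` is what turns `lower_bound()` into a best-fit search: among all free blocks large enough, the map iterates to the smallest one first, and the pointer component merely disambiguates equal sizes.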