diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 9cc7c267454a4dbd4e1f62ec971e4160d6088913..8a1a1115ad7bd3d917ac041504c3c20c6920ba9a 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -4,6 +4,7 @@ cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator) cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator) cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator) cc_library(naive_best_fit_allocator SRCS naive_best_fit_allocator.cc DEPS allocator buddy_allocator profiler) +cc_test(naive_best_fit_allocator_test SRCS naive_best_fit_allocator_test.cc DEPS naive_best_fit_allocator) cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS locked_allocator buffered_allocator cpu_allocator best_fit_allocator) if (WITH_MKLDNN) diff --git a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h index e54748a53679d1363246896a0982275c4ef09535..b83d3efb72b719662a49be5a3b9aaf27e7386ed0 100644 --- a/paddle/fluid/memory/allocation/allocator.h +++ b/paddle/fluid/memory/allocation/allocator.h @@ -178,12 +178,15 @@ class Allocator { FreeImpl(allocation); } + inline void Release(const platform::Place& place) { ReleaseImpl(place); } + // True if the `Allocate` is thread safe. 
virtual bool IsAllocThreadSafe() const; protected: virtual Allocation* AllocateImpl(size_t size) = 0; virtual void FreeImpl(Allocation* allocation); + virtual void ReleaseImpl(const platform::Place& place) {} }; using AllocationDeleter = Allocator::AllocationDeleter; diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 3213684c140b02e1fa4b846cb0448f9bc9d8f3ee..59b06d082872c11b56855bee75e9d14ac686d1e1 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -287,6 +287,11 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, return m_->GetAllocator(place, size)->Allocate(size); } +void AllocatorFacade::Release(const platform::Place& place) { + m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1) + ->Release(place); +} + } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index 64b6fe25c352e82d6320e26d95efb61f3cb4a5b1..2f2f222f6c74a5c957461258f43fb1abf65e29b1 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -44,6 +44,9 @@ class AllocatorFacade { // Allocate a unique allocation. AllocationPtr Alloc(const platform::Place& place, size_t size); + // Release unused memory pool. + void Release(const platform::Place& place); + // TODO(yy): Allocate a Copy-On-Write allocation? 
private: AllocatorFacade(); diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h index cbc126264ac2c09ef2532bf21834a648c02473ec..b55ebf18934f2ba4d7b67f76f1c55ba9b426780e 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h @@ -39,6 +39,9 @@ class AutoGrowthBestFitAllocator : public Allocator { void FreeImpl(Allocation *allocation) override; + // Release the memory block which is not used in pool. + void ReleaseImpl(const platform::Place &place) override { FreeIdleChunks(); } + private: void FreeIdleChunks(); diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc index 685248a88f71d695095bd844dea06558e5cbcee6..dbe2f0ac94453ec7de0361dcf4eeb4817a947525 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc @@ -65,6 +65,7 @@ static void TestFreeIdleChunk(bool free_idle_chunk, } else { ASSERT_EQ(recorded_allocator->AllocatedSize(), memory_size + alignment); } + ag_allocator->Release(platform::CPUPlace()); } } diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index c661c9f9c37509f6b55f6ce8b67b11752c68418a..842ebd16cf8afedc150caa65b2fdedf5c130bb1b 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -53,6 +53,9 @@ void *Alloc(const Place &place, size_t size); template void Free(const Place &place, void *p, size_t size); +template +void Release(const Place &place); + template size_t Used(const Place &place); @@ -99,6 +102,11 @@ void Free(const platform::CPUPlace &place, void *p, GetCPUBuddyAllocator()->Free(p); } +template <> 
+void Release(const platform::CPUPlace &place) { + GetCPUBuddyAllocator()->Release(); +} + template <> size_t Used(const platform::CPUPlace &place) { return GetCPUBuddyAllocator()->Used(); @@ -186,6 +194,17 @@ void Free(const platform::XPUPlace &place, void *p, #endif } +template <> +void Release(const platform::XPUPlace &place) { +#ifdef PADDLE_WITH_XPU + PADDLE_THROW( + platform::errors::PermissionDenied("Release XPU pool is not supported.")); +#else + PADDLE_THROW( + platform::errors::PermissionDenied("'XPUPlace' is not supported.")); +#endif +} + template <> size_t Used(const platform::XPUPlace &place) { #ifdef PADDLE_WITH_XPU @@ -313,6 +332,16 @@ void Free(const platform::CUDAPlace &place, void *p, #endif } +template <> +void Release(const platform::CUDAPlace &place) { +#ifdef PADDLE_WITH_CUDA + GetGPUBuddyAllocator(place.device)->Release(); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'CUDAPlace' is not supported in CPU only device.")); +#endif +} + #ifdef PADDLE_WITH_CUDA BuddyAllocator *GetCUDAPinnedBuddyAllocator() { static std::once_flag init_flag; @@ -371,6 +400,17 @@ void Free(const platform::CUDAPinnedPlace &place, #endif } +template <> +void Release( + const platform::CUDAPinnedPlace &place) { +#ifdef PADDLE_WITH_CUDA + GetCUDAPinnedBuddyAllocator()->Release(); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'CUDAPinnedPlace' is not supported in CPU only device.")); +#endif +} + struct AllocVisitor : public boost::static_visitor { inline explicit AllocVisitor(size_t size) : size_(size) {} @@ -397,6 +437,13 @@ struct FreeVisitor : public boost::static_visitor { size_t size_; }; +struct ReleaseVisitor : public boost::static_visitor { + template + inline void operator()(const Place &place) const { + Release(place); + } +}; + size_t Usage::operator()(const platform::CPUPlace &cpu) const { return Used(cpu); } @@ -439,6 +486,10 @@ void NaiveBestFitAllocator::FreeImpl(Allocation *allocation) { delete allocation; } +void 
NaiveBestFitAllocator::ReleaseImpl(const platform::Place &place) { + boost::apply_visitor(legacy::ReleaseVisitor(), place); +} + } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.h b/paddle/fluid/memory/allocation/naive_best_fit_allocator.h index 4cf1bd6123e5fb3b99c60cc0a2750ef6295ab870..ba4c4ca226b1e08428df332a6b9f2f6774a07692 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.h @@ -35,6 +35,7 @@ class NaiveBestFitAllocator : public Allocator { protected: Allocation *AllocateImpl(size_t size) override; void FreeImpl(Allocation *allocation) override; + void ReleaseImpl(const platform::Place &place) override; private: platform::Place place_; diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..054c75b11f78c7733c15ac39a44cdc45078af7e7 --- /dev/null +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc @@ -0,0 +1,74 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/memory/allocation/naive_best_fit_allocator.h" + +#include +#include // NOLINT +#include // NOLINT +#include // NOLINT +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +namespace paddle { +namespace memory { +namespace allocation { + +TEST(NaiveBestFitAllocatorTest, CpuAlloc) { + NaiveBestFitAllocator alloc{platform::CPUPlace()}; + { + size_t size = (1 << 20); + auto allocation = alloc.Allocate(size); + } + alloc.Release(platform::CPUPlace()); + + size_t size = (1 << 20); + auto allocation = alloc.Allocate(size); + alloc.Release(platform::CPUPlace()); +} + +#ifdef PADDLE_WITH_CUDA +TEST(NaiveBestFitAllocatorTest, GpuAlloc) { + NaiveBestFitAllocator alloc{platform::CUDAPlace(0)}; + { + size_t size = (1 << 20); + auto allocation = alloc.Allocate(size); + } + alloc.Release(platform::CUDAPlace(0)); + + size_t size = (1 << 20); + auto allocation = alloc.Allocate(size); + alloc.Release(platform::CUDAPlace(0)); +} + +TEST(NaiveBestFitAllocatorTest, CudaPinnedAlloc) { + NaiveBestFitAllocator alloc{platform::CUDAPinnedPlace()}; + { + size_t size = (1 << 20); + auto allocation = alloc.Allocate(size); + } + alloc.Release(platform::CUDAPinnedPlace()); + + size_t size = (1 << 20); + auto allocation = alloc.Allocate(size); + alloc.Release(platform::CUDAPinnedPlace()); +} +#endif + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/retry_allocator.h b/paddle/fluid/memory/allocation/retry_allocator.h index 4a787ff2d7b3848207449a1b7c04da0bd9884ea6..74828a0ede3f4318e8fe336ad0f189e3d58725f2 100644 --- a/paddle/fluid/memory/allocation/retry_allocator.h +++ b/paddle/fluid/memory/allocation/retry_allocator.h @@ -47,6 +47,9 @@ class RetryAllocator : public Allocator { protected: void FreeImpl(Allocation* allocation) override; Allocation* AllocateImpl(size_t size) override; + void ReleaseImpl(const platform::Place& place) override { + underlying_allocator_->Release(place); + } 
private: std::shared_ptr underlying_allocator_; diff --git a/paddle/fluid/memory/allocation/retry_allocator_test.cc b/paddle/fluid/memory/allocation/retry_allocator_test.cc index b80e48460bf9f537667652d4937fca7bead1fe51..13b77c660ca8f54a9e1b7befcef40e9a76c0833f 100644 --- a/paddle/fluid/memory/allocation/retry_allocator_test.cc +++ b/paddle/fluid/memory/allocation/retry_allocator_test.cc @@ -96,6 +96,7 @@ TEST(RetryAllocator, RetryAllocator) { bool is_all_equal = std::all_of(addresses.begin(), addresses.end(), [val](void *p) { return p == val; }); ASSERT_TRUE(is_all_equal); + allocator->Release(platform::CPUPlace()); } } @@ -135,6 +136,7 @@ TEST(RetryAllocator, RetryAllocatorLastAllocFailure) { auto allocation = allocator.Allocate(allocate_size); ASSERT_TRUE(false); allocation.reset(); + allocator.Release(p); } catch (BadAlloc &ex) { ASSERT_TRUE(std::string(ex.what()).find("Cannot allocate") != std::string::npos); diff --git a/paddle/fluid/memory/allocation/thread_local_allocator.cc b/paddle/fluid/memory/allocation/thread_local_allocator.cc index 50fe9c9b7524945117abd8441f1f53f6e9ce1328..d2a8250d3db58cce463fac15587a1bef99b274d6 100644 --- a/paddle/fluid/memory/allocation/thread_local_allocator.cc +++ b/paddle/fluid/memory/allocation/thread_local_allocator.cc @@ -72,6 +72,8 @@ void ThreadLocalAllocatorImpl::FreeImpl(ThreadLocalAllocation* allocation) { delete allocation; } +void ThreadLocalAllocatorImpl::ReleaseImpl() { buddy_allocator_->Release(); } + } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/thread_local_allocator.h b/paddle/fluid/memory/allocation/thread_local_allocator.h index 10ca4b828a4bb508ed91d15f2649c3d0d5e1da9a..764509e75ba23a76a6d7c186f4a3daaa75302c8f 100644 --- a/paddle/fluid/memory/allocation/thread_local_allocator.h +++ b/paddle/fluid/memory/allocation/thread_local_allocator.h @@ -52,6 +52,7 @@ class ThreadLocalAllocatorImpl explicit ThreadLocalAllocatorImpl(const platform::Place& 
p); ThreadLocalAllocation* AllocateImpl(size_t size); void FreeImpl(ThreadLocalAllocation* allocation); + void ReleaseImpl(); private: std::unique_ptr buddy_allocator_; @@ -91,6 +92,9 @@ class ThreadLocalCUDAAllocator : public Allocator { auto allocator_impl = tl_allocation->GetAllocator(); allocator_impl->FreeImpl(tl_allocation); } + void ReleaseImpl(const platform::Place& p) override { + return ThreadLocalCUDAAllocatorPool::Instance().Get(gpu_id_)->ReleaseImpl(); + } private: int gpu_id_; diff --git a/paddle/fluid/memory/allocation/thread_local_allocator_test.cc b/paddle/fluid/memory/allocation/thread_local_allocator_test.cc index f9e2ea8c27a74c29e7b9bbea3ab30eadbfe48b3d..70fd3a48d7861ef6eb7ad8b8881fd5d22d5ab15b 100644 --- a/paddle/fluid/memory/allocation/thread_local_allocator_test.cc +++ b/paddle/fluid/memory/allocation/thread_local_allocator_test.cc @@ -62,6 +62,7 @@ TEST(ThreadLocalAllocator, cross_scope_release) { auto tl_allocator_impl = ThreadLocalCUDAAllocatorPool::Instance().Get(devices[j]); allocator_addresses[j][i] = tl_allocator_impl.get(); + memory::Release(platform::CUDAPlace(devices[j])); } }); } diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc index 6ac99744d79380803925f973c5b39262685e1ff0..e7738d07147510f5f1895559ce7b11dd8b3fd69c 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.cc +++ b/paddle/fluid/memory/detail/buddy_allocator.cc @@ -39,9 +39,10 @@ BuddyAllocator::~BuddyAllocator() { while (!pool_.empty()) { auto block = static_cast(std::get<2>(*pool_.begin())); auto desc = cache_.LoadDesc(block); - VLOG(10) << "Free from block (" << block << ", " << desc->get_size() << ")"; + VLOG(10) << "Free from block (" << block << ", " << desc->get_total_size() + << ")"; - system_allocator_->Free(block, desc->get_size(), desc->get_index()); + system_allocator_->Free(block, desc->get_total_size(), desc->get_index()); cache_.Invalidate(block); pool_.erase(pool_.begin()); } @@ -161,6 +162,39 @@ 
void BuddyAllocator::Free(void* p) {
       IndexSizeAddress(desc->get_index(), desc->get_total_size(), block));
 }
 
+void BuddyAllocator::Release() {
+  // Give every pool block that is a whole, unsplit chunk back to the system.
+  std::lock_guard<std::mutex> lock(mutex_);
+  int num = 0;
+  uint64_t bytes = 0;
+  for (auto iter = pool_.begin(); iter != pool_.end();) {
+    auto remain_size = std::get<1>(*iter);
+    auto remain_ptr = std::get<2>(*iter);
+    // Reset per entry; a stale true would drop unreleased blocks from pool_.
+    bool del_flag = false;
+    for (auto chunk = chunks_.begin(); chunk != chunks_.end(); ++chunk) {
+      if (std::get<1>(*chunk) == remain_size &&
+          std::get<2>(*chunk) == remain_ptr) {
+        ++num;
+        bytes += remain_size;
+        total_free_ -= remain_size;
+        system_allocator_->Free(remain_ptr, remain_size, std::get<0>(*chunk));
+        cache_.Invalidate(static_cast<MemoryBlock*>(remain_ptr));
+        // Drop the record: this memory no longer belongs to the allocator.
+        chunks_.erase(chunk);
+        del_flag = true;
+        break;
+      }
+    }
+    if (del_flag) {
+      iter = pool_.erase(iter);
+    } else {
+      ++iter;
+    }
+  }
+  VLOG(10) << "Release " << num << " chunk, Free " << bytes << " bytes.";
+}
+
 size_t BuddyAllocator::Used() { return total_used_; }
 size_t BuddyAllocator::GetMinChunkSize() { return min_chunk_size_; }
 size_t BuddyAllocator::GetMaxChunkSize() { return max_chunk_size_; }
@@ -213,6 +247,9 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool(
 
   total_free_ += allocate_bytes;
 
+  // record the chunk.
+  chunks_.insert(IndexSizeAddress(index, allocate_bytes, p));
+
   // dump the block into pool
   return pool_.insert(IndexSizeAddress(index, allocate_bytes, p)).first;
 }
diff --git a/paddle/fluid/memory/detail/buddy_allocator.h b/paddle/fluid/memory/detail/buddy_allocator.h
index 791f8b56277723c59ea47e60c0d8d9eec9745fc4..0bfc8918503b9e210f00774c665e54a104779fcf 100644
--- a/paddle/fluid/memory/detail/buddy_allocator.h
+++ b/paddle/fluid/memory/detail/buddy_allocator.h
@@ -40,6 +40,8 @@ class BuddyAllocator {
  public:
   void* Alloc(size_t unaligned_size);
   void Free(void* ptr);
+  // Release the unused memory pool, a real free operation for the OS.
+ void Release(); size_t Used(); size_t GetMinChunkSize(); size_t GetMaxChunkSize(); @@ -92,6 +94,11 @@ class BuddyAllocator { */ PoolSet pool_; + /** + * \brief Record the allocated chunks when Refill pool. + */ + PoolSet chunks_; + private: /*! Unify the metadata format between GPU and CPU allocations */ MetadataCache cache_; diff --git a/paddle/fluid/memory/detail/buddy_allocator_test.cc b/paddle/fluid/memory/detail/buddy_allocator_test.cc index 1722acd10aa38e33f3c11aa8eac7cb50dce9fed4..90f7e33eb3540f6272df80296bba57c3d7d9b596 100644 --- a/paddle/fluid/memory/detail/buddy_allocator_test.cc +++ b/paddle/fluid/memory/detail/buddy_allocator_test.cc @@ -305,6 +305,23 @@ TEST(BuddyAllocator, SpeedAna) { std::cerr << "time cost " << diff.count() << std::endl; } +TEST(BuddyAllocator, Release) { + // In a 8 GB machine, the pool size will be about 800 MB + FLAGS_fraction_of_gpu_memory_to_use = 0.1; + FLAGS_initial_gpu_memory_in_mb = 0; + FLAGS_reallocate_gpu_memory_in_mb = 0; + + BuddyAllocator buddy_allocator( + std::unique_ptr(new GPUAllocator(TEST_GPU_ID)), + platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()); + + // Less than pool size + TestBuddyAllocator(&buddy_allocator, 10); + TestBuddyAllocator(&buddy_allocator, 10 << 10); + TestBuddyAllocator(&buddy_allocator, 50 << 20); + + buddy_allocator.Release(); +} #endif } // namespace detail diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc index e01f030585a8330a2e9bcc2bc2a662f00f5cde1c..2fbde03b42bcc025312cc5980afa35b7e320236f 100644 --- a/paddle/fluid/memory/malloc.cc +++ b/paddle/fluid/memory/malloc.cc @@ -31,5 +31,9 @@ AllocationPtr Alloc(const platform::Place &place, size_t size) { return allocation::AllocatorFacade::Instance().Alloc(place, size); } +void Release(const platform::Place &place) { + return allocation::AllocatorFacade::Instance().Release(place); +} + } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h index 
73487795f752eab69e660154c2e35817b2c80368..3d6836e1d255b4de99672bec81e1ed226c3a9d14 100644 --- a/paddle/fluid/memory/malloc.h +++ b/paddle/fluid/memory/malloc.h @@ -38,5 +38,7 @@ extern AllocationPtr Alloc(const platform::Place& place, size_t size); extern AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size); +extern void Release(const platform::Place& place); + } // namespace memory } // namespace paddle