From 05114693cfb13eec3efb7cba0b9a52b411300126 Mon Sep 17 00:00:00 2001 From: Wilber Date: Wed, 4 Nov 2020 00:29:15 -0600 Subject: [PATCH] [Inference] Memory modification for ShrinkMemory. (#28355) --- paddle/fluid/memory/allocation/CMakeLists.txt | 1 + paddle/fluid/memory/allocation/allocator.h | 3 + .../memory/allocation/allocator_facade.cc | 5 ++ .../memory/allocation/allocator_facade.h | 3 + .../auto_growth_best_fit_allocator.h | 3 + .../auto_growth_best_fit_allocator_test.cc | 1 + .../allocation/naive_best_fit_allocator.cc | 51 +++++++++++++ .../allocation/naive_best_fit_allocator.h | 1 + .../naive_best_fit_allocator_test.cc | 74 +++++++++++++++++++ .../fluid/memory/allocation/retry_allocator.h | 3 + .../memory/allocation/retry_allocator_test.cc | 2 + .../allocation/thread_local_allocator.cc | 2 + .../allocation/thread_local_allocator.h | 4 + .../allocation/thread_local_allocator_test.cc | 1 + paddle/fluid/memory/detail/buddy_allocator.cc | 41 +++++++++- paddle/fluid/memory/detail/buddy_allocator.h | 7 ++ .../memory/detail/buddy_allocator_test.cc | 17 +++++ paddle/fluid/memory/malloc.cc | 4 + paddle/fluid/memory/malloc.h | 2 + 19 files changed, 223 insertions(+), 2 deletions(-) create mode 100644 paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 9cc7c26745..8a1a1115ad 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -4,6 +4,7 @@ cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator) cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator) cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator) cc_library(naive_best_fit_allocator SRCS naive_best_fit_allocator.cc DEPS allocator buddy_allocator profiler) +cc_test(naive_best_fit_allocator_test SRCS naive_best_fit_allocator_test.cc DEPS naive_best_fit_allocator) cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS locked_allocator buffered_allocator cpu_allocator best_fit_allocator) if (WITH_MKLDNN) diff --git a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h index e54748a536..b83d3efb72 100644 --- a/paddle/fluid/memory/allocation/allocator.h +++ b/paddle/fluid/memory/allocation/allocator.h @@ -178,12 +178,15 @@ class Allocator { FreeImpl(allocation); } + inline void Release(const platform::Place& place) { ReleaseImpl(place); } + // True if the `Allocate` is thread safe. 
virtual bool IsAllocThreadSafe() const; protected: virtual Allocation* AllocateImpl(size_t size) = 0; virtual void FreeImpl(Allocation* allocation); + virtual void ReleaseImpl(const platform::Place& place) {} }; using AllocationDeleter = Allocator::AllocationDeleter; diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 3213684c14..59b06d0828 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -287,6 +287,11 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, return m_->GetAllocator(place, size)->Allocate(size); } +void AllocatorFacade::Release(const platform::Place& place) { + m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1) + ->Release(place); +} + } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index 64b6fe25c3..2f2f222f6c 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -44,6 +44,9 @@ class AllocatorFacade { // Allocate a unique allocation. AllocationPtr Alloc(const platform::Place& place, size_t size); + // Release unused memory pool. + void Release(const platform::Place& place); + // TODO(yy): Allocate a Copy-On-Write allocation? private: AllocatorFacade(); diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h index cbc126264a..b55ebf1893 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h @@ -39,6 +39,9 @@ class AutoGrowthBestFitAllocator : public Allocator { void FreeImpl(Allocation *allocation) override; + // Release the memory block which is not used in pool. 
+ void ReleaseImpl(const platform::Place &place) override { FreeIdleChunks(); } + private: void FreeIdleChunks(); diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc index 685248a88f..dbe2f0ac94 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc @@ -65,6 +65,7 @@ static void TestFreeIdleChunk(bool free_idle_chunk, } else { ASSERT_EQ(recorded_allocator->AllocatedSize(), memory_size + alignment); } + ag_allocator->Release(platform::CPUPlace()); } } diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index c661c9f9c3..842ebd16cf 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -53,6 +53,9 @@ void *Alloc(const Place &place, size_t size); template <typename Place> void Free(const Place &place, void *p, size_t size); +template <typename Place> +void Release(const Place &place); + template <typename Place> size_t Used(const Place &place); @@ -99,6 +102,11 @@ void Free<platform::CPUPlace>(const platform::CPUPlace &place, void *p, GetCPUBuddyAllocator()->Free(p); } +template <> +void Release<platform::CPUPlace>(const platform::CPUPlace &place) { + GetCPUBuddyAllocator()->Release(); +} + template <> size_t Used<platform::CPUPlace>(const platform::CPUPlace &place) { return GetCPUBuddyAllocator()->Used(); @@ -186,6 +194,17 @@ void Free<platform::XPUPlace>(const platform::XPUPlace &place, void *p, #endif } +template <> +void Release<platform::XPUPlace>(const platform::XPUPlace &place) { +#ifdef PADDLE_WITH_XPU + PADDLE_THROW( + platform::errors::PermissionDenied("Release XPU pool is not supported.")); +#else + PADDLE_THROW( + platform::errors::PermissionDenied("'XPUPlace' is not supported.")); +#endif +} + template <> size_t Used<platform::XPUPlace>(const platform::XPUPlace &place) { #ifdef PADDLE_WITH_XPU @@ -313,6 +332,16 @@ void Free<platform::CUDAPlace>(const platform::CUDAPlace &place, void *p, #endif } +template <> +void Release<platform::CUDAPlace>(const platform::CUDAPlace &place) { +#ifdef PADDLE_WITH_CUDA + GetGPUBuddyAllocator(place.device)->Release(); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'CUDAPlace' is not supported in CPU only device.")); +#endif +} + #ifdef PADDLE_WITH_CUDA BuddyAllocator *GetCUDAPinnedBuddyAllocator() { static std::once_flag init_flag; @@ -371,6 +400,17 @@ void Free<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place, #endif } +template <> +void Release<platform::CUDAPinnedPlace>( + const platform::CUDAPinnedPlace &place) { +#ifdef PADDLE_WITH_CUDA + GetCUDAPinnedBuddyAllocator()->Release(); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'CUDAPinnedPlace' is not supported in CPU only device.")); +#endif +} + struct AllocVisitor : public boost::static_visitor<void *> { inline explicit AllocVisitor(size_t size) : size_(size) {} @@ -397,6 +437,13 @@ struct FreeVisitor : public boost::static_visitor<void> { size_t size_; }; +struct ReleaseVisitor : public boost::static_visitor<void> { + template <typename Place> + inline void operator()(const Place &place) const { + Release<Place>(place); + } +}; + size_t Usage::operator()(const platform::CPUPlace &cpu) const { return Used(cpu); } @@ -439,6 +486,10 @@ void NaiveBestFitAllocator::FreeImpl(Allocation *allocation) { delete allocation; } +void NaiveBestFitAllocator::ReleaseImpl(const platform::Place &place) { + boost::apply_visitor(legacy::ReleaseVisitor(), place); +} + } // namespace allocation } // namespace memory } // namespace paddle diff --git
a/paddle/fluid/memory/allocation/naive_best_fit_allocator.h b/paddle/fluid/memory/allocation/naive_best_fit_allocator.h index 4cf1bd6123..ba4c4ca226 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.h @@ -35,6 +35,7 @@ class NaiveBestFitAllocator : public Allocator { protected: Allocation *AllocateImpl(size_t size) override; void FreeImpl(Allocation *allocation) override; + void ReleaseImpl(const platform::Place &place) override; private: platform::Place place_; diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc new file mode 100644 index 0000000000..054c75b11f --- /dev/null +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc @@ -0,0 +1,74 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/memory/allocation/naive_best_fit_allocator.h" + +#include +#include // NOLINT +#include // NOLINT +#include // NOLINT +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +namespace paddle { +namespace memory { +namespace allocation { + +TEST(NaiveBestFitAllocatorTest, CpuAlloc) { + NaiveBestFitAllocator alloc{platform::CPUPlace()}; + { + size_t size = (1 << 20); + auto allocation = alloc.Allocate(size); + } + alloc.Release(platform::CPUPlace()); + + size_t size = (1 << 20); + auto allocation = alloc.Allocate(size); + alloc.Release(platform::CPUPlace()); +} + +#ifdef PADDLE_WITH_CUDA +TEST(NaiveBestFitAllocatorTest, GpuAlloc) { + NaiveBestFitAllocator alloc{platform::CUDAPlace(0)}; + { + size_t size = (1 << 20); + auto allocation = alloc.Allocate(size); + } + alloc.Release(platform::CUDAPlace(0)); + + size_t size = (1 << 20); + auto allocation = alloc.Allocate(size); + alloc.Release(platform::CUDAPlace(0)); + } + +TEST(NaiveBestFitAllocatorTest, CudaPinnedAlloc) { + NaiveBestFitAllocator alloc{platform::CUDAPinnedPlace()}; + { + size_t size = (1 << 20); + auto allocation = alloc.Allocate(size); + } + alloc.Release(platform::CUDAPinnedPlace()); + + size_t size = (1 << 20); + auto allocation = alloc.Allocate(size); + alloc.Release(platform::CUDAPinnedPlace()); +} +#endif + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/retry_allocator.h b/paddle/fluid/memory/allocation/retry_allocator.h index 4a787ff2d7..74828a0ede 100644 --- a/paddle/fluid/memory/allocation/retry_allocator.h +++ b/paddle/fluid/memory/allocation/retry_allocator.h @@ -47,6 +47,9 @@ class RetryAllocator : public Allocator { protected: void FreeImpl(Allocation* allocation) override; Allocation* AllocateImpl(size_t size) override; + void ReleaseImpl(const platform::Place& place) override { + underlying_allocator_->Release(place); + } private: std::shared_ptr<Allocator> underlying_allocator_; diff --git a/paddle/fluid/memory/allocation/retry_allocator_test.cc
b/paddle/fluid/memory/allocation/retry_allocator_test.cc index b80e48460b..13b77c660c 100644 --- a/paddle/fluid/memory/allocation/retry_allocator_test.cc +++ b/paddle/fluid/memory/allocation/retry_allocator_test.cc @@ -96,6 +96,7 @@ TEST(RetryAllocator, RetryAllocator) { bool is_all_equal = std::all_of(addresses.begin(), addresses.end(), [val](void *p) { return p == val; }); ASSERT_TRUE(is_all_equal); + allocator->Release(platform::CPUPlace()); } } @@ -135,6 +136,7 @@ TEST(RetryAllocator, RetryAllocatorLastAllocFailure) { auto allocation = allocator.Allocate(allocate_size); ASSERT_TRUE(false); allocation.reset(); + allocator.Release(p); } catch (BadAlloc &ex) { ASSERT_TRUE(std::string(ex.what()).find("Cannot allocate") != std::string::npos); diff --git a/paddle/fluid/memory/allocation/thread_local_allocator.cc b/paddle/fluid/memory/allocation/thread_local_allocator.cc index 50fe9c9b75..d2a8250d3d 100644 --- a/paddle/fluid/memory/allocation/thread_local_allocator.cc +++ b/paddle/fluid/memory/allocation/thread_local_allocator.cc @@ -72,6 +72,8 @@ void ThreadLocalAllocatorImpl::FreeImpl(ThreadLocalAllocation* allocation) { delete allocation; } +void ThreadLocalAllocatorImpl::ReleaseImpl() { buddy_allocator_->Release(); } + } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/thread_local_allocator.h b/paddle/fluid/memory/allocation/thread_local_allocator.h index 10ca4b828a..764509e75b 100644 --- a/paddle/fluid/memory/allocation/thread_local_allocator.h +++ b/paddle/fluid/memory/allocation/thread_local_allocator.h @@ -52,6 +52,7 @@ class ThreadLocalAllocatorImpl explicit ThreadLocalAllocatorImpl(const platform::Place& p); ThreadLocalAllocation* AllocateImpl(size_t size); void FreeImpl(ThreadLocalAllocation* allocation); + void ReleaseImpl(); private: std::unique_ptr<memory::detail::BuddyAllocator> buddy_allocator_; @@ -91,6 +92,9 @@ class ThreadLocalCUDAAllocator : public Allocator { auto allocator_impl = tl_allocation->GetAllocator(); allocator_impl->FreeImpl(tl_allocation); } + void ReleaseImpl(const platform::Place& p) override { + return ThreadLocalCUDAAllocatorPool::Instance().Get(gpu_id_)->ReleaseImpl(); + } private: int gpu_id_; diff --git a/paddle/fluid/memory/allocation/thread_local_allocator_test.cc b/paddle/fluid/memory/allocation/thread_local_allocator_test.cc index f9e2ea8c27..70fd3a48d7 100644 --- a/paddle/fluid/memory/allocation/thread_local_allocator_test.cc +++ b/paddle/fluid/memory/allocation/thread_local_allocator_test.cc @@ -62,6 +62,7 @@ TEST(ThreadLocalAllocator, cross_scope_release) { auto tl_allocator_impl = ThreadLocalCUDAAllocatorPool::Instance().Get(devices[j]); allocator_addresses[j][i] = tl_allocator_impl.get(); + memory::Release(platform::CUDAPlace(devices[j])); } }); } diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc index 6ac99744d7..e7738d0714 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.cc +++ b/paddle/fluid/memory/detail/buddy_allocator.cc @@ -39,9 +39,10 @@ BuddyAllocator::~BuddyAllocator() { while (!pool_.empty()) { auto block = static_cast<MemoryBlock*>(std::get<2>(*pool_.begin())); auto desc = cache_.LoadDesc(block); - VLOG(10) << "Free from block (" << block << ", " << desc->get_size() << ")"; + VLOG(10) << "Free from block (" << block << ", " << desc->get_total_size() + << ")"; - system_allocator_->Free(block, desc->get_size(), desc->get_index()); + system_allocator_->Free(block, desc->get_total_size(), desc->get_index()); cache_.Invalidate(block);
pool_.erase(pool_.begin()); } @@ -161,6 +162,39 @@ void BuddyAllocator::Free(void* p) { IndexSizeAddress(desc->get_index(), desc->get_total_size(), block)); } +void BuddyAllocator::Release() { + std::lock_guard<std::mutex> lock(mutex_); + int num = 0; + uint64_t bytes = 0; + bool del_flag = false; + for (auto iter = pool_.begin(); iter != pool_.end();) { + auto remain_size = std::get<1>(*iter); + auto remain_ptr = std::get<2>(*iter); + for (auto& chunk : chunks_) { + auto init_size = std::get<1>(chunk); + auto init_ptr = std::get<2>(chunk); + + if (init_size == remain_size && init_ptr == remain_ptr) { + ++num; + bytes += init_size; + total_free_ -= init_size; + auto block = static_cast<MemoryBlock*>(std::get<2>(chunk)); + system_allocator_->Free(init_ptr, init_size, std::get<0>(chunk)); + cache_.Invalidate(block); + del_flag = true; + break; + } + } + + if (del_flag) { + iter = pool_.erase(iter); + } else { + iter++; + } + } + VLOG(10) << "Release " << num << " chunk, Free " << bytes << " bytes."; +} + size_t BuddyAllocator::Used() { return total_used_; } size_t BuddyAllocator::GetMinChunkSize() { return min_chunk_size_; } size_t BuddyAllocator::GetMaxChunkSize() { return max_chunk_size_; } @@ -213,6 +247,9 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool( total_free_ += allocate_bytes; + // record the chunk. + chunks_.insert(IndexSizeAddress(index, allocate_bytes, p)); + // dump the block into pool return pool_.insert(IndexSizeAddress(index, allocate_bytes, p)).first; } diff --git a/paddle/fluid/memory/detail/buddy_allocator.h b/paddle/fluid/memory/detail/buddy_allocator.h index 791f8b5627..0bfc891850 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.h +++ b/paddle/fluid/memory/detail/buddy_allocator.h @@ -40,6 +40,8 @@ class BuddyAllocator { public: void* Alloc(size_t unaligned_size); void Free(void* ptr); + // Release the unused memory pool, a real free operation for the OS. + void Release(); size_t Used(); size_t GetMinChunkSize(); size_t GetMaxChunkSize(); @@ -92,6 +94,11 @@ */ PoolSet pool_; + /** + * \brief Record the allocated chunks when Refill pool. + */ + PoolSet chunks_; + private: /*!
Unify the metadata format between GPU and CPU allocations */ MetadataCache cache_; diff --git a/paddle/fluid/memory/detail/buddy_allocator_test.cc b/paddle/fluid/memory/detail/buddy_allocator_test.cc index 1722acd10a..90f7e33eb3 100644 --- a/paddle/fluid/memory/detail/buddy_allocator_test.cc +++ b/paddle/fluid/memory/detail/buddy_allocator_test.cc @@ -305,6 +305,23 @@ TEST(BuddyAllocator, SpeedAna) { std::cerr << "time cost " << diff.count() << std::endl; } +TEST(BuddyAllocator, Release) { + // In an 8 GB machine, the pool size will be about 800 MB + FLAGS_fraction_of_gpu_memory_to_use = 0.1; + FLAGS_initial_gpu_memory_in_mb = 0; + FLAGS_reallocate_gpu_memory_in_mb = 0; + + BuddyAllocator buddy_allocator( + std::unique_ptr<SystemAllocator>(new GPUAllocator(TEST_GPU_ID)), + platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()); + + // Less than pool size + TestBuddyAllocator(&buddy_allocator, 10); + TestBuddyAllocator(&buddy_allocator, 10 << 10); + TestBuddyAllocator(&buddy_allocator, 50 << 20); + + buddy_allocator.Release(); +} #endif } // namespace detail diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc index e01f030585..2fbde03b42 100644 --- a/paddle/fluid/memory/malloc.cc +++ b/paddle/fluid/memory/malloc.cc @@ -31,5 +31,9 @@ AllocationPtr Alloc(const platform::Place &place, size_t size) { return allocation::AllocatorFacade::Instance().Alloc(place, size); } +void Release(const platform::Place &place) { + return allocation::AllocatorFacade::Instance().Release(place); +} + } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h index 73487795f7..3d6836e1d2 100644 --- a/paddle/fluid/memory/malloc.h +++ b/paddle/fluid/memory/malloc.h @@ -38,5 +38,7 @@ extern AllocationPtr Alloc(const platform::Place& place, size_t size); extern AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size); +extern void Release(const platform::Place& place); + } // namespace memory } // namespace paddle -- GitLab
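Usage note (not part of the patch above): the sketch below illustrates how the new memory::Release() entry point declared in malloc.h is expected to be used, for example to shrink the memory pool between inference runs once temporary allocations have gone out of scope. Only memory::Alloc and memory::Release come from this patch; the helper name ShrinkCpuPoolAfterWarmup and the platform/place.h include are illustrative assumptions about the surrounding Paddle source tree.

// A minimal sketch, assuming the Paddle fluid headers are available in the build.
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/place.h"

void ShrinkCpuPoolAfterWarmup() {  // hypothetical helper, for illustration only
  paddle::platform::CPUPlace place;
  {
    // A 1 MB request; the underlying pool may reserve a much larger chunk.
    auto allocation = paddle::memory::Alloc(place, 1 << 20);
  }  // The allocation is freed here, but its chunk stays cached in the pool.

  // Return the now-idle chunks to the system allocator; later Alloc() calls
  // simply refill the pool, so this is safe to call between runs.
  paddle::memory::Release(place);
}

The same pattern appears in the new naive_best_fit_allocator_test.cc above, where Release() is called both after an allocation has been freed and while one is still live.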