机器未来 / Paddle (forked from PaddlePaddle / Paddle)
Unverified commit 05114693
Authored by Wilber on Nov 04, 2020; committed via GitHub on Nov 04, 2020
[Inference] Memory modification for ShrinkMemory. (#28355)
Parent: 95b18683

Showing 19 changed files with 223 additions and 2 deletions (+223 -2)
paddle/fluid/memory/allocation/CMakeLists.txt                          +1   -0
paddle/fluid/memory/allocation/allocator.h                             +3   -0
paddle/fluid/memory/allocation/allocator_facade.cc                     +5   -0
paddle/fluid/memory/allocation/allocator_facade.h                      +3   -0
paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h        +3   -0
paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc  +1   -0
paddle/fluid/memory/allocation/naive_best_fit_allocator.cc             +51  -0
paddle/fluid/memory/allocation/naive_best_fit_allocator.h              +1   -0
paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc        +74  -0
paddle/fluid/memory/allocation/retry_allocator.h                       +3   -0
paddle/fluid/memory/allocation/retry_allocator_test.cc                 +2   -0
paddle/fluid/memory/allocation/thread_local_allocator.cc               +2   -0
paddle/fluid/memory/allocation/thread_local_allocator.h                +4   -0
paddle/fluid/memory/allocation/thread_local_allocator_test.cc          +1   -0
paddle/fluid/memory/detail/buddy_allocator.cc                          +39  -2
paddle/fluid/memory/detail/buddy_allocator.h                           +7   -0
paddle/fluid/memory/detail/buddy_allocator_test.cc                     +17  -0
paddle/fluid/memory/malloc.cc                                          +4   -0
paddle/fluid/memory/malloc.h                                           +2   -0
paddle/fluid/memory/allocation/CMakeLists.txt
@@ -4,6 +4,7 @@ cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator)
 cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator)
 cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator)
 cc_library(naive_best_fit_allocator SRCS naive_best_fit_allocator.cc DEPS allocator buddy_allocator profiler)
+cc_test(naive_best_fit_allocator_test SRCS naive_best_fit_allocator_test.cc DEPS naive_best_fit_allocator)
 cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS locked_allocator buffered_allocator cpu_allocator best_fit_allocator)
 if (WITH_MKLDNN)
paddle/fluid/memory/allocation/allocator.h
@@ -178,12 +178,15 @@ class Allocator {
     FreeImpl(allocation);
   }
 
+  inline void Release(const platform::Place& place) { ReleaseImpl(place); }
+
   // True if the `Allocate` is thread safe.
   virtual bool IsAllocThreadSafe() const;
 
  protected:
   virtual Allocation* AllocateImpl(size_t size) = 0;
   virtual void FreeImpl(Allocation* allocation);
+  virtual void ReleaseImpl(const platform::Place& place) {}
 };
 
 using AllocationDeleter = Allocator::AllocationDeleter;
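The pair Release()/ReleaseImpl() is the extension point of this commit: the base class forwards the public call to the virtual hook, and the default no-op keeps existing allocators unchanged. As a hedged illustration only (not code from this commit; the class name and delegation are assumptions), a wrapper allocator would pick up the hook by forwarding it, the same pattern RetryAllocator adopts later in this diff:

// Hypothetical sketch: an allocator that decorates another allocator and
// forwards the new ReleaseImpl hook to it.
class ForwardingAllocator : public Allocator {
 public:
  explicit ForwardingAllocator(std::shared_ptr<Allocator> underlying)
      : underlying_(std::move(underlying)) {}

 protected:
  Allocation* AllocateImpl(size_t size) override;  // delegate to underlying_
  void FreeImpl(Allocation* allocation) override;  // delegate to underlying_
  void ReleaseImpl(const platform::Place& place) override {
    // Hand the idle memory of the wrapped pool back to the system.
    underlying_->Release(place);
  }

 private:
  std::shared_ptr<Allocator> underlying_;
};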
paddle/fluid/memory/allocation/allocator_facade.cc
@@ -287,6 +287,11 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
   return m_->GetAllocator(place, size)->Allocate(size);
 }
 
+void AllocatorFacade::Release(const platform::Place& place) {
+  m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1)
+      ->Release(place);
+}
+
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
paddle/fluid/memory/allocation/allocator_facade.h
@@ -44,6 +44,9 @@ class AllocatorFacade {
   // Allocate a unique allocation.
   AllocationPtr Alloc(const platform::Place& place, size_t size);
 
+  // Release unused memory pool.
+  void Release(const platform::Place& place);
+
   // TODO(yy): Allocate a Copy-On-Write allocation?
 
  private:
   AllocatorFacade();
paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h
@@ -39,6 +39,9 @@ class AutoGrowthBestFitAllocator : public Allocator {
   void FreeImpl(Allocation *allocation) override;
 
+  // Release the memory block which is not used in pool.
+  void ReleaseImpl(const platform::Place &place) override { FreeIdleChunks(); }
+
  private:
   void FreeIdleChunks();
paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc
@@ -65,6 +65,7 @@ static void TestFreeIdleChunk(bool free_idle_chunk,
   } else {
     ASSERT_EQ(recorded_allocator->AllocatedSize(), memory_size + alignment);
   }
+  ag_allocator->Release(platform::CPUPlace());
 }
 }
paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
@@ -53,6 +53,9 @@ void *Alloc(const Place &place, size_t size);
 template <typename Place>
 void Free(const Place &place, void *p, size_t size);
 
+template <typename Place>
+void Release(const Place &place);
+
 template <typename Place>
 size_t Used(const Place &place);
 
@@ -99,6 +102,11 @@ void Free<platform::CPUPlace>(const platform::CPUPlace &place, void *p,
   GetCPUBuddyAllocator()->Free(p);
 }
 
+template <>
+void Release<platform::CPUPlace>(const platform::CPUPlace &place) {
+  GetCPUBuddyAllocator()->Release();
+}
+
 template <>
 size_t Used<platform::CPUPlace>(const platform::CPUPlace &place) {
   return GetCPUBuddyAllocator()->Used();
 
@@ -186,6 +194,17 @@ void Free<platform::XPUPlace>(const platform::XPUPlace &place, void *p,
 #endif
 }
 
+template <>
+void Release<platform::XPUPlace>(const platform::XPUPlace &place) {
+#ifdef PADDLE_WITH_XPU
+  PADDLE_THROW(
+      platform::errors::PermissionDenied("Release XPU pool is not supported."));
+#else
+  PADDLE_THROW(
+      platform::errors::PermissionDenied("'XPUPlace' is not supported."));
+#endif
+}
+
 template <>
 size_t Used<platform::XPUPlace>(const platform::XPUPlace &place) {
 #ifdef PADDLE_WITH_XPU
 
@@ -313,6 +332,16 @@ void Free<platform::CUDAPlace>(const platform::CUDAPlace &place, void *p,
 #endif
 }
 
+template <>
+void Release<platform::CUDAPlace>(const platform::CUDAPlace &place) {
+#ifdef PADDLE_WITH_CUDA
+  GetGPUBuddyAllocator(place.device)->Release();
+#else
+  PADDLE_THROW(platform::errors::PermissionDenied(
+      "'CUDAPlace' is not supported in CPU only device."));
+#endif
+}
+
 #ifdef PADDLE_WITH_CUDA
 BuddyAllocator *GetCUDAPinnedBuddyAllocator() {
   static std::once_flag init_flag;
 
@@ -371,6 +400,17 @@ void Free<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place,
 #endif
 }
 
+template <>
+void Release<platform::CUDAPinnedPlace>(
+    const platform::CUDAPinnedPlace &place) {
+#ifdef PADDLE_WITH_CUDA
+  GetCUDAPinnedBuddyAllocator()->Release();
+#else
+  PADDLE_THROW(platform::errors::PermissionDenied(
+      "'CUDAPinnedPlace' is not supported in CPU only device."));
+#endif
+}
+
 struct AllocVisitor : public boost::static_visitor<void *> {
   inline explicit AllocVisitor(size_t size) : size_(size) {}
 
@@ -397,6 +437,13 @@ struct FreeVisitor : public boost::static_visitor<void> {
   size_t size_;
 };
 
+struct ReleaseVisitor : public boost::static_visitor<void> {
+  template <typename Place>
+  inline void operator()(const Place &place) const {
+    Release<Place>(place);
+  }
+};
+
 size_t Usage::operator()(const platform::CPUPlace &cpu) const {
   return Used(cpu);
 }
 
@@ -439,6 +486,10 @@ void NaiveBestFitAllocator::FreeImpl(Allocation *allocation) {
   delete allocation;
 }
 
+void NaiveBestFitAllocator::ReleaseImpl(const platform::Place &place) {
+  boost::apply_visitor(legacy::ReleaseVisitor(), place);
+}
+
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
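The per-place Release<Place>() specializations are wired back to the place-agnostic allocator interface through a boost::static_visitor, the same pattern this file already uses for AllocVisitor and FreeVisitor. A minimal, self-contained illustration of that dispatch pattern (generic example, not Paddle code; the place types here are made up):

#include <iostream>
#include <boost/variant.hpp>

struct CpuPlace {};
struct GpuPlace { int device; };
using Place = boost::variant<CpuPlace, GpuPlace>;

// Each operator() overload plays the role of one Release<Place> specialization.
struct ReleaseVisitor : public boost::static_visitor<void> {
  void operator()(const CpuPlace &) const { std::cout << "release CPU pool\n"; }
  void operator()(const GpuPlace &p) const {
    std::cout << "release GPU pool on device " << p.device << "\n";
  }
};

int main() {
  Place place = GpuPlace{0};
  // apply_visitor picks the overload matching the type currently held in `place`.
  boost::apply_visitor(ReleaseVisitor(), place);
}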
paddle/fluid/memory/allocation/naive_best_fit_allocator.h
@@ -35,6 +35,7 @@ class NaiveBestFitAllocator : public Allocator {
  protected:
   Allocation *AllocateImpl(size_t size) override;
   void FreeImpl(Allocation *allocation) override;
+  void ReleaseImpl(const platform::Place &place) override;
 
  private:
   platform::Place place_;
paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc (new file, mode 100644)

// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/memory/allocation/naive_best_fit_allocator.h"

#include <algorithm>
#include <chrono>              // NOLINT
#include <condition_variable>  // NOLINT
#include <mutex>               // NOLINT
#include <string>
#include <thread>  // NOLINT
#include <vector>

#include "gtest/gtest.h"

namespace paddle {
namespace memory {
namespace allocation {

TEST(NaiveBestFitAllocatorTest, CpuAlloc) {
  NaiveBestFitAllocator alloc{platform::CPUPlace()};
  {
    size_t size = (1 << 20);
    auto allocation = alloc.Allocate(size);
  }
  alloc.Release(platform::CPUPlace());

  size_t size = (1 << 20);
  auto allocation = alloc.Allocate(size);
  alloc.Release(platform::CPUPlace());
}

#ifdef PADDLE_WITH_CUDA
TEST(NaiveBestFitAllocatorTest, GpuAlloc) {
  NaiveBestFitAllocator alloc{platform::CUDAPlace(0)};
  {
    size_t size = (1 << 20);
    auto allocation = alloc.Allocate(size);
  }
  alloc.Release(platform::CUDAPlace(0));

  size_t size = (1 << 20);
  auto allocation = alloc.Allocate(size);
  alloc.Release(platform::CUDAPlace(0));
}

TEST(NaiveBestFitAllocatorTest, CudaPinnedAlloc) {
  NaiveBestFitAllocator alloc{platform::CUDAPinnedPlace()};
  {
    size_t size = (1 << 20);
    auto allocation = alloc.Allocate(size);
  }
  alloc.Release(platform::CUDAPinnedPlace());

  size_t size = (1 << 20);
  auto allocation = alloc.Allocate(size);
  alloc.Release(platform::CUDAPinnedPlace());
}
#endif

}  // namespace allocation
}  // namespace memory
}  // namespace paddle
paddle/fluid/memory/allocation/retry_allocator.h
@@ -47,6 +47,9 @@ class RetryAllocator : public Allocator {
  protected:
   void FreeImpl(Allocation *allocation) override;
   Allocation *AllocateImpl(size_t size) override;
+  void ReleaseImpl(const platform::Place &place) override {
+    underlying_allocator_->Release(place);
+  }
 
  private:
   std::shared_ptr<Allocator> underlying_allocator_;
paddle/fluid/memory/allocation/retry_allocator_test.cc
@@ -96,6 +96,7 @@ TEST(RetryAllocator, RetryAllocator) {
     bool is_all_equal = std::all_of(addresses.begin(), addresses.end(),
                                     [val](void *p) { return p == val; });
     ASSERT_TRUE(is_all_equal);
+    allocator->Release(platform::CPUPlace());
   }
 }
 
@@ -135,6 +136,7 @@ TEST(RetryAllocator, RetryAllocatorLastAllocFailure) {
     auto allocation = allocator.Allocate(allocate_size);
     ASSERT_TRUE(false);
     allocation.reset();
+    allocator.Release(p);
   } catch (BadAlloc &ex) {
     ASSERT_TRUE(std::string(ex.what()).find("Cannot allocate") !=
                 std::string::npos);
paddle/fluid/memory/allocation/thread_local_allocator.cc
@@ -72,6 +72,8 @@ void ThreadLocalAllocatorImpl::FreeImpl(ThreadLocalAllocation* allocation) {
   delete allocation;
 }
 
+void ThreadLocalAllocatorImpl::ReleaseImpl() { buddy_allocator_->Release(); }
+
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
paddle/fluid/memory/allocation/thread_local_allocator.h
@@ -52,6 +52,7 @@ class ThreadLocalAllocatorImpl
   explicit ThreadLocalAllocatorImpl(const platform::Place& p);
   ThreadLocalAllocation* AllocateImpl(size_t size);
   void FreeImpl(ThreadLocalAllocation* allocation);
+  void ReleaseImpl();
 
  private:
   std::unique_ptr<memory::detail::BuddyAllocator> buddy_allocator_;
 
@@ -91,6 +92,9 @@ class ThreadLocalCUDAAllocator : public Allocator {
     auto allocator_impl = tl_allocation->GetAllocator();
     allocator_impl->FreeImpl(tl_allocation);
   }
+  void ReleaseImpl(const platform::Place& p) override {
+    return ThreadLocalCUDAAllocatorPool::Instance().Get(gpu_id_)->ReleaseImpl();
+  }
 
  private:
   int gpu_id_;
paddle/fluid/memory/allocation/thread_local_allocator_test.cc
@@ -62,6 +62,7 @@ TEST(ThreadLocalAllocator, cross_scope_release) {
         auto tl_allocator_impl =
             ThreadLocalCUDAAllocatorPool::Instance().Get(devices[j]);
         allocator_addresses[j][i] = tl_allocator_impl.get();
+        memory::Release(platform::CUDAPlace(devices[j]));
       }
     });
   }
paddle/fluid/memory/detail/buddy_allocator.cc
@@ -39,9 +39,10 @@ BuddyAllocator::~BuddyAllocator() {
   while (!pool_.empty()) {
     auto block = static_cast<MemoryBlock*>(std::get<2>(*pool_.begin()));
     auto desc = cache_.LoadDesc(block);
-    VLOG(10) << "Free from block (" << block << ", " << desc->get_size() << ")";
-    system_allocator_->Free(block, desc->get_size(), desc->get_index());
+    VLOG(10) << "Free from block (" << block << ", " << desc->get_total_size()
+             << ")";
+    system_allocator_->Free(block, desc->get_total_size(), desc->get_index());
     cache_.Invalidate(block);
     pool_.erase(pool_.begin());
   }
 
@@ -161,6 +162,39 @@ void BuddyAllocator::Free(void* p) {
       IndexSizeAddress(desc->get_index(), desc->get_total_size(), block));
 }
 
+void BuddyAllocator::Release() {
+  std::lock_guard<std::mutex> lock(mutex_);
+  int num = 0;
+  uint64_t bytes = 0;
+  bool del_flag = false;
+  for (auto iter = pool_.begin(); iter != pool_.end();) {
+    auto remain_size = std::get<1>(*iter);
+    auto remain_ptr = std::get<2>(*iter);
+    for (auto& chunk : chunks_) {
+      auto init_size = std::get<1>(chunk);
+      auto init_ptr = std::get<2>(chunk);
+
+      if (init_size == remain_size && init_ptr == remain_ptr) {
+        ++num;
+        bytes += init_size;
+        total_free_ -= init_size;
+        auto block = static_cast<MemoryBlock*>(std::get<2>(chunk));
+        system_allocator_->Free(init_ptr, init_size, std::get<0>(chunk));
+        cache_.Invalidate(block);
+        del_flag = true;
+        break;
+      }
+    }
+
+    if (del_flag) {
+      iter = pool_.erase(iter);
+    } else {
+      iter++;
+    }
+  }
+
+  VLOG(10) << "Release " << num << " chunk, Free " << bytes << " bytes.";
+}
+
 size_t BuddyAllocator::Used() { return total_used_; }
 size_t BuddyAllocator::GetMinChunkSize() { return min_chunk_size_; }
 size_t BuddyAllocator::GetMaxChunkSize() { return max_chunk_size_; }
 
@@ -213,6 +247,9 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool(
   total_free_ += allocate_bytes;
 
+  // record the chunk.
+  chunks_.insert(IndexSizeAddress(index, allocate_bytes, p));
+
   // dump the block into pool
   return pool_.insert(IndexSizeAddress(index, allocate_bytes, p)).first;
 }
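The new chunks_ set records every chunk obtained from the system allocator in RefillPool, and Release() hands a chunk back to the OS only when an entry with the same size and address is still sitting in the free pool, i.e. the whole chunk is idle and was never split. A stripped-down sketch of that matching idea (illustrative only, not Paddle code; it matches on the full tuple rather than on size and pointer separately):

#include <cstddef>
#include <set>
#include <tuple>

// Mirrors BuddyAllocator's IndexSizeAddress: (allocator index, size, pointer).
using IndexSizeAddress = std::tuple<size_t, size_t, void*>;

// Return how many bytes could be handed back: a pool entry that exactly
// matches a recorded chunk is a whole, idle chunk and can be freed to the OS.
size_t ReleaseIdleChunks(std::set<IndexSizeAddress>* pool,
                         const std::set<IndexSizeAddress>& chunks) {
  size_t released_bytes = 0;
  for (auto iter = pool->begin(); iter != pool->end();) {
    if (chunks.count(*iter) != 0) {
      released_bytes += std::get<1>(*iter);
      // A real allocator would call system_allocator->Free(ptr, size, index)
      // and invalidate the cached metadata here.
      iter = pool->erase(iter);
    } else {
      ++iter;
    }
  }
  return released_bytes;
}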
paddle/fluid/memory/detail/buddy_allocator.h
@@ -40,6 +40,8 @@ class BuddyAllocator {
  public:
   void* Alloc(size_t unaligned_size);
   void Free(void* ptr);
+  // Release the unused memory pool, a real free operation for the OS.
+  void Release();
   size_t Used();
   size_t GetMinChunkSize();
   size_t GetMaxChunkSize();
 
@@ -92,6 +94,11 @@ class BuddyAllocator {
    */
   PoolSet pool_;
 
+  /**
+   * \brief Record the allocated chunks when Refill pool.
+   */
+  PoolSet chunks_;
+
  private:
   /*! Unify the metadata format between GPU and CPU allocations */
   MetadataCache cache_;
paddle/fluid/memory/detail/buddy_allocator_test.cc
@@ -305,6 +305,23 @@ TEST(BuddyAllocator, SpeedAna) {
   std::cerr << "time cost " << diff.count() << std::endl;
 }
 
+TEST(BuddyAllocator, Release) {
+  // In a 8 GB machine, the pool size will be about 800 MB
+  FLAGS_fraction_of_gpu_memory_to_use = 0.1;
+  FLAGS_initial_gpu_memory_in_mb = 0;
+  FLAGS_reallocate_gpu_memory_in_mb = 0;
+
+  BuddyAllocator buddy_allocator(
+      std::unique_ptr<SystemAllocator>(new GPUAllocator(TEST_GPU_ID)),
+      platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
+
+  // Less than pool size
+  TestBuddyAllocator(&buddy_allocator, 10);
+  TestBuddyAllocator(&buddy_allocator, 10 << 10);
+  TestBuddyAllocator(&buddy_allocator, 50 << 20);
+
+  buddy_allocator.Release();
+}
+
 #endif
 
 }  // namespace detail
paddle/fluid/memory/malloc.cc
@@ -31,5 +31,9 @@ AllocationPtr Alloc(const platform::Place &place, size_t size) {
   return allocation::AllocatorFacade::Instance().Alloc(place, size);
 }
 
+void Release(const platform::Place &place) {
+  return allocation::AllocatorFacade::Instance().Release(place);
+}
+
 }  // namespace memory
 }  // namespace paddle
paddle/fluid/memory/malloc.h
@@ -38,5 +38,7 @@ extern AllocationPtr Alloc(const platform::Place& place, size_t size);
 extern AllocationPtr Alloc(const platform::DeviceContext& dev_ctx,
                            size_t size);
 
+extern void Release(const platform::Place& place);
+
 }  // namespace memory
 }  // namespace paddle
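Taken together, the commit threads a single "shrink the pool" operation through the stack: the new memory::Release(place) entry point forwards to AllocatorFacade::Release, which calls Allocator::Release/ReleaseImpl on the allocator for that place, and the buddy and auto-growth allocators respond by returning fully idle chunks to the system. A hedged sketch of how a caller might use the new entry point once an inference pass has finished (illustrative call site and function name, not part of this commit):

#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/place.h"

// After a burst of temporary allocations (e.g. one inference run), ask the
// allocators to hand idle chunks in their pools back to the OS / CUDA driver.
void ShrinkMemoryAfterRun() {
  paddle::memory::Release(paddle::platform::CPUPlace());
#ifdef PADDLE_WITH_CUDA
  paddle::memory::Release(paddle::platform::CUDAPlace(0 /* device id */));
#endif
}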