未验证 提交 8fe1cb72 编写于 作者: Y Yang 提交者: GitHub

optimize buddy_allocator (#38312)

上级 64e2f670
......@@ -13,18 +13,18 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/memory/detail/buddy_allocator.h"
#include <algorithm>
#include "gflags/gflags.h"
#include "glog/logging.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_MLU)
DECLARE_uint64(reallocate_gpu_memory_in_mb);
#endif
#ifdef PADDLE_WITH_ASCEND_CL
defined(PADDLE_WITH_MLU) || defined(PADDLE_WITH_ASCEND_CL)
#define USE_DEVICE
DECLARE_uint64(reallocate_gpu_memory_in_mb);
#endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
......@@ -180,33 +180,24 @@ uint64_t BuddyAllocator::Release() {
std::lock_guard<std::mutex> lock(mutex_);
int num = 0;
uint64_t bytes = 0;
bool del_flag = false;
for (auto iter = pool_.begin(); iter != pool_.end();) {
auto remain_size = std::get<1>(*iter);
auto remain_ptr = std::get<2>(*iter);
for (auto& chunk : chunks_) {
auto init_size = std::get<1>(chunk);
auto init_ptr = std::get<2>(chunk);
if (init_size == remain_size && init_ptr == remain_ptr) {
++num;
bytes += init_size;
total_free_ -= init_size;
auto block = static_cast<MemoryBlock*>(std::get<2>(chunk));
system_allocator_->Free(init_ptr, init_size, std::get<0>(chunk));
cache_.Invalidate(block);
del_flag = true;
break;
}
}
if (del_flag) {
auto found = chunks_.find({remain_size, remain_ptr});
if (found != chunks_.end()) {
size_t index = found->second;
++num;
bytes += remain_size;
total_free_ -= remain_size;
auto block = static_cast<MemoryBlock*>(remain_ptr);
system_allocator_->Free(remain_ptr, remain_size, index);
cache_.Invalidate(block);
iter = pool_.erase(iter);
} else {
iter++;
}
}
VLOG(10) << "Release " << num << " chunk, Free " << bytes << " bytes.";
VLOG(10) << "Release " << num << " chunks, Free " << bytes << " bytes.";
return bytes;
}
......@@ -234,49 +225,15 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool(
size_t index = 0;
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (system_allocator_->UseGpu()) {
if ((total_used_ + total_free_) == 0) {
// Compute the allocation size for gpu for the first allocation.
allocate_bytes = std::max(platform::GpuInitAllocSize(), request_bytes);
} else {
// Compute the re-allocation size, we store the re-allocation size when
// user set FLAGS_reallocate_gpu_memory_in_mb to fix value.
if (realloc_size_ == 0 || FLAGS_reallocate_gpu_memory_in_mb == 0ul) {
realloc_size_ = platform::GpuReallocSize();
}
allocate_bytes = std::max(realloc_size_, request_bytes);
}
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
if (system_allocator_->UseGpu()) {
if ((total_used_ + total_free_) == 0) {
// Compute the allocation size for gpu for the first allocation.
allocate_bytes = std::max(platform::NPUInitAllocSize(), request_bytes);
} else {
// Compute the re-allocation size, we store the re-allocation size when
// user set FLAGS_reallocate_gpu_memory_in_mb to fix value.
if (realloc_size_ == 0 || FLAGS_reallocate_gpu_memory_in_mb == 0ul) {
realloc_size_ = platform::NPUReallocSize();
}
allocate_bytes = std::max(realloc_size_, request_bytes);
}
}
#endif
#ifdef PADDLE_WITH_MLU
if (system_allocator_->UseGpu()) {
if ((total_used_ + total_free_) == 0) {
// Compute the allocation size for mlu for the first allocation.
allocate_bytes = std::max(platform::MLUInitAllocSize(), request_bytes);
} else {
// Compute the re-allocation size, we store the re-allocation size when
// user set FLAGS_reallocate_gpu_memory_in_mb to fix value.
if (realloc_size_ == 0 || FLAGS_reallocate_gpu_memory_in_mb == 0ul) {
realloc_size_ = platform::MLUReallocSize();
}
allocate_bytes = std::max(realloc_size_, request_bytes);
}
}
allocate_bytes = DeviceAllocateSize(&platform::GpuInitAllocSize,
&platform::GpuReallocSize, request_bytes);
#elif defined(PADDLE_WITH_ASCEND_CL)
allocate_bytes = DeviceAllocateSize(&platform::NPUInitAllocSize,
&platform::NPUReallocSize, request_bytes);
#elif defined(PADDLE_WITH_MLU)
allocate_bytes =
    DeviceAllocateSize(&platform::MLUInitAllocSize,
                       &platform::MLUReallocSize, request_bytes);
#endif
// Allocate a new block
......@@ -293,7 +250,7 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool(
total_free_ += allocate_bytes;
// record the chunk.
chunks_.insert(IndexSizeAddress(index, allocate_bytes, p));
chunks_.insert({{allocate_bytes, p}, index});
// dump the block into pool
return pool_.insert(IndexSizeAddress(index, allocate_bytes, p)).first;
......@@ -350,6 +307,31 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it,
return block;
}
/// Decides how many bytes to request from the system allocator when the
/// pool is refilled on a device build (CUDA/HIP/NPU/MLU).
///
/// \param init_allocate_size_func  yields the size to use for the very first
///        device allocation (when nothing has been allocated yet).
/// \param re_allocate_size_func    yields the growth size for every later
///        refill; cached in realloc_size_ once resolved.
/// \param request_bytes            the size the caller actually asked for.
/// \return the number of bytes to allocate — never less than request_bytes
///         on the device path; plain max_chunk_size_ otherwise.
size_t BuddyAllocator::DeviceAllocateSize(
    std::function<size_t()> init_allocate_size_func,
    std::function<size_t()> re_allocate_size_func, size_t request_bytes) {
#if defined(USE_DEVICE)
  const bool on_device = system_allocator_->UseGpu();
  VLOG(10) << "use_gpu " << on_device << ", total_used " << total_used_
           << ", total_free " << total_free_;
  if (on_device) {
    if (total_used_ == 0 && total_free_ == 0) {
      // Nothing allocated yet: size the first chunk from the initial-alloc
      // policy, but never below what was requested.
      return std::max(init_allocate_size_func(), request_bytes);
    }
    // Re-resolve the growth size while it is unset, or whenever the user
    // left FLAGS_reallocate_gpu_memory_in_mb at its 0 default.
    if (realloc_size_ == 0 || FLAGS_reallocate_gpu_memory_in_mb == 0ul) {
      realloc_size_ = re_allocate_size_func();
    }
    return std::max(realloc_size_, request_bytes);
  }
#endif
  // Non-device allocators fall back to the fixed maximum chunk size.
  return max_chunk_size_;
}
} // namespace detail
} // namespace memory
} // namespace paddle
......@@ -15,11 +15,14 @@ limitations under the License. */
#pragma once
#include <stdint.h>
#include <functional>
#include <map>
#include <memory>
#include <mutex> // NOLINT
#include <set>
#include <tuple>
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/memory/detail/memory_block.h"
......@@ -59,6 +62,9 @@ class BuddyAllocator {
using IndexSizeAddress = std::tuple<size_t, size_t, void*>;
// Each element in PoolSet is a free allocation
using PoolSet = std::set<IndexSizeAddress>;
// Each element in PoolMap is an allocation record
// key: <size, ptr>, value: index
using PoolMap = std::map<std::pair<size_t, void*>, size_t>;
/*! \brief Allocate fixed-size memory from system */
void* SystemAlloc(size_t size);
......@@ -80,6 +86,11 @@ class BuddyAllocator {
/*! \brief Find the existing chunk which used to allocation */
PoolSet::iterator FindExistChunk(size_t size);
/*! \brief Allocate bytes from the device */
size_t DeviceAllocateSize(std::function<size_t()> init_allocate_size_func,
std::function<size_t()> re_allocate_size_func,
size_t request_bytes);
private:
size_t total_used_ = 0; // the total size of used memory
size_t total_free_ = 0; // the total size of free memory
......@@ -102,7 +113,7 @@ class BuddyAllocator {
/**
* \brief Record the allocated chunks when Refill pool.
*/
PoolSet chunks_;
PoolMap chunks_;
private:
/*! Unify the metadata format between GPU and CPU allocations */
......
......@@ -189,6 +189,35 @@ TEST(BuddyAllocator, FractionRefillPool) {
buddy_allocator.Free(p1);
}
// When both FLAGS_initial_gpu_memory_in_mb and
// FLAGS_reallocate_gpu_memory_in_mb are pinned to a fixed size, every pool
// refill should request exactly that many MB and the allocator's max chunk
// size must stay stable across successive allocations.
TEST(BuddyAllocator, DeviceRefillPool) {
  const size_t chunk_mb = 10;
  const size_t chunk_bytes = chunk_mb << 20;
  FLAGS_initial_gpu_memory_in_mb = chunk_mb;
  FLAGS_reallocate_gpu_memory_in_mb = chunk_mb;
  EXPECT_EQ(platform::GpuMaxChunkSize(), chunk_bytes);

  size_t max_chunk_size = platform::GpuMaxChunkSize();
  BuddyAllocator buddy_allocator(
      std::unique_ptr<SystemAllocator>(new GPUAllocator(TEST_GPU_ID)),
      platform::GpuMinChunkSize(), max_chunk_size);

  int* first_ptr = TestBuddyAllocator(&buddy_allocator, chunk_bytes - 1000,
                                      /* use_system_allocator = */ false,
                                      /* free_ptr = */ false);
  // Max chunk size must not drift after the first allocation.
  EXPECT_EQ(max_chunk_size, buddy_allocator.GetMaxChunkSize());

  int* second_ptr = TestBuddyAllocator(&buddy_allocator, chunk_bytes - 1000,
                                       /* use_system_allocator = */ false,
                                       /* free_ptr = */ false);
  // ... nor after a second refill of the pool.
  EXPECT_EQ(max_chunk_size, buddy_allocator.GetMaxChunkSize());

  buddy_allocator.Free(first_ptr);
  buddy_allocator.Free(second_ptr);
}
TEST(BuddyAllocator, AllocFromAvailable) {
FLAGS_fraction_of_gpu_memory_to_use = 0.7;
FLAGS_initial_gpu_memory_in_mb = 0;
......@@ -350,7 +379,6 @@ TEST(BuddyAllocator, Release) {
#ifdef PADDLE_WITH_ASCEND_CL
TEST(BuddyAllocator, NpuFraction) {
// In a 16 GB machine, the pool size will be about 160 MB
FLAGS_fraction_of_gpu_memory_to_use = 0.005;
FLAGS_fraction_of_gpu_memory_to_use = 0.92;
FLAGS_initial_gpu_memory_in_mb = 0;
FLAGS_reallocate_gpu_memory_in_mb = 0;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册