add auto increment best fit allocator

test=develop

add auto increment best fit allocator
test=develop
e893cbd2 · sneaxiy · a7a4f053 · e893cbd2 · e893cbd2 · e893cbd2
7 changed files
--- a/paddle/fluid/memory/allocation/CMakeLists.txt
+++ b/paddle/fluid/memory/allocation/CMakeLists.txt
@@ -8,6 +8,9 @@ cc_library(legacy_allocator SRCS legacy_allocator.cc DEPS allocator buddy_alloca
 cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator buffered_allocator cpu_allocator)
 cc_test(multi_bin_buffered_allocator_test SRCS multi_bin_buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator multi_bin_buffered_allocator cpu_allocator)
+cc_library(auto_increment_best_fit_allocator SRCS auto_increment_best_fit_allocator.cc DEPS allocator)
+cc_test(auto_increment_best_fit_allocator_test SRCS auto_increment_best_fit_allocator_test.cc DEPS cpu_allocator auto_increment_best_fit_allocator)
 if (WITH_GPU)
  nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
 endif()
@@ -56,6 +59,7 @@ cc_library(allocator_facade SRCS allocator_facade.cc DEPS
        retry_allocator
        buffered_allocator
        multi_bin_buffered_allocator
+        auto_increment_best_fit_allocator
        allocator_strategy
        legacy_allocator
        )

--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -22,6 +22,7 @@
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/fluid/memory/allocation/allocator_strategy.h"
 #include "paddle/fluid/memory/allocation/auto_increment_allocator.h"
+#include "paddle/fluid/memory/allocation/auto_increment_best_fit_allocator.h"
 #include "paddle/fluid/memory/allocation/best_fit_allocator.h"
 #include "paddle/fluid/memory/allocation/conditional_allocator.h"
 #include "paddle/fluid/memory/allocation/cpu_allocator.h"
@@ -195,17 +196,57 @@ class AllocatorFacadePrivate {
  ~AllocatorFacadePrivate() = default;
  AllocatorFacadePrivate() {
-    if (GetAllocatorStrategy() == AllocatorStrategy::kLegacy) {
+    // Build the allocator set that matches the strategy reported by
+    // GetAllocatorStrategy().
+    auto strategy = GetAllocatorStrategy();
+    switch (strategy) {
+      case AllocatorStrategy::kLegacy: {
        InitLegacyAllocator();
-    } else {
+        break;
+      }
+      case AllocatorStrategy::kNaiveBestFit: {
        InitCPUAllocator();
        InitCUDAAllocator();
        InitCUDAPinnedAllocator();
        WrapZeroSizeAllocator();
+        break;
+      }
+      // Same CPU allocator and zero-size wrapping as kNaiveBestFit, but the
+      // CUDA and CUDA-pinned places are set up by the InitAutoGrowth*
+      // initializers.
+      case AllocatorStrategy::kAutoGrowthBestFit: {
+        InitCPUAllocator();
+        InitAutoGrowthCUDAAllocator();
+        InitAutoGrowthCUDAPinnedAllocator();
+        WrapZeroSizeAllocator();
+        break;
+      }
+      // Any other enum value is reported through PADDLE_THROW together with
+      // its integer value.
+      default: {
+        PADDLE_THROW("Unsupported allocator strategy: %d",
+                     static_cast<int>(strategy));
+      }
    }
  }
 private:
+  // Registers an AutoIncrementBestFitAllocator for every device index in
+  // [0, platform::GetCUDADeviceCount()). The body is empty when
+  // PADDLE_WITH_CUDA is not defined.
+  void InitAutoGrowthCUDAAllocator() {
+#ifdef PADDLE_WITH_CUDA
+    const int device_count = platform::GetCUDADeviceCount();
+    for (int idx = 0; idx < device_count; ++idx) {
+      const platform::CUDAPlace place(idx);
+      // The device allocator is wrapped in AlignedAllocator<4096> and then
+      // used as the chunk source of the best-fit allocator.
+      auto chunk_source = std::make_shared<AlignedAllocator<4096>>(
+          std::make_shared<CUDAAllocator>(place));
+      allocators_[place] = std::make_shared<AutoIncrementBestFitAllocator>(
+          chunk_source, platform::GpuMaxChunkSize(), 4096);
+    }
+#endif
+  }
+  // Registers an AutoIncrementBestFitAllocator for platform::CUDAPinnedPlace().
+  // The body is empty when PADDLE_WITH_CUDA is not defined.
+  void InitAutoGrowthCUDAPinnedAllocator() {
+#ifdef PADDLE_WITH_CUDA
+    // CPUPinnedAllocator wrapped in AlignedAllocator<4096> is the chunk source.
+    auto chunk_source = std::make_shared<AlignedAllocator<4096>>(
+        std::make_shared<CPUPinnedAllocator>());
+    auto pinned_best_fit = std::make_shared<AutoIncrementBestFitAllocator>(
+        chunk_source, platform::CUDAPinnedMaxChunkSize(), 4096);
+    allocators_[platform::CUDAPinnedPlace()] = pinned_best_fit;
+#endif
+  }
  void InitLegacyAllocator() {
    std::vector<platform::Place> places{platform::CPUPlace()};
 #ifdef PADDLE_WITH_CUDA

--- a/paddle/fluid/memory/allocation/allocator_strategy.cc
+++ b/paddle/fluid/memory/allocation/allocator_strategy.cc
@@ -14,6 +14,7 @@
 #include "paddle/fluid/memory/allocation/allocator_strategy.h"
 #include "gflags/gflags.h"
+#include "paddle/fluid/platform/enforce.h"
 DEFINE_string(
    allocator_strategy, "legacy",
@@ -25,9 +26,16 @@ namespace memory {
 namespace allocation {
 static AllocatorStrategy GetStrategyFromFlag() {
-  return FLAGS_allocator_strategy == "legacy"
+  // Translates FLAGS_allocator_strategy into an AllocatorStrategy value.
+  // Values that match none of the known names raise through PADDLE_THROW.
+  if (FLAGS_allocator_strategy == "legacy") {
-             ? AllocatorStrategy::kLegacy
+    return AllocatorStrategy::kLegacy;
-             : AllocatorStrategy::kNaiveBestFit;
+  } else if (FLAGS_allocator_strategy == "naive_best_fit" ||
+             FLAGS_allocator_strategy == "navie_best_fit") {
+    // "navie_best_fit" is a misspelling of "naive_best_fit"; it is still
+    // accepted so that configurations written with it keep working.
+    return AllocatorStrategy::kNaiveBestFit;
+  } else if (FLAGS_allocator_strategy == "auto_growth_best_fit") {
+    return AllocatorStrategy::kAutoGrowthBestFit;
+  } else {
+    PADDLE_THROW("Unsupported allocator strategy: %s",
+                 FLAGS_allocator_strategy);
+  }
 }
 AllocatorStrategy GetAllocatorStrategy() {

--- a/paddle/fluid/memory/allocation/allocator_strategy.h
+++ b/paddle/fluid/memory/allocation/allocator_strategy.h
@@ -18,7 +18,7 @@ namespace paddle {
 namespace memory {
 namespace allocation {
-enum class AllocatorStrategy { kLegacy, kNaiveBestFit };
+// Allocation strategies that can be selected through the allocator_strategy
+// flag; GetAllocatorStrategy() reports the one in effect.
+enum class AllocatorStrategy { kLegacy, kNaiveBestFit, kAutoGrowthBestFit };
 extern AllocatorStrategy GetAllocatorStrategy();

--- a/paddle/fluid/memory/allocation/auto_increment_best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/auto_increment_best_fit_allocator.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/memory/allocation/auto_increment_best_fit_allocator.h"
+#include <algorithm>
+#include <list>
+#include <map>
+#include <memory>
+#include <mutex>  // NOLINT
+#include <unordered_map>
+namespace paddle {
+namespace memory {
+namespace allocation {
+// Rounds `size` up to the nearest multiple of `alignment`; a size that is
+// already a multiple is returned unchanged.
+static size_t align(size_t size, size_t alignment) {
+  const size_t overshoot = size % alignment;
+  if (overshoot == 0) {
+    return size;
+  }
+  return size + alignment - overshoot;
+}
+// Keeps a reference to the chunk source and remembers the alignment. The
+// chunk size is stored after being rounded up to a multiple of `alignment`.
+AutoIncrementBestFitAllocator::AutoIncrementBestFitAllocator(
+    const std::shared_ptr<Allocator> &underlying_allocator, size_t chunk_size,
+    size_t alignment)
+    : underlying_allocator_{underlying_allocator},
+      chunk_size_{align(chunk_size, alignment)},
+      alignment_{alignment} {}
+// Hands out a block of `size` bytes rounded up to a multiple of alignment_;
+// returns nullptr for a zero-byte request.
+//
+// The smallest free block that can hold the request is reused and, when it is
+// larger than needed, split so that its low-address part stays free. If no
+// free block fits, a new chunk is taken from underlying_allocator_:
+// chunk_size_ bytes when that is larger than the request and no chunk-sized
+// request has failed before, otherwise exactly the rounded-up request. A
+// BadAlloc raised for an exact-size request propagates to the caller.
+Allocation *AutoIncrementBestFitAllocator::AllocateImpl(size_t size,
+                                                        Attr attr) {
+  if (size == 0) return nullptr;
+  size = align(size, alignment_);
+  std::lock_guard<std::mutex> guard(mtx_);
+  // free_blocks_ is keyed by (size, address), so lower_bound yields the
+  // smallest free block whose size is >= the request.
+  auto iter = free_blocks_.lower_bound(std::make_pair(size, nullptr));
+  BlockIt block_it;
+  if (iter != free_blocks_.end()) {
+    VLOG(2) << "Found " << iter->second->size_ << " for " << size;
+    block_it = iter->second;
+    free_blocks_.erase(iter);
+    auto *chunk = block_it->chunk_;
+    size_t remaining_size = block_it->size_ - size;
+    if (remaining_size == 0) {
+      // Exact fit: hand out the whole block.
+      block_it->is_free_ = false;
+      VLOG(2) << "Found and no remaining";
+    } else {
+      // Split: a new free block covering the low `remaining_size` bytes is
+      // inserted in front of block_it, and block_it shrinks to the high
+      // `size` bytes that are handed out.
+      auto remaining_free_block = chunk->blocks_.insert(
+          block_it, Chunk::Block(block_it->ptr_, remaining_size, true, chunk));
+      free_blocks_.emplace(std::make_pair(remaining_size, block_it->ptr_),
+                           remaining_free_block);
+      block_it->ptr_ =
+          reinterpret_cast<uint8_t *>(block_it->ptr_) + remaining_size;
+      block_it->size_ = size;
+      block_it->is_free_ = false;
+      VLOG(2) << "Found and remaining " << remaining_size;
+    }
+  } else {
+    size_t alloc_size = size;
+    if (!underlying_allocator_exhaustive_ && chunk_size_ > size) {
+      alloc_size = chunk_size_;
+    }
+    try {
+      chunks_.emplace_back(underlying_allocator_->Allocate(alloc_size, attr));
+    } catch (BadAlloc &) {
+      // Nothing smaller to fall back to: rethrow the exception object that
+      // was caught instead of throwing a copy of it.
+      if (size == alloc_size) throw;
+      // The chunk-sized request failed; from now on only ask the underlying
+      // allocator for exactly what is needed.
+      underlying_allocator_exhaustive_ = true;
+      alloc_size = size;
+      chunks_.emplace_back(underlying_allocator_->Allocate(alloc_size, attr));
+    }
+    auto *chunk = &(*chunks_.rbegin());
+    uint8_t *p = reinterpret_cast<uint8_t *>(chunk->allocation_->ptr());
+    auto &blocks = chunk->blocks_;
+    size_t remaining_size = alloc_size - size;
+    // Same layout as the split above: the free remainder sits at the start of
+    // the chunk and the handed-out block at its end.
+    if (remaining_size > 0) {
+      blocks.emplace_back(p, remaining_size, true, chunk);
+      free_blocks_.emplace(std::make_pair(remaining_size, p), --(blocks.end()));
+    }
+    blocks.emplace_back(p + remaining_size, size, false, chunk);
+    block_it = --(blocks.end());
+    VLOG(2) << "Not found and allocate " << alloc_size << ", and remaining "
+            << remaining_size;
+  }
+  VLOG(2) << "After allocate, free blocks " << free_blocks_.size();
+  return new Chunk::BlockAllocation(block_it);
+}
+// Returns the block behind `allocation` to the free pool and deletes
+// `allocation`. The block is merged with a free predecessor and/or a free
+// successor from the same chunk, and the resulting block is indexed in
+// free_blocks_. Chunks are not handed back to the underlying allocator here.
+void AutoIncrementBestFitAllocator::FreeImpl(Allocation *allocation) {
+  // block_it aliases the iterator stored inside the allocation object, so the
+  // reassignment below also updates that stored iterator.
+  auto &block_it = static_cast<Chunk::BlockAllocation *>(allocation)->block_it_;
+  auto &blocks = block_it->chunk_->blocks_;
+  std::lock_guard<std::mutex> guard(mtx_);
+  block_it->is_free_ = true;
+  if (block_it != blocks.begin()) {
+    auto prev_it = block_it;
+    --prev_it;
+    if (prev_it->is_free_) {
+      // The predecessor absorbs this block. Its free_blocks_ entry is removed
+      // first because the key contains the size that is about to change.
+      free_blocks_.erase(std::make_pair(prev_it->size_, prev_it->ptr_));
+      prev_it->size_ += block_it->size_;
+      blocks.erase(block_it);
+      block_it = prev_it;
+    }
+  }
+  auto next_it = block_it;
+  ++next_it;
+  if (next_it != blocks.end() && next_it->is_free_) {
+    // Absorb the free successor into the current block.
+    free_blocks_.erase(std::make_pair(next_it->size_, next_it->ptr_));
+    block_it->size_ += next_it->size_;
+    blocks.erase(next_it);
+  }
+  // Index the (possibly merged) block under its final size and address.
+  free_blocks_.emplace(std::make_pair(block_it->size_, block_it->ptr_),
+                       block_it);
+  VLOG(2) << "Combine " << block_it->size_ << ", " << blocks.size() << ", "
+          << free_blocks_.size();
+  delete allocation;
+}
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
--- a/paddle/fluid/memory/allocation/auto_increment_best_fit_allocator.h
+++ b/paddle/fluid/memory/allocation/auto_increment_best_fit_allocator.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <list>
+#include <map>
+#include <memory>
+#include <mutex>  // NOLINT
+#include <utility>
+#include "paddle/fluid/memory/allocation/allocator.h"
+namespace paddle {
+namespace memory {
+namespace allocation {
+// Best-fit allocator layered on top of another Allocator. Memory is obtained
+// from the underlying allocator in chunks; each chunk is divided into blocks
+// that AllocateImpl hands out and splits and that FreeImpl merges again.
+class AutoIncrementBestFitAllocator : public Allocator {
+ public:
+  // underlying_allocator: source of chunk memory.
+  // chunk_size: preferred size of a newly requested chunk; stored rounded up
+  //             to a multiple of `alignment`.
+  // alignment: request sizes are rounded up to a multiple of this value.
+  explicit AutoIncrementBestFitAllocator(
+      const std::shared_ptr<Allocator> &underlying_allocator, size_t chunk_size,
+      size_t alignment);
+  // AllocateImpl and FreeImpl lock mtx_ before touching the shared containers.
+  bool IsAllocThreadSafe() const override { return true; }
+  using AllocationList = std::list<AllocationPtr>;
+  using AllocationListIt = AllocationList::iterator;
+  // One allocation obtained from the underlying allocator, together with the
+  // list of blocks it is currently divided into.
+  struct Chunk {
+    // A contiguous piece of a chunk that is either handed out or free.
+    struct Block {
+      Block(void *ptr, size_t size, bool is_free, Chunk *chunk)
+          : ptr_(ptr), size_(size), is_free_(is_free), chunk_(chunk) {}
+      // Start address of the block.
+      void *ptr_;
+      // Size of the block in bytes.
+      size_t size_;
+      // True while the block is available for allocation.
+      bool is_free_;
+      Chunk *chunk_;  // which chunk it is from
+    };
+    explicit Chunk(AllocationPtr allocation)
+        : allocation_(std::move(allocation)) {}
+    // Owning handle for the chunk memory from the underlying allocator.
+    AllocationPtr allocation_;
+    // Blocks of this chunk; new blocks are inserted so that the list stays in
+    // address order.
+    std::list<Block> blocks_;
+    // std::mutex mtx_;
+    // Allocation handle returned to callers. It remembers the list position
+    // of its block so that FreeImpl can reach the block and its neighbours.
+    struct BlockAllocation : public Allocation {
+      explicit BlockAllocation(const std::list<Block>::iterator &it)
+          : Allocation(it->ptr_, it->size_, it->chunk_->allocation_->place()),
+            block_it_(it) {}
+      std::list<Block>::iterator block_it_;
+    };
+  };
+ protected:
+  Allocation *AllocateImpl(size_t size, Attr attr) override;
+  void FreeImpl(Allocation *allocation) override;
+ private:
+  using BlockIt = std::list<Chunk::Block>::iterator;
+  // Source of chunk memory.
+  std::shared_ptr<Allocator> underlying_allocator_;
+  // All chunks obtained so far. Blocks keep raw Chunk pointers into this list.
+  std::list<Chunk> chunks_;
+  // Free blocks keyed by (size, start address) for best-fit lookup.
+  std::map<std::pair<size_t, void *>, BlockIt> free_blocks_;
+  // Preferred chunk size, already rounded up to a multiple of alignment_.
+  size_t chunk_size_;
+  size_t alignment_;
+  // Set once a chunk_size_ request to the underlying allocator has failed;
+  // afterwards new chunks are requested with exactly the needed size.
+  bool underlying_allocator_exhaustive_{false};
+  // Guards chunks_, free_blocks_ and the per-chunk block lists.
+  mutable std::mutex mtx_;
+};
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
--- a/paddle/fluid/memory/allocation/auto_increment_best_fit_allocator_test.cc
+++ b/paddle/fluid/memory/allocation/auto_increment_best_fit_allocator_test.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <gflags/gflags.h>
+#include <gtest/gtest.h>
+#include <condition_variable>  // NOLINT
+#include <mutex>               // NOLINT
+#include <thread>              // NOLINT
+#include <vector>
+#include <iostream>
+#include "paddle/fluid/memory/allocation/auto_increment_best_fit_allocator.h"
+#include "paddle/fluid/memory/allocation/cpu_allocator.h"
+namespace paddle {
+namespace memory {
+namespace allocation {
+// Exercises one shared AutoIncrementBestFitAllocator from several worker
+// threads plus the calling thread. The workers block on a condition variable
+// until all of them have been created, so their allocations start together.
+TEST(allocator, auto_increment_best_fit_allocator) {
+  auto cpu_allocator = std::make_shared<CPUAllocator>();
+  // chunk_size 0: every new chunk is sized to the request that triggers it.
+  auto allocator =
+      std::make_shared<AutoIncrementBestFitAllocator>(cpu_allocator, 0, 4096);
+  std::mutex mtx;
+  std::condition_variable cv;
+  bool flag = false;
+  auto thread_main = [&] {
+    {
+      std::unique_lock<std::mutex> lock(mtx);
+      cv.wait(lock, [&] { return flag; });
+    }
+    // Request sizes from 11000 down to 2000 bytes.
+    for (size_t i = 10; i > 0; --i) {
+      allocator->Allocate((i + 1) * 1000);
+    }
+  };
+  std::vector<std::thread> ths;
+  // Spawn 10 workers so the allocator really is used concurrently.
+  for (size_t i = 0; i < 10; ++i) {
+    ths.emplace_back(thread_main);
+  }
+  {
+    std::lock_guard<std::mutex> lock(mtx);
+    flag = true;
+  }
+  cv.notify_all();
+  // The calling thread runs the same workload alongside the workers.
+  thread_main();
+  for (auto &th : ths) {
+    th.join();
+  }
+  std::cout << "test ends" << std::endl;
+}
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle