提交 e893cbd2 编写于 作者: S sneaxiy

add auto increment best fit allocator

test=develop
上级 a7a4f053
...@@ -8,6 +8,9 @@ cc_library(legacy_allocator SRCS legacy_allocator.cc DEPS allocator buddy_alloca ...@@ -8,6 +8,9 @@ cc_library(legacy_allocator SRCS legacy_allocator.cc DEPS allocator buddy_alloca
cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator buffered_allocator cpu_allocator) cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator buffered_allocator cpu_allocator)
cc_test(multi_bin_buffered_allocator_test SRCS multi_bin_buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator multi_bin_buffered_allocator cpu_allocator) cc_test(multi_bin_buffered_allocator_test SRCS multi_bin_buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator multi_bin_buffered_allocator cpu_allocator)
cc_library(auto_increment_best_fit_allocator SRCS auto_increment_best_fit_allocator.cc DEPS allocator)
cc_test(auto_increment_best_fit_allocator_test SRCS auto_increment_best_fit_allocator_test.cc DEPS cpu_allocator auto_increment_best_fit_allocator)
if (WITH_GPU) if (WITH_GPU)
nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard) nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
endif() endif()
...@@ -56,6 +59,7 @@ cc_library(allocator_facade SRCS allocator_facade.cc DEPS ...@@ -56,6 +59,7 @@ cc_library(allocator_facade SRCS allocator_facade.cc DEPS
retry_allocator retry_allocator
buffered_allocator buffered_allocator
multi_bin_buffered_allocator multi_bin_buffered_allocator
auto_increment_best_fit_allocator
allocator_strategy allocator_strategy
legacy_allocator legacy_allocator
) )
......
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
#include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/memory/allocation/allocator_strategy.h" #include "paddle/fluid/memory/allocation/allocator_strategy.h"
#include "paddle/fluid/memory/allocation/auto_increment_allocator.h" #include "paddle/fluid/memory/allocation/auto_increment_allocator.h"
#include "paddle/fluid/memory/allocation/auto_increment_best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/best_fit_allocator.h" #include "paddle/fluid/memory/allocation/best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/conditional_allocator.h" #include "paddle/fluid/memory/allocation/conditional_allocator.h"
#include "paddle/fluid/memory/allocation/cpu_allocator.h" #include "paddle/fluid/memory/allocation/cpu_allocator.h"
...@@ -195,17 +196,57 @@ class AllocatorFacadePrivate { ...@@ -195,17 +196,57 @@ class AllocatorFacadePrivate {
~AllocatorFacadePrivate() = default; ~AllocatorFacadePrivate() = default;
AllocatorFacadePrivate() { AllocatorFacadePrivate() {
if (GetAllocatorStrategy() == AllocatorStrategy::kLegacy) { auto strategy = GetAllocatorStrategy();
switch (strategy) {
case AllocatorStrategy::kLegacy: {
InitLegacyAllocator(); InitLegacyAllocator();
} else { break;
}
case AllocatorStrategy::kNaiveBestFit: {
InitCPUAllocator(); InitCPUAllocator();
InitCUDAAllocator(); InitCUDAAllocator();
InitCUDAPinnedAllocator(); InitCUDAPinnedAllocator();
WrapZeroSizeAllocator(); WrapZeroSizeAllocator();
break;
}
case AllocatorStrategy::kAutoGrowthBestFit: {
InitCPUAllocator();
InitAutoGrowthCUDAAllocator();
InitAutoGrowthCUDAPinnedAllocator();
WrapZeroSizeAllocator();
break;
}
default: {
PADDLE_THROW("Unsupported allocator strategy: %d",
static_cast<int>(strategy));
}
} }
} }
private: private:
void InitAutoGrowthCUDAAllocator() {
#ifdef PADDLE_WITH_CUDA
int dev_cnt = platform::GetCUDADeviceCount();
for (int dev_id = 0; dev_id < dev_cnt; ++dev_id) {
auto cuda_allocator = std::make_shared<AlignedAllocator<4096>>(
std::make_shared<CUDAAllocator>(platform::CUDAPlace(dev_id)));
allocators_[platform::CUDAPlace(dev_id)] =
std::make_shared<AutoIncrementBestFitAllocator>(
cuda_allocator, platform::GpuMaxChunkSize(), 4096);
}
#endif
}
void InitAutoGrowthCUDAPinnedAllocator() {
#ifdef PADDLE_WITH_CUDA
auto cuda_pinned_allocator = std::make_shared<AlignedAllocator<4096>>(
std::make_shared<CPUPinnedAllocator>());
allocators_[platform::CUDAPinnedPlace()] =
std::make_shared<AutoIncrementBestFitAllocator>(
cuda_pinned_allocator, platform::CUDAPinnedMaxChunkSize(), 4096);
#endif
}
void InitLegacyAllocator() { void InitLegacyAllocator() {
std::vector<platform::Place> places{platform::CPUPlace()}; std::vector<platform::Place> places{platform::CPUPlace()};
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "paddle/fluid/memory/allocation/allocator_strategy.h" #include "paddle/fluid/memory/allocation/allocator_strategy.h"
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "paddle/fluid/platform/enforce.h"
DEFINE_string( DEFINE_string(
allocator_strategy, "legacy", allocator_strategy, "legacy",
...@@ -25,9 +26,16 @@ namespace memory { ...@@ -25,9 +26,16 @@ namespace memory {
namespace allocation { namespace allocation {
static AllocatorStrategy GetStrategyFromFlag() { static AllocatorStrategy GetStrategyFromFlag() {
return FLAGS_allocator_strategy == "legacy" if (FLAGS_allocator_strategy == "legacy") {
? AllocatorStrategy::kLegacy return AllocatorStrategy::kLegacy;
: AllocatorStrategy::kNaiveBestFit; } else if (FLAGS_allocator_strategy == "navie_best_fit") {
return AllocatorStrategy::kNaiveBestFit;
} else if (FLAGS_allocator_strategy == "auto_growth_best_fit") {
return AllocatorStrategy::kAutoGrowthBestFit;
} else {
PADDLE_THROW("Unsupported allocator strategy: %s",
FLAGS_allocator_strategy);
}
} }
AllocatorStrategy GetAllocatorStrategy() { AllocatorStrategy GetAllocatorStrategy() {
......
...@@ -18,7 +18,7 @@ namespace paddle { ...@@ -18,7 +18,7 @@ namespace paddle {
namespace memory { namespace memory {
namespace allocation { namespace allocation {
enum class AllocatorStrategy { kLegacy, kNaiveBestFit }; enum class AllocatorStrategy { kLegacy, kNaiveBestFit, kAutoGrowthBestFit };
extern AllocatorStrategy GetAllocatorStrategy(); extern AllocatorStrategy GetAllocatorStrategy();
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/auto_increment_best_fit_allocator.h"
#include <algorithm>
#include <list>
#include <map>
#include <memory>
#include <mutex> // NOLINT
#include <unordered_map>
namespace paddle {
namespace memory {
namespace allocation {
// Rounds `size` up to the nearest multiple of `alignment`.
// A `size` that is already a multiple is returned unchanged.
// `alignment` must be non-zero (it is used as a modulus).
static size_t align(size_t size, size_t alignment) {
  const size_t overshoot = size % alignment;
  if (overshoot == 0) {
    return size;
  }
  // Pad with the distance to the next multiple.
  return size + alignment - overshoot;
}
AutoIncrementBestFitAllocator::AutoIncrementBestFitAllocator(
const std::shared_ptr<Allocator> &underlying_allocator, size_t chunk_size,
size_t alignment)
: underlying_allocator_(underlying_allocator),
chunk_size_(align(chunk_size, alignment)),
alignment_(alignment) {}
// Allocates a block of at least `size` bytes (rounded up to a multiple of
// alignment_).
//
// Strategy:
//   1. Look for the smallest free block that can hold the request. If one is
//      found it is used, and split when it is larger than needed.
//   2. Otherwise a new chunk is requested from the underlying allocator. The
//      chunk is chunk_size_ bytes when that is larger than the request and the
//      underlying allocator has not previously failed on a chunk-sized
//      request; otherwise it is exactly the request size.
//
// Returns nullptr when `size` is 0; otherwise a heap-allocated
// Chunk::BlockAllocation, which FreeImpl deletes.
// Throws whatever the underlying allocator throws when it cannot serve a
// request of exactly the (aligned) requested size.
Allocation *AutoIncrementBestFitAllocator::AllocateImpl(size_t size,
                                                        Attr attr) {
  if (size == 0) return nullptr;

  size = align(size, alignment_);

  std::lock_guard<std::mutex> guard(mtx_);
  // free_blocks_ is ordered by (size, ptr), so lower_bound on (size, nullptr)
  // yields the smallest free block whose size is >= the request.
  auto iter = free_blocks_.lower_bound(std::make_pair(size, nullptr));
  BlockIt block_it;
  if (iter != free_blocks_.end()) {
    VLOG(2) << "Found " << iter->second->size_ << " for " << size;
    block_it = iter->second;
    free_blocks_.erase(iter);
    auto *chunk = block_it->chunk_;
    size_t remaining_size = block_it->size_ - size;
    if (remaining_size == 0) {
      // Exact fit: hand out the whole block.
      block_it->is_free_ = false;
      VLOG(2) << "Found and no remaining";
    } else {
      // Split: the low-address part stays free (inserted before block_it),
      // and the high-address part becomes the allocated block.
      auto remaining_free_block = chunk->blocks_.insert(
          block_it, Chunk::Block(block_it->ptr_, remaining_size, true, chunk));
      free_blocks_.emplace(std::make_pair(remaining_size, block_it->ptr_),
                           remaining_free_block);
      block_it->ptr_ =
          reinterpret_cast<uint8_t *>(block_it->ptr_) + remaining_size;
      block_it->size_ = size;
      block_it->is_free_ = false;
      VLOG(2) << "Found and remaining " << remaining_size;
    }
  } else {
    size_t alloc_size = size;
    if (!underlying_allocator_exhaustive_ && chunk_size_ > size) {
      alloc_size = chunk_size_;
    }

    try {
      chunks_.emplace_back(underlying_allocator_->Allocate(alloc_size, attr));
    } catch (const BadAlloc &) {
      // Nothing smaller to fall back to: propagate the original exception.
      // A bare `throw;` rethrows the in-flight object without copying or
      // slicing it.
      if (size == alloc_size) throw;
      // A chunk-sized request failed; remember that and retry with exactly
      // the requested size. From now on new chunks are request-sized.
      underlying_allocator_exhaustive_ = true;
      alloc_size = size;
      chunks_.emplace_back(underlying_allocator_->Allocate(alloc_size, attr));
    }
    auto *chunk = &(*chunks_.rbegin());
    uint8_t *p = reinterpret_cast<uint8_t *>(chunk->allocation_->ptr());
    auto &blocks = chunk->blocks_;

    // Same layout as the split case above: the free remainder first, then the
    // allocated block at the end of the chunk.
    size_t remaining_size = alloc_size - size;
    if (remaining_size > 0) {
      blocks.emplace_back(p, remaining_size, true, chunk);
      free_blocks_.emplace(std::make_pair(remaining_size, p), --(blocks.end()));
    }
    blocks.emplace_back(p + remaining_size, size, false, chunk);
    block_it = --(blocks.end());
    VLOG(2) << "Not found and allocate " << alloc_size << ", and remaining "
            << remaining_size;
  }
  VLOG(2) << "After allocate, free blocks " << free_blocks_.size();
  return new Chunk::BlockAllocation(block_it);
}
// Returns the block behind `allocation` to the free list and deletes
// `allocation`.
//
// The block is marked free and merged with its previous and/or next neighbour
// in the same chunk when those are free, so free_blocks_ never holds two
// adjacent free blocks of one chunk. `allocation` must be a
// Chunk::BlockAllocation (it is static_cast without a check).
void AutoIncrementBestFitAllocator::FreeImpl(Allocation *allocation) {
  // Take the lock before touching any block or chunk state, so every access
  // to the shared block lists happens under mtx_.
  std::lock_guard<std::mutex> guard(mtx_);
  auto &block_it = static_cast<Chunk::BlockAllocation *>(allocation)->block_it_;
  auto &blocks = block_it->chunk_->blocks_;

  block_it->is_free_ = true;

  // Merge into the previous block if it is free. That block's free_blocks_
  // key contains its size, so it is erased here and re-inserted below.
  if (block_it != blocks.begin()) {
    auto prev_it = block_it;
    --prev_it;

    if (prev_it->is_free_) {
      free_blocks_.erase(std::make_pair(prev_it->size_, prev_it->ptr_));
      prev_it->size_ += block_it->size_;
      blocks.erase(block_it);
      block_it = prev_it;
    }
  }

  // Absorb the next block if it is free.
  auto next_it = block_it;
  ++next_it;

  if (next_it != blocks.end() && next_it->is_free_) {
    free_blocks_.erase(std::make_pair(next_it->size_, next_it->ptr_));
    block_it->size_ += next_it->size_;
    blocks.erase(next_it);
  }

  // Register the (possibly merged) block under its final size.
  free_blocks_.emplace(std::make_pair(block_it->size_, block_it->ptr_),
                       block_it);

  VLOG(2) << "Combine " << block_it->size_ << ", " << blocks.size() << ", "
          << free_blocks_.size();
  delete allocation;
}
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <list>
#include <map>
#include <memory>
#include <mutex> // NOLINT
#include <utility>
#include "paddle/fluid/memory/allocation/allocator.h"
namespace paddle {
namespace memory {
namespace allocation {
// Allocator that carves blocks out of larger chunks obtained from an
// underlying allocator. Free blocks are indexed in a map ordered by
// (size, address). The allocation and merge logic is in AllocateImpl and
// FreeImpl.
class AutoIncrementBestFitAllocator : public Allocator {
public:
// underlying_allocator: allocator that chunks are requested from.
// chunk_size: preferred chunk size; the implementation stores it rounded up
// to a multiple of `alignment`.
// alignment: granularity that request sizes are rounded up to.
explicit AutoIncrementBestFitAllocator(
const std::shared_ptr<Allocator> &underlying_allocator, size_t chunk_size,
size_t alignment);
// Always reports true; AllocateImpl and FreeImpl lock mtx_.
bool IsAllocThreadSafe() const override { return true; }
// NOTE(review): these two aliases are not used anywhere in the visible
// code — confirm whether external code depends on them.
using AllocationList = std::list<AllocationPtr>;
using AllocationListIt = AllocationList::iterator;
// One allocation obtained from the underlying allocator, together with the
// list of blocks it is currently divided into.
struct Chunk {
// A contiguous sub-range of a chunk, either free or handed out.
struct Block {
Block(void *ptr, size_t size, bool is_free, Chunk *chunk)
: ptr_(ptr), size_(size), is_free_(is_free), chunk_(chunk) {}
// Start address of the block.
void *ptr_;
// Size of the block in bytes.
size_t size_;
// True while the block is available for allocation.
bool is_free_;
Chunk *chunk_; // which chunk it is from
};
// Takes ownership of the allocation backing this chunk.
explicit Chunk(AllocationPtr allocation)
: allocation_(std::move(allocation)) {}
// Memory obtained from the underlying allocator.
AllocationPtr allocation_;
// Blocks of this chunk; the implementation keeps them in address order.
std::list<Block> blocks_;
// NOTE(review): a per-chunk mutex is left commented out below; the visible
// implementation locks the allocator-level mtx_ instead.
// std::mutex mtx_;
// Allocation handle returned to callers. It remembers the block it refers
// to so FreeImpl can locate it and its neighbours.
struct BlockAllocation : public Allocation {
explicit BlockAllocation(const std::list<Block>::iterator &it)
: Allocation(it->ptr_, it->size_, it->chunk_->allocation_->place()),
block_it_(it) {}
std::list<Block>::iterator block_it_;
};
};
protected:
Allocation *AllocateImpl(size_t size, Attr attr) override;
void FreeImpl(Allocation *allocation) override;
private:
using BlockIt = std::list<Chunk::Block>::iterator;
// Source of new chunks.
std::shared_ptr<Allocator> underlying_allocator_;
// Every chunk obtained so far. std::list keeps element addresses stable,
// and Block::chunk_ stores pointers to these elements.
std::list<Chunk> chunks_;
// Free blocks keyed by (size, start address), so lower_bound finds the
// smallest block that is large enough.
std::map<std::pair<size_t, void *>, BlockIt> free_blocks_;
// Preferred chunk size, already rounded up to a multiple of alignment_.
size_t chunk_size_;
// Granularity request sizes are rounded up to.
size_t alignment_;
// Set once a chunk-sized request to the underlying allocator has failed;
// afterwards new chunks are requested at exactly the needed size.
bool underlying_allocator_exhaustive_{false};
// Guards chunks_, free_blocks_ and the per-chunk block lists.
mutable std::mutex mtx_;
};
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <condition_variable> // NOLINT
#include <mutex> // NOLINT
#include <thread> // NOLINT
#include <vector>
#include <iostream>
#include "paddle/fluid/memory/allocation/auto_increment_best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/cpu_allocator.h"
namespace paddle {
namespace memory {
namespace allocation {
// Smoke test: several threads allocate from one shared
// AutoIncrementBestFitAllocator at the same time. All workers block on a
// condition variable until released together, to maximise contention. The
// test passes if nothing crashes or throws.
TEST(allocator, auto_increment_best_fit_allocator) {
  auto cpu_allocator = std::make_shared<CPUAllocator>();

  auto allocator =
      std::make_shared<AutoIncrementBestFitAllocator>(cpu_allocator, 0, 4096);

  std::mutex mtx;
  std::condition_variable cv;
  bool flag = false;
  constexpr size_t kThreadNum = 10;

  auto thread_main = [&] {
    {
      // Wait for the start signal so all threads begin allocating together.
      std::unique_lock<std::mutex> lock(mtx);
      cv.wait(lock, [&] { return flag; });
    }
    for (size_t i = 10; i > 0; --i) {
      // The returned handle is discarded at the end of the statement.
      // NOTE(review): presumably that releases the block straight back to the
      // allocator, so each iteration is an allocate/free pair — confirm.
      allocator->Allocate((i + 1) * 1000);
    }
  };

  // The loop previously started at 10 with the condition `i < 10`, so it
  // never ran and no worker thread was ever created.
  std::vector<std::thread> ths;
  for (size_t i = 0; i < kThreadNum; ++i) {
    ths.emplace_back(thread_main);
  }

  {
    std::lock_guard<std::mutex> lock(mtx);
    flag = true;
  }
  cv.notify_all();

  // The main thread takes part as well.
  thread_main();

  for (auto &th : ths) {
    th.join();
  }

  std::cout << "test ends" << std::endl;
}
} // namespace allocation
} // namespace memory
} // namespace paddle
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册