Commit 953214ad authored by sneaxiy

add more unittests

modify allocator strategy
remove changes to the legacy buddy_allocator
test=develop
Parent: fd23262e
@@ -202,6 +202,8 @@ cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
 cc_test(tuple_test SRCS tuple_test.cc )
+cc_test(inlined_vector_test SRCS inlined_vector_test.cc)
 if (NOT WIN32)
 cc_test(rw_lock_test SRCS rw_lock_test.cc)
 endif (NOT WIN32)
......
@@ -14,18 +14,18 @@
 #pragma once
-#include <deque>
+#include <vector>
 #include "paddle/fluid/platform/enforce.h"

 namespace paddle {
 namespace framework {

 template <typename T, size_t N>
-class InlinedStack {
+class InlinedVector {
   static_assert(N > 0, "N must be larger than 0");

  public:
-  inline void push(const T& item) {
+  inline void push_back(const T& item) {
     if (size_ < N) {
       head_[size_] = item;
     } else {
@@ -34,21 +34,21 @@ class InlinedStack {
     ++size_;
   }

-  inline void pop() {
-    PADDLE_ENFORCE(!empty(), "Try to pop element from empty stack.");
+  inline void pop_back() {
+    PADDLE_ENFORCE(!empty(), "Try to pop back element from empty vector.");
     if (size_ > N) {
       tail_.pop_back();
     }
     --size_;
   }

-  inline const T& top() const {
-    PADDLE_ENFORCE(!empty(), "Try to get top element of empty stack.");
+  inline const T& back() const {
+    PADDLE_ENFORCE(!empty(), "Try to get back element of empty vector.");
     return size_ <= N ? head_[size_ - 1] : tail_.back();
   }

-  inline T& top() {
-    PADDLE_ENFORCE(!empty(), "Try to get top element of empty stack.");
+  inline T& back() {
+    PADDLE_ENFORCE(!empty(), "Try to get back element of empty vector.");
     return size_ <= N ? head_[size_ - 1] : tail_.back();
   }
@@ -63,10 +63,19 @@ class InlinedStack {
     return i < N ? head_[i] : tail_[i - N];
   }

+  operator std::vector<T>() const {
+    std::vector<T> ret;
+    ret.reserve(size_);
+    for (size_t i = 0; i < size_; ++i) {
+      ret.emplace_back((*this)[i]);
+    }
+    return ret;
+  }
+
  private:
   T head_[N];
   size_t size_{0};
-  std::deque<T> tail_;
+  std::vector<T> tail_;
 };

 } // namespace framework
......
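Note (not part of the diff): a minimal usage sketch of the renamed InlinedVector, assuming the header above is available as paddle/fluid/framework/inlined_vector.h. The first N elements live in the fixed-size head_ array and anything past N spills into the heap-backed tail_, which is what the indexing expression i < N ? head_[i] : tail_[i - N] encodes.

#include <vector>
#include "paddle/fluid/framework/inlined_vector.h"

int main() {
  paddle::framework::InlinedVector<int, 4> v;
  for (int i = 0; i < 6; ++i) {
    v.push_back(i);  // 0..3 stay in head_, 4 and 5 spill into tail_
  }
  std::vector<int> copied = v;  // uses the new operator std::vector<T>()
  return (v.back() == 5 && copied.size() == 6) ? 0 : 1;
}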
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/inlined_vector.h"
#include <vector>
#include "gtest/gtest.h"
namespace paddle {
namespace framework {
TEST(inlined_stack, inlined_stack) {
size_t max_num = 10;
InlinedVector<size_t, 5> stack;
for (size_t i = 0; i < max_num; ++i) {
ASSERT_EQ(stack.size(), i);
stack.push_back(i);
ASSERT_EQ(stack.size(), i + 1);
}
std::vector<size_t> vec = stack;
ASSERT_EQ(stack.size(), vec.size());
for (size_t i = 0; i < vec.size(); ++i) {
ASSERT_EQ(stack[i], vec[i]);
}
for (size_t i = 0; i < max_num; ++i) {
ASSERT_EQ(stack[i], i);
}
for (size_t i = 0; i < max_num; ++i) {
ASSERT_EQ(stack.back(), max_num - 1 - i);
stack.pop_back();
ASSERT_EQ(stack.size(), max_num - 1 - i);
}
}
} // namespace framework
} // namespace paddle
@@ -3,13 +3,18 @@ cc_library(cpu_allocator SRCS cpu_allocator.cc DEPS allocator)
 cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator)
 cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator)
 cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator)
-cc_library(multi_bin_buffered_allocator SRCS multi_bin_buffered_allocator.cc DEPS allocator)
+cc_library(multi_bin_buffered_allocator SRCS multi_bin_buffered_allocator.cc DEPS allocator gflags)
 cc_library(legacy_allocator SRCS legacy_allocator.cc DEPS allocator buddy_allocator profiler)
+cc_library(zero_size_allocator SRCS zero_size_allocator.cc DEPS allocator)
 cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator buffered_allocator cpu_allocator)
 cc_test(multi_bin_buffered_allocator_test SRCS multi_bin_buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator multi_bin_buffered_allocator cpu_allocator)
-cc_library(auto_increment_best_fit_allocator SRCS auto_increment_best_fit_allocator.cc DEPS allocator)
-cc_test(auto_increment_best_fit_allocator_test SRCS auto_increment_best_fit_allocator_test.cc DEPS cpu_allocator auto_increment_best_fit_allocator)
+cc_library(auto_growth_best_fit_allocator SRCS auto_growth_best_fit_allocator.cc DEPS allocator)
+cc_test(auto_growth_best_fit_allocator_test SRCS auto_growth_best_fit_allocator_test.cc DEPS cpu_allocator auto_growth_best_fit_allocator)
+if (NOT WIN32)
+  cc_test(test_multi_bin_buffered_allocator_division_plan SRCS test_multi_bin_buffered_allocator_division_plan.cc DEPS multi_bin_buffered_allocator)
+endif()
 if (WITH_GPU)
   nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
@@ -42,30 +47,20 @@ else ()
   set(AllocatorFacadeDeps)
 endif()

+list(APPEND AllocatorFacadeDeps cpu_allocator locked_allocator best_fit_allocator aligned_allocator auto_increment_allocator conditional_allocator retry_allocator buffered_allocator multi_bin_buffered_allocator auto_growth_best_fit_allocator legacy_allocator zero_size_allocator)
+
 cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator)
 cc_library(auto_increment_allocator SRCS auto_increment_allocator.cc DEPS allocator)
-cc_library(zero_size_allocator SRCS zero_size_allocator.cc DEPS allocator)
 cc_library(conditional_allocator SRCS conditional_allocator.cc DEPS allocator)
-cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags)
-cc_library(allocator_facade SRCS allocator_facade.cc DEPS
-  ${AllocatorFacadeDeps}
-  cpu_allocator
-  locked_allocator
-  best_fit_allocator
-  aligned_allocator
-  auto_increment_allocator
-  zero_size_allocator
-  conditional_allocator
-  retry_allocator
-  buffered_allocator
-  multi_bin_buffered_allocator
-  auto_increment_best_fit_allocator
-  allocator_strategy
-  legacy_allocator
-  )
+cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags ${AllocatorFacadeDeps})
+cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy)

 nv_test(allocation_and_eigen_test SRCS allocation_and_eigen_test.cu DEPS allocator_facade)
 cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator best_fit_allocator locked_allocator cpu_allocator)
 cc_test(allocator_facade_test SRCS allocator_facade_test.cc DEPS allocator_facade)
+cc_test(naive_best_fit_allocator_facade_test SRCS naive_best_fit_allocator_facade_test.cc DEPS allocator_facade)
+cc_test(auto_growth_best_fit_allocator_facade_test SRCS auto_growth_best_fit_allocator_facade_test.cc DEPS allocator_facade)
@@ -14,6 +14,7 @@
 #pragma once
 #include <memory>
+#include <utility>
 #include "paddle/fluid/memory/allocation/allocator.h"
 namespace paddle {
......
@@ -27,24 +27,24 @@ bool Allocator::IsAllocThreadSafe() const { return false; }
 AllocationPtr Allocator::Allocate(size_t size, Allocator::Attr attr) {
   auto ptr = AllocateImpl(size, attr);
-  ptr->RegisterAllocatorChain(this);
+  ptr->RegisterDecoratedAllocator(this);
   return AllocationPtr(ptr);
 }

 void Allocator::FreeImpl(Allocation* allocation) {
-  Allocator* allocator = allocation->TopAllocator();
+  Allocator* allocator = allocation->TopDecoratedAllocator();
   allocator->Free(allocation);
 }

 void Allocator::Free(Allocation* allocation) {
-  allocation->PopAllocator();
+  allocation->PopDecoratedAllocator();
   FreeImpl(allocation);
 }

 const char* BadAlloc::what() const noexcept { return msg_.c_str(); }

 void AllocationDeleter::operator()(Allocation* allocation) const {
-  Allocator* allocator = allocation->TopAllocator();
+  Allocator* allocator = allocation->TopDecoratedAllocator();
   allocator->Free(allocation);
 }
......
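Note (not part of the diff): the renamed calls above implement a small decorator protocol. Every allocator that takes part in producing an allocation registers itself on that allocation inside Allocator::Allocate, so the chain is ordered from the innermost (concrete) allocator to the outermost wrapper. Allocator::Free pops the outermost entry and then runs FreeImpl, whose default implementation forwards to whichever allocator is now on top, so a free request unwinds the wrappers in reverse order. Below is a standalone toy sketch of that bookkeeping, with hypothetical ToyAllocator/ToyAllocation names and a trivial "concrete" allocator at the bottom; it is an illustration of the mechanism, not Paddle code.

#include <cassert>
#include <vector>

struct ToyAllocator;

// Toy stand-in for Allocation: it only tracks which allocators decorated it.
struct ToyAllocation {
  std::vector<ToyAllocator*> decorated_allocators;
};

struct ToyAllocator {
  explicit ToyAllocator(ToyAllocator* underlying = nullptr)
      : underlying_(underlying) {}
  virtual ~ToyAllocator() = default;

  // Mirrors Allocator::Allocate: produce (or delegate) the allocation, then
  // register this allocator on it (RegisterDecoratedAllocator in the diff).
  ToyAllocation* Allocate() {
    ToyAllocation* a =
        underlying_ ? underlying_->Allocate() : new ToyAllocation;
    a->decorated_allocators.push_back(this);
    return a;
  }

  // Mirrors Allocator::Free: pop the outermost decorator, then run FreeImpl.
  void Free(ToyAllocation* a) {
    a->decorated_allocators.pop_back();
    FreeImpl(a);
  }

  // Default FreeImpl: hand the allocation to the allocator that is now on top
  // of the chain; once the chain is empty, actually release the memory.
  virtual void FreeImpl(ToyAllocation* a) {
    if (!a->decorated_allocators.empty()) {
      a->decorated_allocators.back()->Free(a);
    } else {
      delete a;
    }
  }

  ToyAllocator* underlying_;
};

int main() {
  ToyAllocator concrete;            // bottom of the chain
  ToyAllocator wrapper(&concrete);  // decorates the concrete allocator
  ToyAllocation* a = wrapper.Allocate();
  assert(a->decorated_allocators.size() == 2);  // concrete first, wrapper last
  wrapper.Free(a);  // pops wrapper, then concrete, then releases
  return 0;
}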
@@ -15,8 +15,9 @@
 #pragma once
 #include <memory>
 #include <string>
+#include <utility>
 #include <vector>
-#include "paddle/fluid/framework/inlined_stack.h"
+#include "paddle/fluid/framework/inlined_vector.h"
 #include "paddle/fluid/platform/place.h"
 namespace paddle {
@@ -78,29 +79,26 @@ class Allocation {
   virtual ~Allocation();

-  // This function should only be used in unittest
-  std::vector<Allocator*> GetAllocatorChain() const {
-    std::vector<Allocator*> allocators;
-    for (size_t i = 0; i < allocator_chain_.size(); ++i) {
-      allocators.push_back(allocator_chain_[i]);
-    }
-    return allocators;
-  }
+ private:
+  std::vector<Allocator*> DecoratedAllocators() const {
+    return static_cast<std::vector<Allocator*>>(decorated_allocators_);
+  }

- private:
-  inline void RegisterAllocatorChain(Allocator* allocator) {
-    allocator_chain_.push(allocator);
-  }
+  inline void RegisterDecoratedAllocator(Allocator* allocator) {
+    decorated_allocators_.push_back(allocator);
+  }

-  inline void PopAllocator() { allocator_chain_.pop(); }
+  inline void PopDecoratedAllocator() { decorated_allocators_.pop_back(); }

-  inline Allocator* TopAllocator() { return allocator_chain_.top(); }
+  inline Allocator* TopDecoratedAllocator() {
+    return decorated_allocators_.back();
+  }

  private:
   void* ptr_;
   size_t size_;
   platform::Place place_;
-  framework::InlinedStack<Allocator*, 8> allocator_chain_;
+  framework::InlinedVector<Allocator*, 8> decorated_allocators_;
   friend class Allocator;
   friend class AllocationDeleter;
......
@@ -17,12 +17,13 @@
 #include <map>
 #include <string>
 #include <unordered_map>
+#include <utility>
 #include <vector>
 #include "paddle/fluid/memory/allocation/aligned_allocator.h"
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/fluid/memory/allocation/allocator_strategy.h"
+#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h"
 #include "paddle/fluid/memory/allocation/auto_increment_allocator.h"
-#include "paddle/fluid/memory/allocation/auto_increment_best_fit_allocator.h"
 #include "paddle/fluid/memory/allocation/best_fit_allocator.h"
 #include "paddle/fluid/memory/allocation/conditional_allocator.h"
 #include "paddle/fluid/memory/allocation/cpu_allocator.h"
@@ -32,6 +33,7 @@
 #include "paddle/fluid/memory/allocation/retry_allocator.h"
 #include "paddle/fluid/memory/allocation/zero_size_allocator.h"
 #include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/memory/allocation/cuda_allocator.h"
@@ -51,6 +53,21 @@ namespace paddle {
 namespace memory {
 namespace allocation {

+static inline std::shared_ptr<Allocator> WrapRetryAndBufferedAllocator(
+    std::shared_ptr<Allocator> allocator, int64_t retry_time,
+    bool enable_buffered) {
+  if (retry_time > 0) {
+    auto* retry_allocator =
+        new RetryAllocator(std::move(allocator), retry_time);
+    allocator.reset(retry_allocator);
+  }
+
+  if (enable_buffered) {
+    allocator.reset(new MultiBinBufferedAllocator(allocator));
+  }
+  return allocator;
+}
+
 // TODO(yy): Dirty code here. This class should be configurable in runtime.
 class CPUManagedAllocator : public Allocator {
  public:
@@ -117,17 +134,10 @@ class ChunkedAllocator : public Allocator {
     std::shared_ptr<Allocator> allocator(new LockedAllocator(
         std::shared_ptr<Allocator>(new BestFitAllocator(allocation))));

-    if (retry_time_ > 0) {
-      auto* retry_allocator =
-          new RetryAllocator(std::move(allocator), retry_time_);
-      allocator.reset(retry_allocator);
-    }
-
-    if (FLAGS_enable_buffered_allocator) {
-      allocator.reset(new MultiBinBufferedAllocator(allocator));
-    }
-
-    return std::make_shared<AlignedAllocator<64u>>(std::move(allocator));
+    allocator = WrapRetryAndBufferedAllocator(allocator, retry_time_,
+                                              FLAGS_enable_buffered_allocator);
+
+    return std::make_shared<AlignedAllocator<4096>>(std::move(allocator));
   }

   bool IsAllocThreadSafe() const override { return true; }
@@ -210,7 +220,7 @@ class AllocatorFacadePrivate {
         break;
       }
       case AllocatorStrategy::kAutoGrowthBestFit: {
-        InitCPUAllocator();
+        InitAutoGrowthCPUAllocator();
         InitAutoGrowthCUDAAllocator();
         InitAutoGrowthCUDAPinnedAllocator();
         WrapZeroSizeAllocator();
@@ -224,15 +234,25 @@
   }

  private:
+  void InitAutoGrowthCPUAllocator() {
+    auto cpu_allocator = std::make_shared<AlignedAllocator<4096>>(
+        std::make_shared<CPUAllocator>());
+    allocators_[platform::CPUPlace()] =
+        std::make_shared<AutoGrowthBestFitAllocator>(
+            cpu_allocator, platform::CpuMaxChunkSize(), 4096);
+  }
+
   void InitAutoGrowthCUDAAllocator() {
 #ifdef PADDLE_WITH_CUDA
     int dev_cnt = platform::GetCUDADeviceCount();
     for (int dev_id = 0; dev_id < dev_cnt; ++dev_id) {
       auto cuda_allocator = std::make_shared<AlignedAllocator<4096>>(
           std::make_shared<CUDAAllocator>(platform::CUDAPlace(dev_id)));
-      allocators_[platform::CUDAPlace(dev_id)] =
-          std::make_shared<AutoIncrementBestFitAllocator>(
-              cuda_allocator, platform::GpuMaxChunkSize(), 4096);
+      auto allocator = std::make_shared<AutoGrowthBestFitAllocator>(
+          cuda_allocator, platform::GpuMaxChunkSize(), 4096);
+
+      allocators_[platform::CUDAPlace(dev_id)] = WrapRetryAndBufferedAllocator(
+          allocator, FLAGS_gpu_allocator_retry_time, false);
     }
 #endif
   }
@@ -242,7 +262,7 @@ class AllocatorFacadePrivate {
     auto cuda_pinned_allocator = std::make_shared<AlignedAllocator<4096>>(
         std::make_shared<CPUPinnedAllocator>());
     allocators_[platform::CUDAPinnedPlace()] =
-        std::make_shared<AutoIncrementBestFitAllocator>(
+        std::make_shared<AutoGrowthBestFitAllocator>(
             cuda_pinned_allocator, platform::CUDAPinnedMaxChunkSize(), 4096);
 #endif
   }
@@ -300,8 +320,7 @@ AllocatorFacade& AllocatorFacade::Instance() {
 std::shared_ptr<Allocation> AllocatorFacade::AllocShared(
     const platform::Place& place, size_t size, Allocator::Attr attr) {
-  return std::shared_ptr<Allocation>(Alloc(place, size, attr).release(),
-                                     AllocationDeleter());
+  return std::shared_ptr<Allocation>(Alloc(place, size, attr));
 }

 AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size,
......
@@ -19,7 +19,9 @@
 DEFINE_string(
     allocator_strategy, "legacy",
     "The allocation strategy. Legacy means the original allocator of Fluid."
-    "New means the experimental allocators of Fluid. in [legacy, new]");
+    "naive_best_fit means the experimental best fit allocator. "
+    "auto_growth_best_fit means the experimental auto growth best fit "
+    "allocator. Enum in [legacy, naive_best_fit, auto_growth_best_fit].");

 namespace paddle {
 namespace memory {
@@ -28,7 +30,7 @@ namespace allocation {
 static AllocatorStrategy GetStrategyFromFlag() {
   if (FLAGS_allocator_strategy == "legacy") {
     return AllocatorStrategy::kLegacy;
-  } else if (FLAGS_allocator_strategy == "navie_best_fit") {
+  } else if (FLAGS_allocator_strategy == "naive_best_fit") {
     return AllocatorStrategy::kNaiveBestFit;
   } else if (FLAGS_allocator_strategy == "auto_growth_best_fit") {
     return AllocatorStrategy::kAutoGrowthBestFit;
......
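Note (not part of the diff): the two new facade tests in this commit set FLAGS_allocator_strategy before touching AllocatorFacade::Instance(), which is the safe order since the facade reads the strategy when it is constructed. A minimal sketch of selecting the new strategy the same way:

#include <gflags/gflags.h>
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/place.h"

DECLARE_string(allocator_strategy);

int main() {
  // Valid values after this change: "legacy", "naive_best_fit",
  // "auto_growth_best_fit". Set it before the facade singleton is built.
  FLAGS_allocator_strategy = "auto_growth_best_fit";
  auto &instance = paddle::memory::allocation::AllocatorFacade::Instance();
  auto cpu_allocation = instance.Alloc(paddle::platform::CPUPlace(), 1024);
  return cpu_allocation->ptr() != nullptr ? 0 : 1;
}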
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/fluid/memory/allocation/auto_increment_best_fit_allocator.h"
+#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h"
 #include <algorithm>
 #include <list>
 #include <map>
@@ -29,16 +29,14 @@ static size_t align(size_t size, size_t alignment) {
   return remaining == 0 ? size : size + alignment - remaining;
 }

-AutoIncrementBestFitAllocator::AutoIncrementBestFitAllocator(
+AutoGrowthBestFitAllocator::AutoGrowthBestFitAllocator(
     const std::shared_ptr<Allocator> &underlying_allocator, size_t chunk_size,
     size_t alignment)
     : underlying_allocator_(underlying_allocator),
       chunk_size_(align(chunk_size, alignment)),
       alignment_(alignment) {}

-Allocation *AutoIncrementBestFitAllocator::AllocateImpl(size_t size,
-                                                        Attr attr) {
-  if (size == 0) return nullptr;
+Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t size, Attr attr) {
   size = align(size, alignment_);
   std::lock_guard<std::mutex> guard(mtx_);
   auto iter = free_blocks_.lower_bound(std::make_pair(size, nullptr));
@@ -95,7 +93,7 @@ Allocation *AutoIncrementBestFitAllocator::AllocateImpl(size_t size,
   return new Chunk::BlockAllocation(block_it);
 }

-void AutoIncrementBestFitAllocator::FreeImpl(Allocation *allocation) {
+void AutoGrowthBestFitAllocator::FreeImpl(Allocation *allocation) {
   auto &block_it = static_cast<Chunk::BlockAllocation *>(allocation)->block_it_;
   auto &blocks = block_it->chunk_->blocks_;
......
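Note (not part of the diff): the align() helper shown above (and the AlignTo() helper in the new facade test) rounds a size up to the next multiple of the alignment, and the auto-growth path uses a 4096-byte alignment throughout. A couple of worked values, restated here under the assumption that remaining is size % alignment:

#include <cassert>
#include <cstddef>

// Restatement of the rounding rule, for illustration only.
static size_t align(size_t size, size_t alignment) {
  size_t remaining = size % alignment;
  return remaining == 0 ? size : size + alignment - remaining;
}

int main() {
  assert(align(1, 4096) == 4096);     // a 1-byte request still occupies 4 KB
  assert(align(4096, 4096) == 4096);  // already aligned, unchanged
  assert(align(5000, 4096) == 8192);  // rounds up to the next 4 KB boundary
  return 0;
}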
@@ -25,9 +25,9 @@ namespace paddle {
 namespace memory {
 namespace allocation {

-class AutoIncrementBestFitAllocator : public Allocator {
+class AutoGrowthBestFitAllocator : public Allocator {
  public:
-  explicit AutoIncrementBestFitAllocator(
+  explicit AutoGrowthBestFitAllocator(
       const std::shared_ptr<Allocator> &underlying_allocator, size_t chunk_size,
       size_t alignment);
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#ifdef PADDLE_WITH_CUDA
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
DECLARE_int64(gpu_allocator_retry_time);
#endif
DECLARE_string(allocator_strategy);
namespace paddle {
namespace memory {
namespace allocation {
static inline size_t AlignTo(size_t size, size_t alignment = 4096) {
auto remaining = size % alignment;
return remaining == 0 ? size : size + alignment - remaining;
}
TEST(allocator, allocator) {
#ifdef PADDLE_WITH_CUDA
FLAGS_fraction_of_gpu_memory_to_use = 0.01;
FLAGS_gpu_allocator_retry_time = 500;
FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;
#endif
FLAGS_allocator_strategy = "auto_growth_best_fit";
auto &instance = AllocatorFacade::Instance();
platform::Place place;
size_t size = 1024;
{
place = platform::CPUPlace();
size = 1024;
auto cpu_allocation = instance.Alloc(place, size);
ASSERT_NE(cpu_allocation, nullptr);
ASSERT_NE(cpu_allocation->ptr(), nullptr);
ASSERT_EQ(cpu_allocation->place(), place);
ASSERT_EQ(cpu_allocation->size(), AlignTo(size));
}
#ifdef PADDLE_WITH_CUDA
{
place = platform::CUDAPlace(0);
size = 1024;
auto gpu_allocation = instance.Alloc(place, size);
ASSERT_NE(gpu_allocation, nullptr);
ASSERT_NE(gpu_allocation->ptr(), nullptr);
ASSERT_EQ(gpu_allocation->place(), place);
ASSERT_GE(gpu_allocation->size(), AlignTo(size));
}
{
// Allocate 2GB gpu memory
place = platform::CUDAPlace(0);
size = 2 * static_cast<size_t>(1 << 30);
auto gpu_allocation = instance.Alloc(place, size);
ASSERT_NE(gpu_allocation, nullptr);
ASSERT_NE(gpu_allocation->ptr(), nullptr);
ASSERT_EQ(gpu_allocation->place(), place);
ASSERT_GE(gpu_allocation->size(), AlignTo(size));
}
{
place = platform::CUDAPinnedPlace();
size = (1 << 20);
auto cuda_pinned_allocation =
instance.Alloc(platform::CUDAPinnedPlace(), 1 << 20);
ASSERT_NE(cuda_pinned_allocation, nullptr);
ASSERT_NE(cuda_pinned_allocation->ptr(), nullptr);
ASSERT_EQ(cuda_pinned_allocation->place(), place);
ASSERT_GE(cuda_pinned_allocation->size(), AlignTo(size));
}
#endif
}
} // namespace allocation
} // namespace memory
} // namespace paddle
@@ -22,18 +22,18 @@
 #include <iostream>

-#include "paddle/fluid/memory/allocation/auto_increment_best_fit_allocator.h"
+#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h"
 #include "paddle/fluid/memory/allocation/cpu_allocator.h"

 namespace paddle {
 namespace memory {
 namespace allocation {

-TEST(allocator, auto_increment_best_fit_allocator) {
+TEST(allocator, auto_growth_best_fit_allocator) {
   auto cpu_allocator = std::make_shared<CPUAllocator>();
   auto allocator =
-      std::make_shared<AutoIncrementBestFitAllocator>(cpu_allocator, 0, 4096);
+      std::make_shared<AutoGrowthBestFitAllocator>(cpu_allocator, 0, 4096);

   std::mutex mtx;
   std::condition_variable cv;
@@ -60,13 +60,9 @@ TEST(allocator, auto_increment_best_fit_allocator) {
   }

   cv.notify_all();

-  thread_main();
-
   for (auto &th : ths) {
     th.join();
   }
-
-  std::cout << "test ends" << std::endl;
 }

 } // namespace allocation
......
@@ -14,6 +14,7 @@
 #include "paddle/fluid/memory/allocation/buffered_allocator.h"
 #include <gtest/gtest.h>
+#include <utility>
 #include "paddle/fluid/memory/allocation/best_fit_allocator.h"
 #include "paddle/fluid/memory/allocation/cpu_allocator.h"
 #include "paddle/fluid/memory/allocation/locked_allocator.h"
......
@@ -37,8 +37,6 @@ DEFINE_bool(init_allocated_mem, false,
             "that initializing the allocated memory with a small value "
             "during unit testing.");
 DECLARE_double(fraction_of_gpu_memory_to_use);
-DECLARE_double(initial_gpu_memory_in_mb);
-DECLARE_double(reallocate_gpu_memory_in_mb);
 DECLARE_bool(benchmark);

 namespace paddle {
@@ -72,8 +70,7 @@ BuddyAllocator *GetCPUBuddyAllocator() {
   std::call_once(init_flag, []() {
     a = new detail::BuddyAllocator(
         std::unique_ptr<detail::SystemAllocator>(new detail::CPUAllocator),
-        platform::CpuMinChunkSize(), platform::CpuMaxChunkSize(),
-        platform::CpuMaxChunkSize());
+        platform::CpuMinChunkSize(), platform::CpuMaxChunkSize());
   });

   return a;
@@ -147,28 +144,16 @@ class GPUBuddyAllocatorList {
     PADDLE_ENFORCE(dev_id < flags_.size(), "Invalid device id %s", dev_id);
     std::call_once(flags_[dev_id], [this, dev_id] {
       platform::SetDeviceId(dev_id);
-      size_t first_size = platform::GpuFirstAllocateChunkSize();
-      size_t re_size = platform::GpuReAllocateChunkSize();
-      allocators_[dev_id] =
-          new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
-                                 new detail::GPUAllocator(dev_id)),
-                             platform::GpuMinChunkSize(), first_size, re_size);
-      VLOG(2) << "\n\nNOTE: each GPU device use "
-              << string::HumanReadableSize(first_size) << "(initial chunk) "
-              << string::HumanReadableSize(re_size) << "(reallocate chunk) "
-              << "% of GPU memory.\n"
-              << "You can set GFlags environment variable '"
-              << "FLAGS_fraction_of_gpu_memory_to_use"
-              << "' or "
-                 "'FLAGS_initial_gpu_memory_in_mb/"
-                 "FLAGS_reallocate_gpu_memory_in_mb' to change the fraction "
-                 "of GPU usage.\n\n";
-      VLOG(2) << "Currently, FLAGS_fraction_of_gpu_memory_to_use="
-              << FLAGS_fraction_of_gpu_memory_to_use << ", "
-              << "FLAGS_initial_gpu_memory_in_mb="
-              << FLAGS_initial_gpu_memory_in_mb << ", "
-              << "FLAGS_reallocate_gpu_memory_in_mb="
-              << FLAGS_reallocate_gpu_memory_in_mb;
+      allocators_[dev_id] = new BuddyAllocator(
+          std::unique_ptr<detail::SystemAllocator>(
+              new detail::GPUAllocator(dev_id)),
+          platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
+      VLOG(10) << "\n\nNOTE: each GPU device use "
+               << FLAGS_fraction_of_gpu_memory_to_use * 100
+               << "% of GPU memory.\n"
+               << "You can set GFlags environment variable '"
+               << "FLAGS_fraction_of_gpu_memory_to_use"
+               << "' to change the fraction of GPU usage.\n\n";
     });
     return allocators_[dev_id];
   }
@@ -251,7 +236,6 @@ BuddyAllocator *GetCUDAPinnedBuddyAllocator() {
     ba = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
                                 new detail::CUDAPinnedAllocator),
                             platform::CUDAPinnedMinChunkSize(),
-                            platform::CUDAPinnedMaxChunkSize(),
                             platform::CUDAPinnedMaxChunkSize());
   });
......
@@ -14,8 +14,10 @@
 #include "paddle/fluid/memory/allocation/locked_allocator.h"
 #include <mutex> // NOLINT
+#include <utility>
 #include "paddle/fluid/memory/allocation/allocation_with_underlying.h"
 #include "paddle/fluid/platform/lock_guard_ptr.h"

 namespace paddle {
 namespace memory {
 namespace allocation {
......
@@ -17,20 +17,37 @@
 #include <cctype>
 #include <fstream>
 #include <limits>
+#include <mutex> // NOLINT
 #include <sstream>
 #include <string>
+#include <utility>
 #include "paddle/fluid/platform/lock_guard_ptr.h"

-DEFINE_double(buffered_allocator_excess_times, 2,
-              "Tolerant memory size times of buffered_allocator");
+DEFINE_double(
+    buffered_allocator_excess_times, 2,
+    "Excess memory size times of buffered_allocator. BufferedAllocator"
+    " would try to reuse memory freed previously, but the size of freed"
+    " allocation may not be exactly the same as the requested. Here, we"
+    " use a flag to control the excess times of reused memory size. "
+    "Not quite sure what is the best excess times value.");

-DEFINE_string(division_plan_path, "", "Division plan file path");
+DEFINE_string(
+    buffered_allocator_division_plan_path, "",
+    "The file path which "
+    "determines the memory size division plans of BufferedAllocator."
+    "If it is empty, use the default division plan. The file must be a "
+    "text file which each lines indicates the bound of division plan. "
+    "For example, if the text file has 3 lines, which are '500M', '1G', "
+    " '2G', the division plan would be [0, 500M), [500M, 1G), [1G, 2G) "
+    "and [2G, +inf). Allocation request whose requested memory size is "
+    "inside the last interval of division plan would be dispatched to "
+    " underlying_allocator directly without caching when freed.");

 namespace paddle {
 namespace memory {
 namespace allocation {

-std::string TrimStringAndToLowerCase(const std::string &str) {
+static std::string TrimStringAndToUpperCase(const std::string &str) {
   auto not_space = [](char ch) { return std::isspace(ch) == 0; };
   auto first_idx = static_cast<size_t>(
       std::find_if(str.begin(), str.end(), not_space) - str.begin());
@@ -38,41 +55,69 @@ std::string TrimStringAndToLowerCase(const std::string &str) {
       std::find_if(str.rbegin(), str.rend(), not_space) - str.rbegin());
   if (first_idx == str.size() || last_idx == str.size()) return "";
-  last_idx = str.size() - 1 - last_idx;
+  last_idx = str.size() - last_idx;
   auto ret = str.substr(first_idx, last_idx - first_idx);
   std::for_each(ret.begin(), ret.end(),
-                [](char &ch) { ch = std::tolower(ch); });
+                [](char &ch) { ch = std::toupper(ch); });
   return ret;
 }

-static size_t ParseStringToBytes(const std::string &str) {
-  std::string ret = str;
-  if (ret.back() == 'b') {
-    ret.pop_back();
-  }
-  PADDLE_ENFORCE(!ret.empty(), "Wrong format: %s", str);
+namespace {
+
+enum DivisionPlanFileStatus { kEOF, kException, kNormal };
+
+}  // NOLINT
+
+static size_t ParseStringToBytes(const std::string &original_str,
+                                 DivisionPlanFileStatus *ret_code) {
+  std::string str = TrimStringAndToUpperCase(original_str);
+  if (str.empty()) {
+    *ret_code = kEOF;
+    return 0;
+  }
+
+  if (str.back() == 'B') {
+    str.pop_back();
+    if (str.empty()) {
+      *ret_code = kException;
+      return 0;
+    }
+  }

   size_t multiples = 1;
-  switch (ret.back()) {
-    case 'g':
+  switch (str.back()) {
+    case 'G':
       multiples *= (static_cast<size_t>(1) << 30);
       break;
-    case 'm':
+    case 'M':
       multiples *= (static_cast<size_t>(1) << 20);
       break;
-    case 'k':
+    case 'K':
       multiples *= (static_cast<size_t>(1) << 10);
       break;
     default:
       break;
   }

-  if (multiples != 1) ret.pop_back();
-  ret = TrimStringAndToLowerCase(ret);
-  double ret_val = 0.0;
-  std::stringstream ss(ret);
-  PADDLE_ENFORCE((ss >> ret_val).good(), "Wrong format %s", str);
-  return static_cast<size_t>(ret_val * multiples);
+  if (multiples != 1) {
+    str.pop_back();
+    if (str.empty()) {
+      *ret_code = kException;
+      return 0;
+    }
+  }
+
+  str = TrimStringAndToUpperCase(str);
+  double mem_val = -1.0;
+  std::stringstream ss(str);
+  if (!(ss >> mem_val) || mem_val < 0) {
+    *ret_code = kException;
+    return 0;
+  }
+
+  *ret_code = kNormal;
+  return static_cast<size_t>(mem_val * multiples);
 }
 static std::string GetDebugStringOfPlan(const std::vector<size_t> &plan) {
@@ -84,16 +129,27 @@ static std::string GetDebugStringOfPlan(const std::vector<size_t> &plan) {
   return ret + "]";
 }

-static std::vector<size_t> ReadDivisionPlanFromFile(
+std::vector<size_t> ReadBufferedAllocatorDivisionPlanFromFile(
     const std::string &filepath) {
   std::ifstream is(filepath.c_str());
-  PADDLE_ENFORCE(is.good(), "File not exist");
+  PADDLE_ENFORCE(is.good(), "File %s not exist", filepath);
   std::string str;
   std::vector<size_t> plan;
+  size_t line_num = 1;
   while (std::getline(is, str).good()) {
-    str = TrimStringAndToLowerCase(str);
-    if (str.empty()) break;
-    plan.push_back(ParseStringToBytes(str));
+    DivisionPlanFileStatus status;
+    size_t ret = ParseStringToBytes(str, &status);
+    if (status == kEOF) {
+      break;
+    }
+    if (status == kException) {
+      PADDLE_THROW(
+          "Invalid format in line %d of file %s: '%s'. Only support B, KB, MB, "
+          "GB.",
+          line_num, filepath, str);
+    }
+    plan.push_back(ret);
+    ++line_num;
   }
   return plan;
 }
@@ -110,11 +166,12 @@ static void CheckAndModifyMemoryDivisionPlan(
   }
   PADDLE_ENFORCE(is_strictly_sorted, "Divison plan must be stricted sorted");

-  // Insert 0 and remove MAX to disivion plan for clean binary searching code
+  // Insert 0 to disivion plan for clean binary searching code
   if (division_plan->empty() || division_plan->front() != 0) {
     division_plan->insert(division_plan->begin(), 0);
   }
+
+  // Remove MAX from disivion plan for clean binary searching code
   constexpr auto kSizeTypeMax = std::numeric_limits<size_t>::max();
   if (division_plan->back() == kSizeTypeMax) {
     division_plan->pop_back();
@@ -124,21 +181,17 @@ static void CheckAndModifyMemoryDivisionPlan(
 }

 static std::vector<size_t> GetDefaultDivisionPlan() {
-  if (!FLAGS_division_plan_path.empty()) {
-    return ReadDivisionPlanFromFile(FLAGS_division_plan_path);
+  if (!FLAGS_buffered_allocator_division_plan_path.empty()) {
+    return ReadBufferedAllocatorDivisionPlanFromFile(
+        FLAGS_buffered_allocator_division_plan_path);
   }

+  // Default division plan is 4K, 8K, 16K, ..., 500M, 1G
   constexpr size_t kMaxLogSize = 30;
   std::vector<size_t> plan;
   for (size_t i = 12; i <= kMaxLogSize; ++i) {
     plan.push_back(static_cast<size_t>(1) << i);
   }
-  /*
-  for (size_t i = 0; i < sizeof(size_t) * 8; ++i) {
-    plan.push_back(static_cast<size_t>(1) << i);
-  }
-  */
   return plan;
 }
@@ -164,6 +217,7 @@ MultiBinBufferedAllocator::MultiBinBufferedAllocator(
       division_plan_(division_plan) {
   CheckAndModifyMemoryDivisionPlan(&division_plan_);
   allocations_.resize(division_plan_.size() - 1);
+  accumulated_cache_size_.assign(division_plan_.size() - 1, 0UL);
   mtx_.resize(division_plan_.size() - 1);
   if (underlying_allocator_->IsAllocThreadSafe()) {
     for (auto &mtx : mtx_) {
@@ -182,28 +236,22 @@ void MultiBinBufferedAllocator::FreeImpl(Allocation *allocation) {
     platform::LockGuardPtr<std::mutex> guard(mtx_[bin_index]);
     allocations_[bin_index].emplace(allocation->size(),
                                     AllocationPtr(allocation));
+    accumulated_cache_size_[bin_index] += allocation->size();
   } else {
     underlying_allocator_->Free(allocation);
   }
 }

-// bin_index is not used currently.
 // Maybe we can design more flexible FreeCache strategy based on bin_index
-size_t MultiBinBufferedAllocator::FreeCache(size_t size, size_t bin_index) {
+// and require size.
+size_t MultiBinBufferedAllocator::ClearCache() {
   size_t accumulated_size = 0;
   // FIXME(zjl): free the largest first when there is no extra
   for (size_t i = allocations_.size() - 1; i != static_cast<size_t>(-1); --i) {
     platform::LockGuardPtr<std::mutex> lock(mtx_[i]);
-    if (allocations_[i].empty()) continue;
-    auto it = --allocations_[i].end();
-    do {
-      accumulated_size += it->second->size();
-      underlying_allocator_->Free(it->second.release());
-      allocations_[i].erase(it--);
-      if (accumulated_size >= size) {
-        return accumulated_size;
-      }
-    } while (!allocations_[i].empty());
+    allocations_[i].clear();
+    accumulated_size += accumulated_cache_size_[i];
+    accumulated_cache_size_[i] = 0;
   }
   return accumulated_size;
 }
@@ -212,10 +260,6 @@ Allocation *MultiBinBufferedAllocator::AllocateImpl(size_t size, Attr attr) {
   auto bin_index = FindDivisionPlanBinIndex(division_plan_, size);
   auto upper_size = TolerantUpperSize(size);

-  // if (bin_index >= allocations_.size()) {
-  //   VLOG(2) << "Allocate " << size << " from underlying directly";
-  //}
-
   for (; bin_index < allocations_.size() &&
          upper_size >= division_plan_[bin_index];
        ++bin_index) {
@@ -226,6 +270,7 @@ Allocation *MultiBinBufferedAllocator::AllocateImpl(size_t size, Attr attr) {
       size_t sz = it->second->size();
       auto ret = std::move(it->second);
       allocation.erase(it);
+      accumulated_cache_size_[bin_index] -= sz;
       VLOG(3) << "Allocate " << sz << "(required " << size
               << ") from cache directly";
       return ret.release();
@@ -239,10 +284,7 @@ Allocation *MultiBinBufferedAllocator::AllocateImpl(size_t size, Attr attr) {
       VLOG(2) << "Allocate " << size << " from underlying directly";
       return ret;
     } catch (BadAlloc &) {
-      VLOG(1) << retry_time << "-th BadAlloc raises, try to free " << size
-              << " bytes caches";
-      // size_t actual_free_size = FreeCache(size, bin_index);
-      size_t actual_free_size = FreeCache(-1UL, bin_index);
+      size_t actual_free_size = ClearCache();
       VLOG(1) << retry_time << "-th free " << actual_free_size
               << " bytes caches";
       if (actual_free_size == 0) throw;
@@ -251,6 +293,8 @@ Allocation *MultiBinBufferedAllocator::AllocateImpl(size_t size, Attr attr) {
     }
   }
 }

+void UseMultiBinBufferedAllocatorGFlags() {}
+
 } // namespace allocation
 } // namespace memory
 } // namespace paddle
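Note (not part of the diff): the division plan is a strictly increasing list of byte boundaries; after CheckAndModifyMemoryDivisionPlan prepends 0, a freed allocation of size s is cached in the bin whose half-open interval [plan[i], plan[i+1]) contains s, and sizes in the last, open-ended interval bypass the cache (per the flag's help text). FindDivisionPlanBinIndex itself is not shown in this diff, so the lookup below is only a hedged sketch of that mapping, not the Paddle function.

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

// Hypothetical helper illustrating the bin lookup for a normalized plan.
static size_t BinIndexSketch(const std::vector<size_t>& plan, size_t size) {
  // Index of the first boundary strictly greater than size, minus one.
  auto it = std::upper_bound(plan.begin(), plan.end(), size);
  return static_cast<size_t>(it - plan.begin()) - 1;
}

int main() {
  const size_t kMB = static_cast<size_t>(1) << 20;
  const size_t kGB = static_cast<size_t>(1) << 30;
  // Plan "500M, 1G, 2G" from the flag's help text, normalized with a leading 0.
  std::vector<size_t> plan = {0, 500 * kMB, 1 * kGB, 2 * kGB};
  assert(BinIndexSketch(plan, 100 * kMB) == 0);  // falls in [0, 500M)
  assert(BinIndexSketch(plan, 600 * kMB) == 1);  // falls in [500M, 1G)
  assert(BinIndexSketch(plan, 3 * kGB) == 3);    // past the last boundary: uncached
  return 0;
}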
@@ -16,6 +16,8 @@
 #include <map>
 #include <memory>
+#include <mutex> // NOLINT
+#include <string>
 #include <vector>
 #include "paddle/fluid/memory/allocation/allocator.h"
@@ -24,6 +26,9 @@ namespace paddle {
 namespace memory {
 namespace allocation {

+std::vector<size_t> ReadBufferedAllocatorDivisionPlanFromFile(
+    const std::string& filepath);
+
 class MultiBinBufferedAllocator : public Allocator {
  public:
   explicit MultiBinBufferedAllocator(
@@ -34,21 +39,24 @@ class MultiBinBufferedAllocator : public Allocator {
   bool IsAllocThreadSafe() const override { return mtx_.front() != nullptr; }

-  void ClearCache() { FreeCache(static_cast<size_t>(-1), 0); }
+  size_t ClearCache();
+
+  const std::vector<size_t>& DivisionPlan() const { return division_plan_; }

  protected:
   Allocation* AllocateImpl(size_t size, Attr attr) override;
   void FreeImpl(Allocation* allocation) override;

  private:
-  size_t FreeCache(size_t size, size_t bin_index);
-
   std::shared_ptr<Allocator> underlying_allocator_;
   std::vector<std::multimap<size_t, AllocationPtr>> allocations_;
+  std::vector<size_t> accumulated_cache_size_;
   std::vector<size_t> division_plan_;
   std::vector<std::unique_ptr<std::mutex>> mtx_;
 };

+extern void UseMultiBinBufferedAllocatorGFlags();
+
 } // namespace allocation
 } // namespace memory
 } // namespace paddle
@@ -14,6 +14,7 @@
 #include "paddle/fluid/memory/allocation/multi_bin_buffered_allocator.h"
 #include <gtest/gtest.h>
+#include <utility>
 #include <vector>
 #include "paddle/fluid/memory/allocation/best_fit_allocator.h"
 #include "paddle/fluid/memory/allocation/cpu_allocator.h"
@@ -123,10 +124,31 @@ TEST(buffered_allocator, lazy_free) {
   {
     underlying_allocator->ResetCounter();
-    allocator->ClearCache();
+    size_t cache_size = allocator->ClearCache();
+    ASSERT_EQ(cache_size, static_cast<size_t>(alloc_size + 2048));
     ASSERT_EQ(underlying_allocator->GetAllocCount(), kZero);
     ASSERT_EQ(underlying_allocator->GetFreeCount(), kTwo);
   }
+
+  {
+    underlying_allocator->ResetCounter();
+    auto p = allocator->Allocate(allocator->DivisionPlan().back(),
+                                 allocator->kDefault);
+    ASSERT_EQ(underlying_allocator->GetAllocCount(), kOne);
+    ASSERT_EQ(underlying_allocator->GetFreeCount(), kZero);
+  }
+  ASSERT_EQ(underlying_allocator->GetFreeCount(), kOne);
+
+  {
+    underlying_allocator->ResetCounter();
+    auto p = allocator->Allocate(allocator->DivisionPlan().back() - 1,
+                                 allocator->kDefault);
+    ASSERT_EQ(underlying_allocator->GetAllocCount(), kOne);
+    ASSERT_EQ(underlying_allocator->GetFreeCount(), kZero);
+  }
+  ASSERT_EQ(underlying_allocator->GetFreeCount(), kZero);
 }
 }
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#ifdef PADDLE_WITH_CUDA
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
DECLARE_int64(gpu_allocator_retry_time);
#endif
DECLARE_bool(enable_buffered_allocator);
DECLARE_string(allocator_strategy);
namespace paddle {
namespace memory {
namespace allocation {
TEST(allocator, allocator) {
#ifdef PADDLE_WITH_CUDA
FLAGS_fraction_of_gpu_memory_to_use = 0.01;
FLAGS_gpu_allocator_retry_time = 500;
FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;
#endif
FLAGS_allocator_strategy = "naive_best_fit";
FLAGS_enable_buffered_allocator = true;
auto &instance = AllocatorFacade::Instance();
platform::Place place;
size_t size = 1024;
{
place = platform::CPUPlace();
size = 1024;
auto cpu_allocation = instance.Alloc(place, size);
ASSERT_NE(cpu_allocation, nullptr);
ASSERT_NE(cpu_allocation->ptr(), nullptr);
ASSERT_EQ(cpu_allocation->place(), place);
ASSERT_EQ(cpu_allocation->size(), size);
}
#ifdef PADDLE_WITH_CUDA
{
place = platform::CUDAPlace(0);
size = 1024;
auto gpu_allocation = instance.Alloc(place, size);
ASSERT_NE(gpu_allocation, nullptr);
ASSERT_NE(gpu_allocation->ptr(), nullptr);
ASSERT_EQ(gpu_allocation->place(), place);
ASSERT_GE(gpu_allocation->size(), size);
}
{
// Allocate 2GB gpu memory
place = platform::CUDAPlace(0);
size = 2 * static_cast<size_t>(1 << 30);
auto gpu_allocation = instance.Alloc(place, size);
ASSERT_NE(gpu_allocation, nullptr);
ASSERT_NE(gpu_allocation->ptr(), nullptr);
ASSERT_EQ(gpu_allocation->place(), place);
ASSERT_GE(gpu_allocation->size(), size);
}
{
place = platform::CUDAPinnedPlace();
size = (1 << 20);
auto cuda_pinned_allocation =
instance.Alloc(platform::CUDAPinnedPlace(), 1 << 20);
ASSERT_NE(cuda_pinned_allocation, nullptr);
ASSERT_NE(cuda_pinned_allocation->ptr(), nullptr);
ASSERT_EQ(cuda_pinned_allocation->place(), place);
ASSERT_GE(cuda_pinned_allocation->size(), size);
}
#endif
}
} // namespace allocation
} // namespace memory
} // namespace paddle
@@ -18,6 +18,7 @@
 #include <condition_variable> // NOLINT
 #include <memory>
 #include <mutex> // NOLINT
+#include <utility>
 #include "paddle/fluid/memory/allocation/allocator.h"

 namespace paddle {
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/multi_bin_buffered_allocator.h"
DECLARE_string(buffered_allocator_division_plan_path);
namespace paddle {
namespace memory {
namespace allocation {
TEST(buffered_allocator, division_plan) {
std::string path = "/tmp/buffered_allocator_divison_plan";
FLAGS_buffered_allocator_division_plan_path = path;
{
std::vector<std::string> plan(
{"100b", "300.7K", "500.3m", "1.02gB", "2g", "4G"});
std::ofstream os(path);
for (auto &p : plan) {
os << p << std::endl;
}
os.close();
}
auto plan = ReadBufferedAllocatorDivisionPlanFromFile(
FLAGS_buffered_allocator_division_plan_path);
ASSERT_EQ(plan.size(), 6UL);
ASSERT_EQ(plan[0], 100UL);
ASSERT_EQ(plan[1], static_cast<size_t>(300.7 * 1024));
ASSERT_EQ(plan[2], static_cast<size_t>(500.3 * 1024 * 1024));
ASSERT_EQ(plan[3], static_cast<size_t>(1.02 * 1024 * 1024 * 1024));
ASSERT_EQ(plan[4], static_cast<size_t>(2.0 * 1024 * 1024 * 1024));
ASSERT_EQ(plan[5], static_cast<size_t>(4.0 * 1024 * 1024 * 1024));
}
} // namespace allocation
} // namespace memory
} // namespace paddle
@@ -22,21 +22,22 @@ bool ZeroSizeAllocator::IsAllocThreadSafe() const {
   return underlying_allocator_->IsAllocThreadSafe();
 }

-void ZeroSizeAllocator::FreeImpl(Allocation *allocation) {
-  if (dynamic_cast<ZeroSizeAllocation *>(allocation)) {
-    delete allocation;
+Allocation *ZeroSizeAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
+  if (size == 0) {
+    return new Allocation(nullptr, 0, place_);
   } else {
-    underlying_allocator_->Free(allocation);
+    return underlying_allocator_->Allocate(size, attr).release();
   }
 }

-Allocation *ZeroSizeAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
-  if (size == 0) {
-    return new ZeroSizeAllocation(place_);
+void ZeroSizeAllocator::FreeImpl(Allocation *allocation) {
+  if (allocation->size() == 0) {
+    delete allocation;
   } else {
-    return underlying_allocator_->Allocate(size, attr).release();
+    underlying_allocator_->Free(allocation);
   }
 }

 } // namespace allocation
 } // namespace memory
 } // namespace paddle
@@ -13,6 +13,7 @@
 // limitations under the License.

 #pragma once
+#include <memory>
 #include <utility>
 #include "paddle/fluid/memory/allocation/allocator.h"
@@ -23,12 +24,6 @@ namespace allocation {
 // The allocator handles the request's size is zero. Allocator will always
 // return an allocation even the request size is zero. However, the
 // allocation.ptr() is nullptr
-class ZeroSizeAllocation : public Allocation {
- public:
-  explicit ZeroSizeAllocation(const platform::Place& p)
-      : Allocation(nullptr, 0, p) {}
-};
-
 class ZeroSizeAllocator : public Allocator {
  public:
   ZeroSizeAllocator(std::shared_ptr<Allocator> underlying_allocator,
......
...@@ -25,11 +25,9 @@ namespace detail { ...@@ -25,11 +25,9 @@ namespace detail {
BuddyAllocator::BuddyAllocator( BuddyAllocator::BuddyAllocator(
std::unique_ptr<SystemAllocator> system_allocator, size_t min_chunk_size, std::unique_ptr<SystemAllocator> system_allocator, size_t min_chunk_size,
size_t first_allocate_chunk_size, size_t reallocate_chunk_size) size_t max_chunk_size)
: min_chunk_size_(min_chunk_size), : min_chunk_size_(min_chunk_size),
first_allocate_chunk_size_(first_allocate_chunk_size), max_chunk_size_(max_chunk_size),
reallocate_chunk_size_(reallocate_chunk_size),
max_chunk_size_(first_allocate_chunk_size),
cache_(system_allocator->UseGpu()), cache_(system_allocator->UseGpu()),
system_allocator_(std::move(system_allocator)) {} system_allocator_(std::move(system_allocator)) {}
...@@ -38,10 +36,9 @@ BuddyAllocator::~BuddyAllocator() { ...@@ -38,10 +36,9 @@ BuddyAllocator::~BuddyAllocator() {
"have actually been freed"; "have actually been freed";
while (!pool_.empty()) { while (!pool_.empty()) {
auto block = static_cast<MemoryBlock*>(std::get<2>(*pool_.begin())); auto block = static_cast<MemoryBlock*>(std::get<2>(*pool_.begin()));
auto desc = cache_.load(block); VLOG(10) << "Free from block (" << block << ", " << max_chunk_size_ << ")";
VLOG(10) << "Free from block (" << block << ", " << desc.size << ")";
system_allocator_->Free(block, desc.size, desc.index); system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
cache_.invalidate(block); cache_.invalidate(block);
pool_.erase(pool_.begin()); pool_.erase(pool_.begin());
} }
...@@ -66,7 +63,7 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) { ...@@ -66,7 +63,7 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) {
// if the allocation is huge, send directly to the system allocator // if the allocation is huge, send directly to the system allocator
if (size > max_chunk_size_) { if (size > max_chunk_size_) {
VLOG(10) << "Allocate from system allocator."; VLOG(10) << "Allocate from system allocator.";
return SystemAlloc(size, false); return SystemAlloc(size);
} }
// query and allocate from the existing chunk // query and allocate from the existing chunk
...@@ -75,9 +72,9 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) { ...@@ -75,9 +72,9 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) {
// refill the pool if failure // refill the pool if failure
if (it == pool_.end()) { if (it == pool_.end()) {
it = RefillPool(); it = RefillPool();
// if still failure, try to allocate from SystemAllocator // if still failure, fail fatally
if (it == pool_.end()) { if (it == pool_.end()) {
return SystemAlloc(size, false); return nullptr;
} }
} else { } else {
VLOG(10) << "Allocation from existing memory block " << std::get<2>(*it) VLOG(10) << "Allocation from existing memory block " << std::get<2>(*it)
...@@ -101,7 +98,7 @@ void BuddyAllocator::Free(void* p) { ...@@ -101,7 +98,7 @@ void BuddyAllocator::Free(void* p) {
VLOG(10) << "Free from address " << block; VLOG(10) << "Free from address " << block;
if (block->type(cache_) == MemoryBlock::UNMANAGED_HUGE_CHUNK) { if (block->type(cache_) == MemoryBlock::HUGE_CHUNK) {
VLOG(10) << "Free directly from system allocator"; VLOG(10) << "Free directly from system allocator";
system_allocator_->Free(block, block->total_size(cache_), system_allocator_->Free(block, block->total_size(cache_),
block->index(cache_)); block->index(cache_));
...@@ -171,12 +168,9 @@ void BuddyAllocator::Free(void* p) { ...@@ -171,12 +168,9 @@ void BuddyAllocator::Free(void* p) {
size_t BuddyAllocator::Used() { return total_used_; } size_t BuddyAllocator::Used() { return total_used_; }
size_t BuddyAllocator::GetMinChunkSize() { return min_chunk_size_; } size_t BuddyAllocator::GetMinChunkSize() { return min_chunk_size_; }
size_t BuddyAllocator::GetMaxChunkSize() { size_t BuddyAllocator::GetMaxChunkSize() { return max_chunk_size_; }
std::lock_guard<std::mutex> lock(mutex_);
return max_chunk_size_;
}
void* BuddyAllocator::SystemAlloc(size_t size, bool is_managed) { void* BuddyAllocator::SystemAlloc(size_t size) {
size_t index = 0; size_t index = 0;
void* p = system_allocator_->Alloc(&index, size); void* p = system_allocator_->Alloc(&index, size);
...@@ -184,23 +178,25 @@ void* BuddyAllocator::SystemAlloc(size_t size, bool is_managed) { ...@@ -184,23 +178,25 @@ void* BuddyAllocator::SystemAlloc(size_t size, bool is_managed) {
if (p == nullptr) return nullptr; if (p == nullptr) return nullptr;
static_cast<MemoryBlock*>(p)->init( static_cast<MemoryBlock*>(p)->init(&cache_, MemoryBlock::HUGE_CHUNK, index,
&cache_, is_managed ? MemoryBlock::MANAGED_HUGE_CHUNK size, nullptr, nullptr);
: MemoryBlock::UNMANAGED_HUGE_CHUNK,
index, size, nullptr, nullptr);
return static_cast<MemoryBlock*>(p)->data(); return static_cast<MemoryBlock*>(p)->data();
} }
BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() { BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
if (total_used_ + total_free_ > 0) { #ifdef PADDLE_WITH_CUDA
max_chunk_size_ = reallocate_chunk_size_; if (system_allocator_->UseGpu()) {
if ((total_used_ + total_free_) == 0) {
// Compute the maximum allocation size for the first allocation.
max_chunk_size_ = platform::GpuMaxChunkSize();
}
} }
#endif
// Allocate a new maximum sized block // Allocate a new maximum sized block
size_t index = 0; size_t index = 0;
size_t chunk_size = max_chunk_size_; void* p = system_allocator_->Alloc(&index, max_chunk_size_);
void* p = system_allocator_->Alloc(&index, chunk_size);
if (p == nullptr) return pool_.end(); if (p == nullptr) return pool_.end();
...@@ -208,7 +204,7 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() { ...@@ -208,7 +204,7 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
<< " from system allocator"; << " from system allocator";
static_cast<MemoryBlock*>(p)->init(&cache_, MemoryBlock::FREE_CHUNK, index, static_cast<MemoryBlock*>(p)->init(&cache_, MemoryBlock::FREE_CHUNK, index,
chunk_size, nullptr, nullptr); max_chunk_size_, nullptr, nullptr);
// gpu fallback allocation // gpu fallback allocation
if (system_allocator_->UseGpu() && if (system_allocator_->UseGpu() &&
...@@ -216,10 +212,10 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() { ...@@ -216,10 +212,10 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
fallback_alloc_count_++; fallback_alloc_count_++;
} }
total_free_ += chunk_size; total_free_ += max_chunk_size_;
// dump the block into pool // dump the block into pool
return pool_.insert(IndexSizeAddress(index, chunk_size, p)).first; return pool_.insert(IndexSizeAddress(index, max_chunk_size_, p)).first;
} }
BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) { BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) {
...@@ -275,24 +271,27 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it, ...@@ -275,24 +271,27 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it,
void BuddyAllocator::CleanIdleFallBackAlloc() { void BuddyAllocator::CleanIdleFallBackAlloc() {
// If fallback allocation does not exist, return directly // If fallback allocation does not exist, return directly
if (!fallback_alloc_count_ || !system_allocator_->UseGpu()) return; if (!fallback_alloc_count_) return;
for (auto pool = pool_.rbegin(); pool != pool_.rend();) { for (auto pool = pool_.rbegin(); pool != pool_.rend();) {
// If free memory block less than max_chunk_size_, return directly
if (std::get<1>(*pool) < max_chunk_size_) return;
MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool)); MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));
auto desc = cache_.load(block); // If no GPU fallback allocator, return
if (desc.index == 0) { if (!system_allocator_->UseGpu() || block->index(cache_) == 0) {
return; return;
} }
VLOG(10) << "Return block " << block << " to fallback allocator."; VLOG(10) << "Return block " << block << " to fallback allocator.";
system_allocator_->Free(block, desc.size, block->index(cache_)); system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
cache_.invalidate(block); cache_.invalidate(block);
pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base())); pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
total_free_ -= desc.size; total_free_ -= max_chunk_size_;
fallback_alloc_count_--; fallback_alloc_count_--;
// If no fallback allocation exists, return directly // If no fallback allocation exists, return directly
...@@ -316,21 +315,19 @@ void BuddyAllocator::CleanIdleNormalAlloc() { ...@@ -316,21 +315,19 @@ void BuddyAllocator::CleanIdleNormalAlloc() {
if (!shall_free_alloc()) return; if (!shall_free_alloc()) return;
for (auto pool = pool_.rbegin(); pool != pool_.rend();) { for (auto pool = pool_.rbegin(); pool != pool_.rend();) {
MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool)); // If free memory block less than max_chunk_size_, return directly
auto desc = cache_.load(block); if (std::get<1>(*pool) < max_chunk_size_) return;
if (desc.type != MemoryBlock::MANAGED_HUGE_CHUNK) { MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));
return;
}
VLOG(10) << "Return block " << block << " to base allocator."; VLOG(10) << "Return block " << block << " to base allocator.";
system_allocator_->Free(block, desc.size, desc.index); system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
cache_.invalidate(block); cache_.invalidate(block);
pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base())); pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
total_free_ -= desc.size; total_free_ -= max_chunk_size_;
if (!shall_free_alloc()) return; if (!shall_free_alloc()) return;
} }
......
...@@ -34,8 +34,7 @@ namespace detail { ...@@ -34,8 +34,7 @@ namespace detail {
class BuddyAllocator { class BuddyAllocator {
public: public:
BuddyAllocator(std::unique_ptr<SystemAllocator> system_allocator, BuddyAllocator(std::unique_ptr<SystemAllocator> system_allocator,
size_t min_chunk_size, size_t first_allocate_chunk_size, size_t min_chunk_size, size_t max_chunk_size);
size_t reallocate_chunk_size);
~BuddyAllocator(); ~BuddyAllocator();
...@@ -58,7 +57,7 @@ class BuddyAllocator { ...@@ -58,7 +57,7 @@ class BuddyAllocator {
using PoolSet = std::set<IndexSizeAddress>; using PoolSet = std::set<IndexSizeAddress>;
/*! \brief Allocate fixed-size memory from system */ /*! \brief Allocate fixed-size memory from system */
void* SystemAlloc(size_t size, bool is_managed = true); void* SystemAlloc(size_t size);
/*! \brief If existing chunks are not suitable, refill pool */ /*! \brief If existing chunks are not suitable, refill pool */
PoolSet::iterator RefillPool(); PoolSet::iterator RefillPool();
...@@ -88,11 +87,7 @@ class BuddyAllocator { ...@@ -88,11 +87,7 @@ class BuddyAllocator {
size_t total_free_ = 0; // the total size of free memory size_t total_free_ = 0; // the total size of free memory
size_t min_chunk_size_; // the minimum size of each chunk size_t min_chunk_size_; // the minimum size of each chunk
size_t max_chunk_size_; // the maximum size of each chunk
size_t first_allocate_chunk_size_;
size_t reallocate_chunk_size_;
size_t max_chunk_size_;
private: private:
/** /**
......
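With the constructor reverted to the (min_chunk_size, max_chunk_size) pair, a hedged sketch of typical wiring looks like the following; the GPUAllocator system allocator and the call site are assumptions based on the surrounding code, not part of this diff.
// Assumed context: buddy_allocator.h and system_allocator.h are included and
// this code lives inside namespace paddle::memory::detail.
void BuddyAllocatorWiringSketch() {
  BuddyAllocator allocator(
      std::unique_ptr<SystemAllocator>(new GPUAllocator(/*gpu_id=*/0)),
      platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
  // Requests up to max_chunk_size_ are served from the buddy pool; larger
  // requests fall straight through to SystemAlloc, as in Alloc() above.
  void* p = allocator.Alloc(1 << 20);  // 1 MiB
  allocator.Free(p);
}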
...@@ -27,11 +27,10 @@ class MetadataCache; ...@@ -27,11 +27,10 @@ class MetadataCache;
// MemoryBlock::Desc and the payload. // MemoryBlock::Desc and the payload.
struct MemoryBlock { struct MemoryBlock {
enum Type { enum Type {
FREE_CHUNK, // memory is free and idle FREE_CHUNK, // memory is free and idle
ARENA_CHUNK, // memory is being occupied ARENA_CHUNK, // memory is being occupied
MANAGED_HUGE_CHUNK, // memory is huge and out of management HUGE_CHUNK, // memory is out of management
UNMANAGED_HUGE_CHUNK, // memory is huge and managed by allocator INVALID_CHUNK // memory is invalid
INVALID_CHUNK // memory is invalid
}; };
// init saves the MemoryBlock::Desc of the memory block in a MetadataCache. // init saves the MemoryBlock::Desc of the memory block in a MetadataCache.
......
...@@ -38,22 +38,6 @@ DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use, ...@@ -38,22 +38,6 @@ DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use,
"additional trunks of the same size will be requested from gpu " "additional trunks of the same size will be requested from gpu "
"until the gpu has no memory left for another trunk."); "until the gpu has no memory left for another trunk.");
DEFINE_double(
initial_gpu_memory_in_mb, -1.0,
"GPU memory chunk size in MB."
"Allocator would allocate FLAGS_initial_gpu_memory_in_mb size "
"chunk first and reallocate FLAGS_reallocate_gpu_memory_in_mb size "
"chunk when the first chunk is not enough. This flag has higher priority "
"than FLAGS_fraction_of_gpu_memory_to_use. Disable when less than 0.");
DEFINE_double(reallocate_gpu_memory_in_mb, -1.0,
"GPU memory chunk size in MB."
"If FLAGS_initial_gpu_memory_in_mb is set and "
"FLAGS_reallocate_gpu_memory_in_mb "
"is less than 0, it would be replaced by "
"FLAGS_initial_gpu_memory_in_mb. Disable "
"when FLAGS_initial_gpu_memory_in_mb is less than 0.");
DEFINE_bool( DEFINE_bool(
enable_cublas_tensor_op_math, false, enable_cublas_tensor_op_math, false,
"The enable_cublas_tensor_op_math indicate whether to use Tensor Core, " "The enable_cublas_tensor_op_math indicate whether to use Tensor Core, "
...@@ -227,54 +211,13 @@ size_t GpuMaxChunkSize() { ...@@ -227,54 +211,13 @@ size_t GpuMaxChunkSize() {
size_t allocating = static_cast<size_t>(FLAGS_fraction_of_gpu_memory_to_use * size_t allocating = static_cast<size_t>(FLAGS_fraction_of_gpu_memory_to_use *
(total - reserving)); (total - reserving));
PADDLE_ENFORCE_LE(allocating, available, PADDLE_ENFORCE_LE(allocating, available,
"Insufficient GPU memory to allocation."); "Insufficient GPU memory to allocation.");
return allocating; return allocating;
} }
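A quick worked example of the sizing rule above, with illustrative numbers only:
// total = 16 GB, reserving = 0.5 GB, FLAGS_fraction_of_gpu_memory_to_use = 0.92
//   allocating = 0.92 * (16 GB - 0.5 GB) ≈ 14.26 GB
// PADDLE_ENFORCE_LE then requires that this amount is still actually available.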
size_t GpuFirstAllocateChunkSize() {
if (FLAGS_initial_gpu_memory_in_mb <= 0) {
return GpuMaxChunkSize();
}
size_t total = 0;
size_t available = 0;
GpuMemoryUsage(&available, &total);
VLOG(10) << "GPU Usage " << available / 1024 / 1024 << "M/"
<< total / 1024 / 1024 << "M";
size_t initial_mem =
static_cast<size_t>(FLAGS_initial_gpu_memory_in_mb * (1 << 20));
PADDLE_ENFORCE_LE(initial_mem, available,
"Insufficient GPU memory to allocation.");
return initial_mem;
}
size_t GpuReAllocateChunkSize() {
if (FLAGS_initial_gpu_memory_in_mb <= 0) {
return GpuMaxChunkSize();
}
double reallocate_mem = FLAGS_reallocate_gpu_memory_in_mb;
if (reallocate_mem < 0) {
PADDLE_ENFORCE(FLAGS_initial_gpu_memory_in_mb > 0,
"FLAGS_init_gpu_memory_to_use_mb must be larger than 0");
reallocate_mem = FLAGS_initial_gpu_memory_in_mb;
}
size_t total = 0;
size_t available = 0;
GpuMemoryUsage(&available, &total);
VLOG(10) << "GPU Usage " << available / 1024 / 1024 << "M/"
<< total / 1024 / 1024 << "M";
size_t realloc_mem = static_cast<size_t>(reallocate_mem * (1 << 20));
PADDLE_ENFORCE_LE(realloc_mem, available,
"Insufficient GPU memory to allocation.");
return realloc_mem;
}
void GpuMemcpyAsync(void *dst, const void *src, size_t count, void GpuMemcpyAsync(void *dst, const void *src, size_t count,
enum cudaMemcpyKind kind, cudaStream_t stream) { enum cudaMemcpyKind kind, cudaStream_t stream) {
PADDLE_ENFORCE(cudaMemcpyAsync(dst, src, count, kind, stream), PADDLE_ENFORCE(cudaMemcpyAsync(dst, src, count, kind, stream),
......
...@@ -66,12 +66,6 @@ size_t GpuMinChunkSize(); ...@@ -66,12 +66,6 @@ size_t GpuMinChunkSize();
//! Get the maximum chunk size for GPU buddy allocator. //! Get the maximum chunk size for GPU buddy allocator.
size_t GpuMaxChunkSize(); size_t GpuMaxChunkSize();
//! Get init chunk size for GPU buddy allocator.
size_t GpuFirstAllocateChunkSize();
//! Get reallocate chunk size for GPU buddy allocator.
size_t GpuReAllocateChunkSize();
//! Copy memory from address src to dst asynchronously. //! Copy memory from address src to dst asynchronously.
void GpuMemcpyAsync(void *dst, const void *src, size_t count, void GpuMemcpyAsync(void *dst, const void *src, size_t count,
enum cudaMemcpyKind kind, cudaStream_t stream); enum cudaMemcpyKind kind, cudaStream_t stream);
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/platform/temporary_allocator.h" #include "paddle/fluid/platform/temporary_allocator.h"
#include <memory>
#include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/memory/allocation/allocator_facade.h"
DEFINE_int64(limit_of_tmp_allocation, -1, DEFINE_int64(limit_of_tmp_allocation, -1,
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include <condition_variable> // NOLINT #include <condition_variable> // NOLINT
#include <deque> #include <deque>
#include <map> #include <map>
#include <memory>
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/lock_guard_ptr.h" #include "paddle/fluid/platform/lock_guard_ptr.h"
......
...@@ -39,6 +39,7 @@ limitations under the License. */ ...@@ -39,6 +39,7 @@ limitations under the License. */
#include "paddle/fluid/imperative/profiler.h" #include "paddle/fluid/imperative/profiler.h"
#include "paddle/fluid/memory/allocation/allocator_strategy.h" #include "paddle/fluid/memory/allocation/allocator_strategy.h"
#include "paddle/fluid/memory/allocation/legacy_allocator.h" #include "paddle/fluid/memory/allocation/legacy_allocator.h"
#include "paddle/fluid/memory/allocation/multi_bin_buffered_allocator.h"
#include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/py_func_op.h" #include "paddle/fluid/operators/py_func_op.h"
#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
...@@ -133,6 +134,9 @@ PYBIND11_MODULE(core, m) { ...@@ -133,6 +134,9 @@ PYBIND11_MODULE(core, m) {
paddle::platform::CpuTotalPhysicalMemory(); paddle::platform::CpuTotalPhysicalMemory();
paddle::memory::allocation::UseAllocatorStrategyGFlag(); paddle::memory::allocation::UseAllocatorStrategyGFlag();
paddle::memory::allocation::UseMultiBinBufferedAllocatorGFlags();
m.doc() = "C++ core of PaddlePaddle"; m.doc() = "C++ core of PaddlePaddle";
// using framework in this function. Since it is inside a function, it will // using framework in this function. Since it is inside a function, it will
......
...@@ -105,14 +105,12 @@ void Printf(const char* fmt, const Args&... args) { ...@@ -105,14 +105,12 @@ void Printf(const char* fmt, const Args&... args) {
Fprintf(std::cout, fmt, args...); Fprintf(std::cout, fmt, args...);
} }
template <typename T> inline std::string HumanReadableSize(double f_size) {
std::string HumanReadableSize(T size) {
size_t i = 0; size_t i = 0;
double f_size = static_cast<double>(size);
double orig = f_size; double orig = f_size;
const std::vector<std::string> units( const std::vector<std::string> units(
{"B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"}); {"B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"});
while (f_size > 1024) { while (f_size >= 1024) {
f_size /= 1024; f_size /= 1024;
i++; i++;
} }
......
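To see what moving from "> 1024" to ">= 1024" changes at the unit boundary, here is a hedged, standalone re-implementation of the reduction loop; only the loop mirrors the code above, the output formatting is assumed.
// Standalone sketch for illustration; snprintf formatting is an assumption.
#include <cstdio>
#include <string>
#include <vector>
inline std::string HumanReadableSizeSketch(double f_size) {
  const std::vector<std::string> units(
      {"B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"});
  size_t i = 0;
  while (f_size >= 1024 && i + 1 < units.size()) {
    f_size /= 1024;
    ++i;
  }
  char buf[64];
  std::snprintf(buf, sizeof(buf), "%.1f %s", f_size, units[i].c_str());
  return buf;
}
// With ">= 1024", an input of exactly 1024.0 is rendered as "1.0 kB";
// the old "> 1024" comparison would have left it as "1024.0 B".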
...@@ -130,7 +130,8 @@ def __bootstrap__(): ...@@ -130,7 +130,8 @@ def __bootstrap__():
'paddle_num_threads', "dist_threadpool_size", 'eager_delete_tensor_gb', 'paddle_num_threads', "dist_threadpool_size", 'eager_delete_tensor_gb',
'fast_eager_deletion_mode', 'memory_fraction_of_eager_deletion', 'fast_eager_deletion_mode', 'memory_fraction_of_eager_deletion',
'allocator_strategy', 'enable_buffered_allocator', 'allocator_strategy', 'enable_buffered_allocator',
'buffered_allocator_excess_times', 'reader_queue_speed_test_mode', 'buffered_allocator_excess_times',
'buffered_allocator_division_plan_path', 'reader_queue_speed_test_mode',
'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir', 'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir',
'inner_op_parallelism', 'enable_parallel_graph', 'inner_op_parallelism', 'enable_parallel_graph',
'multiple_of_cupti_buffer_size', 'enable_subgraph_optimize', 'multiple_of_cupti_buffer_size', 'enable_subgraph_optimize',
...@@ -163,7 +164,6 @@ def __bootstrap__(): ...@@ -163,7 +164,6 @@ def __bootstrap__():
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
read_env_flags += [ read_env_flags += [
'initial_gpu_memory_in_mb', 'reallocate_gpu_memory_in_mb',
'fraction_of_gpu_memory_to_use', 'cudnn_deterministic', 'fraction_of_gpu_memory_to_use', 'cudnn_deterministic',
'enable_cublas_tensor_op_math', 'conv_workspace_size_limit', 'enable_cublas_tensor_op_math', 'conv_workspace_size_limit',
'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus', 'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus',
......