Commit 953214ad authored by sneaxiy

add more unittests

modify allocator strategy
remove changes to legacy buddy_allocator
test=develop
Parent fd23262e
......@@ -202,6 +202,8 @@ cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
cc_test(tuple_test SRCS tuple_test.cc )
cc_test(inlined_vector_test SRCS inlined_vector_test.cc)
if (NOT WIN32)
cc_test(rw_lock_test SRCS rw_lock_test.cc)
endif (NOT WIN32)
......
......@@ -14,18 +14,18 @@
#pragma once
#include <deque>
#include <vector>
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
template <typename T, size_t N>
class InlinedStack {
class InlinedVector {
static_assert(N > 0, "N must be larger than 0");
public:
inline void push(const T& item) {
inline void push_back(const T& item) {
if (size_ < N) {
head_[size_] = item;
} else {
......@@ -34,21 +34,21 @@ class InlinedStack {
++size_;
}
inline void pop() {
PADDLE_ENFORCE(!empty(), "Try to pop element from empty stack.");
inline void pop_back() {
PADDLE_ENFORCE(!empty(), "Try to pop back element from empty vector.");
if (size_ > N) {
tail_.pop_back();
}
--size_;
}
inline const T& top() const {
PADDLE_ENFORCE(!empty(), "Try to get top element of empty stack.");
inline const T& back() const {
PADDLE_ENFORCE(!empty(), "Try to get back element of empty vector.");
return size_ <= N ? head_[size_ - 1] : tail_.back();
}
inline T& top() {
PADDLE_ENFORCE(!empty(), "Try to get top element of empty stack.");
inline T& back() {
PADDLE_ENFORCE(!empty(), "Try to get back element of empty vector.");
return size_ <= N ? head_[size_ - 1] : tail_.back();
}
......@@ -63,10 +63,19 @@ class InlinedStack {
return i < N ? head_[i] : tail_[i - N];
}
operator std::vector<T>() const {
std::vector<T> ret;
ret.reserve(size_);
for (size_t i = 0; i < size_; ++i) {
ret.emplace_back((*this)[i]);
}
return ret;
}
private:
T head_[N];
size_t size_{0};
std::deque<T> tail_;
std::vector<T> tail_;
};
} // namespace framework
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/inlined_vector.h"
#include <vector>
#include "gtest/gtest.h"
namespace paddle {
namespace framework {
TEST(inlined_stack, inlined_stack) {
size_t max_num = 10;
InlinedVector<size_t, 5> stack;
for (size_t i = 0; i < max_num; ++i) {
ASSERT_EQ(stack.size(), i);
stack.push_back(i);
ASSERT_EQ(stack.size(), i + 1);
}
std::vector<size_t> vec = stack;
ASSERT_EQ(stack.size(), vec.size());
for (size_t i = 0; i < vec.size(); ++i) {
ASSERT_EQ(stack[i], vec[i]);
}
for (size_t i = 0; i < max_num; ++i) {
ASSERT_EQ(stack[i], i);
}
for (size_t i = 0; i < max_num; ++i) {
ASSERT_EQ(stack.back(), max_num - 1 - i);
stack.pop_back();
ASSERT_EQ(stack.size(), max_num - 1 - i);
}
}
} // namespace framework
} // namespace paddle
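
For reference, a minimal usage sketch (not part of this commit) of the InlinedVector declared above: the first N elements live in the fixed-size head_ array and any overflow spills into tail_, so short sequences avoid heap allocation.

#include <vector>
#include "paddle/fluid/framework/inlined_vector.h"

// Minimal sketch assuming the InlinedVector interface shown above.
void InlinedVectorSketch() {
  paddle::framework::InlinedVector<int, 4> v;
  for (int i = 0; i < 6; ++i) {
    v.push_back(i);  // 0..3 stay in head_, 4 and 5 spill into tail_
  }
  int last = v.back();        // 5
  std::vector<int> copy = v;  // the conversion operator copies all elements in order
  v.pop_back();               // removes 5 from the spilled storage
  (void)last;
  (void)copy;
}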
......@@ -3,13 +3,18 @@ cc_library(cpu_allocator SRCS cpu_allocator.cc DEPS allocator)
cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator)
cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator)
cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator)
cc_library(multi_bin_buffered_allocator SRCS multi_bin_buffered_allocator.cc DEPS allocator)
cc_library(multi_bin_buffered_allocator SRCS multi_bin_buffered_allocator.cc DEPS allocator gflags)
cc_library(legacy_allocator SRCS legacy_allocator.cc DEPS allocator buddy_allocator profiler)
cc_library(zero_size_allocator SRCS zero_size_allocator.cc DEPS allocator)
cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator buffered_allocator cpu_allocator)
cc_test(multi_bin_buffered_allocator_test SRCS multi_bin_buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator multi_bin_buffered_allocator cpu_allocator)
cc_library(auto_increment_best_fit_allocator SRCS auto_increment_best_fit_allocator.cc DEPS allocator)
cc_test(auto_increment_best_fit_allocator_test SRCS auto_increment_best_fit_allocator_test.cc DEPS cpu_allocator auto_increment_best_fit_allocator)
cc_library(auto_growth_best_fit_allocator SRCS auto_growth_best_fit_allocator.cc DEPS allocator)
cc_test(auto_growth_best_fit_allocator_test SRCS auto_growth_best_fit_allocator_test.cc DEPS cpu_allocator auto_growth_best_fit_allocator)
if (NOT WIN32)
cc_test(test_multi_bin_buffered_allocator_division_plan SRCS test_multi_bin_buffered_allocator_division_plan.cc DEPS multi_bin_buffered_allocator)
endif()
if (WITH_GPU)
nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
......@@ -42,30 +47,20 @@ else ()
set(AllocatorFacadeDeps)
endif()
list(APPEND AllocatorFacadeDeps cpu_allocator locked_allocator best_fit_allocator aligned_allocator auto_increment_allocator conditional_allocator retry_allocator buffered_allocator multi_bin_buffered_allocator auto_growth_best_fit_allocator legacy_allocator zero_size_allocator)
cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator)
cc_library(auto_increment_allocator SRCS auto_increment_allocator.cc DEPS allocator)
cc_library(zero_size_allocator SRCS zero_size_allocator.cc DEPS allocator)
cc_library(conditional_allocator SRCS conditional_allocator.cc DEPS allocator)
cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags)
cc_library(allocator_facade SRCS allocator_facade.cc DEPS
${AllocatorFacadeDeps}
cpu_allocator
locked_allocator
best_fit_allocator
aligned_allocator
auto_increment_allocator
zero_size_allocator
conditional_allocator
retry_allocator
buffered_allocator
multi_bin_buffered_allocator
auto_increment_best_fit_allocator
allocator_strategy
legacy_allocator
)
cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags ${AllocatorFacadeDeps})
cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy)
nv_test(allocation_and_eigen_test SRCS allocation_and_eigen_test.cu DEPS allocator_facade)
cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator best_fit_allocator locked_allocator cpu_allocator)
cc_test(allocator_facade_test SRCS allocator_facade_test.cc DEPS allocator_facade)
cc_test(naive_best_fit_allocator_facade_test SRCS naive_best_fit_allocator_facade_test.cc DEPS allocator_facade)
cc_test(auto_growth_best_fit_allocator_facade_test SRCS auto_growth_best_fit_allocator_facade_test.cc DEPS allocator_facade)
......@@ -14,6 +14,7 @@
#pragma once
#include <memory>
#include <utility>
#include "paddle/fluid/memory/allocation/allocator.h"
namespace paddle {
......
......@@ -27,24 +27,24 @@ bool Allocator::IsAllocThreadSafe() const { return false; }
AllocationPtr Allocator::Allocate(size_t size, Allocator::Attr attr) {
auto ptr = AllocateImpl(size, attr);
ptr->RegisterAllocatorChain(this);
ptr->RegisterDecoratedAllocator(this);
return AllocationPtr(ptr);
}
void Allocator::FreeImpl(Allocation* allocation) {
Allocator* allocator = allocation->TopAllocator();
Allocator* allocator = allocation->TopDecoratedAllocator();
allocator->Free(allocation);
}
void Allocator::Free(Allocation* allocation) {
allocation->PopAllocator();
allocation->PopDecoratedAllocator();
FreeImpl(allocation);
}
const char* BadAlloc::what() const noexcept { return msg_.c_str(); }
void AllocationDeleter::operator()(Allocation* allocation) const {
Allocator* allocator = allocation->TopAllocator();
Allocator* allocator = allocation->TopDecoratedAllocator();
allocator->Free(allocation);
}
......
......@@ -15,8 +15,9 @@
#pragma once
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/inlined_stack.h"
#include "paddle/fluid/framework/inlined_vector.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
......@@ -78,29 +79,26 @@ class Allocation {
virtual ~Allocation();
// This function should only be used in unittest
std::vector<Allocator*> GetAllocatorChain() const {
std::vector<Allocator*> allocators;
for (size_t i = 0; i < allocator_chain_.size(); ++i) {
allocators.push_back(allocator_chain_[i]);
}
return allocators;
private:
std::vector<Allocator*> DecoratedAllocators() const {
return static_cast<std::vector<Allocator*>>(decorated_allocators_);
}
private:
inline void RegisterAllocatorChain(Allocator* allocator) {
allocator_chain_.push(allocator);
inline void RegisterDecoratedAllocator(Allocator* allocator) {
decorated_allocators_.push_back(allocator);
}
inline void PopAllocator() { allocator_chain_.pop(); }
inline void PopDecoratedAllocator() { decorated_allocators_.pop_back(); }
inline Allocator* TopAllocator() { return allocator_chain_.top(); }
inline Allocator* TopDecoratedAllocator() {
return decorated_allocators_.back();
}
private:
void* ptr_;
size_t size_;
platform::Place place_;
framework::InlinedStack<Allocator*, 8> allocator_chain_;
framework::InlinedVector<Allocator*, 8> decorated_allocators_;
friend class Allocator;
friend class AllocationDeleter;
......
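
To make the decorated-allocator bookkeeping above concrete, here is a hedged sketch; PassThroughAllocator is hypothetical and not part of this commit. It relies on the default FreeImpl: Allocate() in the base class appends each allocator to decorated_allocators_ after the underlying allocator has appended itself, so the deleter frees through the outermost wrapper first and unwinds the chain in reverse order.

#include <memory>
#include <utility>
#include "paddle/fluid/memory/allocation/allocator.h"

namespace paddle {
namespace memory {
namespace allocation {

// Hypothetical decorator, for illustration only. Allocating through it
// records [underlying..., this] on the Allocation; freeing pops this wrapper
// and forwards the allocation to the allocator below it.
class PassThroughAllocator : public Allocator {
 public:
  explicit PassThroughAllocator(std::shared_ptr<Allocator> underlying)
      : underlying_(std::move(underlying)) {}

 protected:
  Allocation* AllocateImpl(size_t size, Attr attr) override {
    return underlying_->Allocate(size, attr).release();
  }
  // FreeImpl is inherited: it asks the new top decorated allocator (the
  // underlying allocator) to Free() the allocation.

 private:
  std::shared_ptr<Allocator> underlying_;
};

}  // namespace allocation
}  // namespace memory
}  // namespace paddle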
......@@ -17,12 +17,13 @@
#include <map>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/memory/allocation/aligned_allocator.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/memory/allocation/allocator_strategy.h"
#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/auto_increment_allocator.h"
#include "paddle/fluid/memory/allocation/auto_increment_best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/conditional_allocator.h"
#include "paddle/fluid/memory/allocation/cpu_allocator.h"
......@@ -32,6 +33,7 @@
#include "paddle/fluid/memory/allocation/retry_allocator.h"
#include "paddle/fluid/memory/allocation/zero_size_allocator.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/memory/allocation/cuda_allocator.h"
......@@ -51,6 +53,21 @@ namespace paddle {
namespace memory {
namespace allocation {
static inline std::shared_ptr<Allocator> WrapRetryAndBufferedAllocator(
std::shared_ptr<Allocator> allocator, int64_t retry_time,
bool enable_buffered) {
if (retry_time > 0) {
auto* retry_allocator =
new RetryAllocator(std::move(allocator), retry_time);
allocator.reset(retry_allocator);
}
if (enable_buffered) {
allocator.reset(new MultiBinBufferedAllocator(allocator));
}
return allocator;
}
// TODO(yy): Dirty code here. This class should be configurable in runtime.
class CPUManagedAllocator : public Allocator {
public:
......@@ -117,17 +134,10 @@ class ChunkedAllocator : public Allocator {
std::shared_ptr<Allocator> allocator(new LockedAllocator(
std::shared_ptr<Allocator>(new BestFitAllocator(allocation))));
if (retry_time_ > 0) {
auto* retry_allocator =
new RetryAllocator(std::move(allocator), retry_time_);
allocator.reset(retry_allocator);
}
allocator = WrapRetryAndBufferedAllocator(allocator, retry_time_,
FLAGS_enable_buffered_allocator);
if (FLAGS_enable_buffered_allocator) {
allocator.reset(new MultiBinBufferedAllocator(allocator));
}
return std::make_shared<AlignedAllocator<64u>>(std::move(allocator));
return std::make_shared<AlignedAllocator<4096>>(std::move(allocator));
}
bool IsAllocThreadSafe() const override { return true; }
......@@ -210,7 +220,7 @@ class AllocatorFacadePrivate {
break;
}
case AllocatorStrategy::kAutoGrowthBestFit: {
InitCPUAllocator();
InitAutoGrowthCPUAllocator();
InitAutoGrowthCUDAAllocator();
InitAutoGrowthCUDAPinnedAllocator();
WrapZeroSizeAllocator();
......@@ -224,15 +234,25 @@ class AllocatorFacadePrivate {
}
private:
void InitAutoGrowthCPUAllocator() {
auto cpu_allocator = std::make_shared<AlignedAllocator<4096>>(
std::make_shared<CPUAllocator>());
allocators_[platform::CPUPlace()] =
std::make_shared<AutoGrowthBestFitAllocator>(
cpu_allocator, platform::CpuMaxChunkSize(), 4096);
}
void InitAutoGrowthCUDAAllocator() {
#ifdef PADDLE_WITH_CUDA
int dev_cnt = platform::GetCUDADeviceCount();
for (int dev_id = 0; dev_id < dev_cnt; ++dev_id) {
auto cuda_allocator = std::make_shared<AlignedAllocator<4096>>(
std::make_shared<CUDAAllocator>(platform::CUDAPlace(dev_id)));
allocators_[platform::CUDAPlace(dev_id)] =
std::make_shared<AutoIncrementBestFitAllocator>(
cuda_allocator, platform::GpuMaxChunkSize(), 4096);
auto allocator = std::make_shared<AutoGrowthBestFitAllocator>(
cuda_allocator, platform::GpuMaxChunkSize(), 4096);
allocators_[platform::CUDAPlace(dev_id)] = WrapRetryAndBufferedAllocator(
allocator, FLAGS_gpu_allocator_retry_time, false);
}
#endif
}
......@@ -242,7 +262,7 @@ class AllocatorFacadePrivate {
auto cuda_pinned_allocator = std::make_shared<AlignedAllocator<4096>>(
std::make_shared<CPUPinnedAllocator>());
allocators_[platform::CUDAPinnedPlace()] =
std::make_shared<AutoIncrementBestFitAllocator>(
std::make_shared<AutoGrowthBestFitAllocator>(
cuda_pinned_allocator, platform::CUDAPinnedMaxChunkSize(), 4096);
#endif
}
......@@ -300,8 +320,7 @@ AllocatorFacade& AllocatorFacade::Instance() {
std::shared_ptr<Allocation> AllocatorFacade::AllocShared(
const platform::Place& place, size_t size, Allocator::Attr attr) {
return std::shared_ptr<Allocation>(Alloc(place, size, attr).release(),
AllocationDeleter());
return std::shared_ptr<Allocation>(Alloc(place, size, attr));
}
AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size,
......
......@@ -19,7 +19,9 @@
DEFINE_string(
allocator_strategy, "legacy",
"The allocation strategy. Legacy means the original allocator of Fluid."
"New means the experimental allocators of Fluid. in [legacy, new]");
"naive_best_fit means the experimental best fit allocator. "
"auto_growth_best_fit means the experimental auto growth best fit "
"allocator. Enum in [legacy, naive_best_fit, auto_growth_best_fit].");
namespace paddle {
namespace memory {
......@@ -28,7 +30,7 @@ namespace allocation {
static AllocatorStrategy GetStrategyFromFlag() {
if (FLAGS_allocator_strategy == "legacy") {
return AllocatorStrategy::kLegacy;
} else if (FLAGS_allocator_strategy == "navie_best_fit") {
} else if (FLAGS_allocator_strategy == "naive_best_fit") {
return AllocatorStrategy::kNaiveBestFit;
} else if (FLAGS_allocator_strategy == "auto_growth_best_fit") {
return AllocatorStrategy::kAutoGrowthBestFit;
......
......@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/auto_increment_best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h"
#include <algorithm>
#include <list>
#include <map>
......@@ -29,16 +29,14 @@ static size_t align(size_t size, size_t alignment) {
return remaining == 0 ? size : size + alignment - remaining;
}
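
For intuition, a hedged worked example of the align() helper named in the hunk header above; the computation of remaining (size % alignment) sits in the part of the function elided by the hunk.

// align() rounds a size up to the next multiple of the alignment:
//   align(1,    4096) == 4096
//   align(4096, 4096) == 4096
//   align(5000, 4096) == 8192
// AutoGrowthBestFitAllocator applies it both to the chunk size passed to the
// constructor below and to every requested size in AllocateImpl.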
AutoIncrementBestFitAllocator::AutoIncrementBestFitAllocator(
AutoGrowthBestFitAllocator::AutoGrowthBestFitAllocator(
const std::shared_ptr<Allocator> &underlying_allocator, size_t chunk_size,
size_t alignment)
: underlying_allocator_(underlying_allocator),
chunk_size_(align(chunk_size, alignment)),
alignment_(alignment) {}
Allocation *AutoIncrementBestFitAllocator::AllocateImpl(size_t size,
Attr attr) {
if (size == 0) return nullptr;
Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t size, Attr attr) {
size = align(size, alignment_);
std::lock_guard<std::mutex> guard(mtx_);
auto iter = free_blocks_.lower_bound(std::make_pair(size, nullptr));
......@@ -95,7 +93,7 @@ Allocation *AutoIncrementBestFitAllocator::AllocateImpl(size_t size,
return new Chunk::BlockAllocation(block_it);
}
void AutoIncrementBestFitAllocator::FreeImpl(Allocation *allocation) {
void AutoGrowthBestFitAllocator::FreeImpl(Allocation *allocation) {
auto &block_it = static_cast<Chunk::BlockAllocation *>(allocation)->block_it_;
auto &blocks = block_it->chunk_->blocks_;
......
......@@ -25,9 +25,9 @@ namespace paddle {
namespace memory {
namespace allocation {
class AutoIncrementBestFitAllocator : public Allocator {
class AutoGrowthBestFitAllocator : public Allocator {
public:
explicit AutoIncrementBestFitAllocator(
explicit AutoGrowthBestFitAllocator(
const std::shared_ptr<Allocator> &underlying_allocator, size_t chunk_size,
size_t alignment);
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#ifdef PADDLE_WITH_CUDA
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
DECLARE_int64(gpu_allocator_retry_time);
#endif
DECLARE_string(allocator_strategy);
namespace paddle {
namespace memory {
namespace allocation {
static inline size_t AlignTo(size_t size, size_t alignment = 4096) {
auto remaining = size % alignment;
return remaining == 0 ? size : size + alignment - remaining;
}
TEST(allocator, allocator) {
#ifdef PADDLE_WITH_CUDA
FLAGS_fraction_of_gpu_memory_to_use = 0.01;
FLAGS_gpu_allocator_retry_time = 500;
FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;
#endif
FLAGS_allocator_strategy = "auto_growth_best_fit";
auto &instance = AllocatorFacade::Instance();
platform::Place place;
size_t size = 1024;
{
place = platform::CPUPlace();
size = 1024;
auto cpu_allocation = instance.Alloc(place, size);
ASSERT_NE(cpu_allocation, nullptr);
ASSERT_NE(cpu_allocation->ptr(), nullptr);
ASSERT_EQ(cpu_allocation->place(), place);
ASSERT_EQ(cpu_allocation->size(), AlignTo(size));
}
#ifdef PADDLE_WITH_CUDA
{
place = platform::CUDAPlace(0);
size = 1024;
auto gpu_allocation = instance.Alloc(place, size);
ASSERT_NE(gpu_allocation, nullptr);
ASSERT_NE(gpu_allocation->ptr(), nullptr);
ASSERT_EQ(gpu_allocation->place(), place);
ASSERT_GE(gpu_allocation->size(), AlignTo(size));
}
{
// Allocate 2GB gpu memory
place = platform::CUDAPlace(0);
size = 2 * static_cast<size_t>(1 << 30);
auto gpu_allocation = instance.Alloc(place, size);
ASSERT_NE(gpu_allocation, nullptr);
ASSERT_NE(gpu_allocation->ptr(), nullptr);
ASSERT_EQ(gpu_allocation->place(), place);
ASSERT_GE(gpu_allocation->size(), AlignTo(size));
}
{
place = platform::CUDAPinnedPlace();
size = (1 << 20);
auto cuda_pinned_allocation =
instance.Alloc(platform::CUDAPinnedPlace(), 1 << 20);
ASSERT_NE(cuda_pinned_allocation, nullptr);
ASSERT_NE(cuda_pinned_allocation->ptr(), nullptr);
ASSERT_EQ(cuda_pinned_allocation->place(), place);
ASSERT_GE(cuda_pinned_allocation->size(), AlignTo(size));
}
#endif
}
} // namespace allocation
} // namespace memory
} // namespace paddle
......@@ -22,18 +22,18 @@
#include <iostream>
#include "paddle/fluid/memory/allocation/auto_increment_best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/cpu_allocator.h"
namespace paddle {
namespace memory {
namespace allocation {
TEST(allocator, auto_increment_best_fit_allocator) {
TEST(allocator, auto_growth_best_fit_allocator) {
auto cpu_allocator = std::make_shared<CPUAllocator>();
auto allocator =
std::make_shared<AutoIncrementBestFitAllocator>(cpu_allocator, 0, 4096);
std::make_shared<AutoGrowthBestFitAllocator>(cpu_allocator, 0, 4096);
std::mutex mtx;
std::condition_variable cv;
......@@ -60,13 +60,9 @@ TEST(allocator, auto_increment_best_fit_allocator) {
}
cv.notify_all();
thread_main();
for (auto &th : ths) {
th.join();
}
std::cout << "test ends" << std::endl;
}
} // namespace allocation
......
......@@ -14,6 +14,7 @@
#include "paddle/fluid/memory/allocation/buffered_allocator.h"
#include <gtest/gtest.h>
#include <utility>
#include "paddle/fluid/memory/allocation/best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/cpu_allocator.h"
#include "paddle/fluid/memory/allocation/locked_allocator.h"
......
......@@ -37,8 +37,6 @@ DEFINE_bool(init_allocated_mem, false,
"that initializing the allocated memory with a small value "
"during unit testing.");
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_double(initial_gpu_memory_in_mb);
DECLARE_double(reallocate_gpu_memory_in_mb);
DECLARE_bool(benchmark);
namespace paddle {
......@@ -72,8 +70,7 @@ BuddyAllocator *GetCPUBuddyAllocator() {
std::call_once(init_flag, []() {
a = new detail::BuddyAllocator(
std::unique_ptr<detail::SystemAllocator>(new detail::CPUAllocator),
platform::CpuMinChunkSize(), platform::CpuMaxChunkSize(),
platform::CpuMaxChunkSize());
platform::CpuMinChunkSize(), platform::CpuMaxChunkSize());
});
return a;
......@@ -147,28 +144,16 @@ class GPUBuddyAllocatorList {
PADDLE_ENFORCE(dev_id < flags_.size(), "Invalid device id %s", dev_id);
std::call_once(flags_[dev_id], [this, dev_id] {
platform::SetDeviceId(dev_id);
size_t first_size = platform::GpuFirstAllocateChunkSize();
size_t re_size = platform::GpuReAllocateChunkSize();
allocators_[dev_id] =
new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
new detail::GPUAllocator(dev_id)),
platform::GpuMinChunkSize(), first_size, re_size);
VLOG(2) << "\n\nNOTE: each GPU device use "
<< string::HumanReadableSize(first_size) << "(initial chunk) "
<< string::HumanReadableSize(re_size) << "(reallocate chunk) "
<< "% of GPU memory.\n"
<< "You can set GFlags environment variable '"
<< "FLAGS_fraction_of_gpu_memory_to_use"
<< "' or "
"'FLAGS_initial_gpu_memory_in_mb/"
"FLAGS_reallocate_gpu_memory_in_mb' to change the fraction "
"of GPU usage.\n\n";
VLOG(2) << "Currently, FLAGS_fraction_of_gpu_memory_to_use="
<< FLAGS_fraction_of_gpu_memory_to_use << ", "
<< "FLAGS_initial_gpu_memory_in_mb="
<< FLAGS_initial_gpu_memory_in_mb << ", "
<< "FLAGS_reallocate_gpu_memory_in_mb="
<< FLAGS_reallocate_gpu_memory_in_mb;
allocators_[dev_id] = new BuddyAllocator(
std::unique_ptr<detail::SystemAllocator>(
new detail::GPUAllocator(dev_id)),
platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
VLOG(10) << "\n\nNOTE: each GPU device use "
<< FLAGS_fraction_of_gpu_memory_to_use * 100
<< "% of GPU memory.\n"
<< "You can set GFlags environment variable '"
<< "FLAGS_fraction_of_gpu_memory_to_use"
<< "' to change the fraction of GPU usage.\n\n";
});
return allocators_[dev_id];
}
......@@ -251,7 +236,6 @@ BuddyAllocator *GetCUDAPinnedBuddyAllocator() {
ba = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
new detail::CUDAPinnedAllocator),
platform::CUDAPinnedMinChunkSize(),
platform::CUDAPinnedMaxChunkSize(),
platform::CUDAPinnedMaxChunkSize());
});
......
......@@ -14,8 +14,10 @@
#include "paddle/fluid/memory/allocation/locked_allocator.h"
#include <mutex> // NOLINT
#include <utility>
#include "paddle/fluid/memory/allocation/allocation_with_underlying.h"
#include "paddle/fluid/platform/lock_guard_ptr.h"
namespace paddle {
namespace memory {
namespace allocation {
......
......@@ -17,20 +17,37 @@
#include <cctype>
#include <fstream>
#include <limits>
#include <mutex> // NOLINT
#include <sstream>
#include <string>
#include <utility>
#include "paddle/fluid/platform/lock_guard_ptr.h"
DEFINE_double(buffered_allocator_excess_times, 2,
"Tolerant memory size times of buffered_allocator");
DEFINE_double(
buffered_allocator_excess_times, 2,
"Excess size multiple allowed when BufferedAllocator reuses cached "
"memory. BufferedAllocator tries to reuse previously freed allocations, "
"but the size of a freed allocation may not exactly match the requested "
"size, so this flag bounds how many times larger a cached allocation may "
"be than the request it serves. The best value is still to be tuned.");
DEFINE_string(division_plan_path, "", "Division plan file path");
DEFINE_string(
buffered_allocator_division_plan_path, "",
"The file path that determines the memory size division plan of "
"BufferedAllocator. If it is empty, the default division plan is used. "
"The file must be a text file in which each line gives one bound of the "
"division plan. For example, if the file has 3 lines, '500M', '1G' and "
"'2G', the division plan is [0, 500M), [500M, 1G), [1G, 2G) and "
"[2G, +inf). An allocation request whose size falls in the last interval "
"of the division plan is dispatched to underlying_allocator directly and "
"is not cached when freed.");
namespace paddle {
namespace memory {
namespace allocation {
std::string TrimStringAndToLowerCase(const std::string &str) {
static std::string TrimStringAndToUpperCase(const std::string &str) {
auto not_space = [](char ch) { return std::isspace(ch) == 0; };
auto first_idx = static_cast<size_t>(
std::find_if(str.begin(), str.end(), not_space) - str.begin());
......@@ -38,41 +55,69 @@ std::string TrimStringAndToLowerCase(const std::string &str) {
std::find_if(str.rbegin(), str.rend(), not_space) - str.rbegin());
if (first_idx == str.size() || last_idx == str.size()) return "";
last_idx = str.size() - 1 - last_idx;
last_idx = str.size() - last_idx;
auto ret = str.substr(first_idx, last_idx - first_idx);
std::for_each(ret.begin(), ret.end(),
[](char &ch) { ch = std::tolower(ch); });
[](char &ch) { ch = std::toupper(ch); });
return ret;
}
static size_t ParseStringToBytes(const std::string &str) {
std::string ret = str;
if (ret.back() == 'b') {
ret.pop_back();
namespace {
enum DivisionPlanFileStatus { kEOF, kException, kNormal };
} // NOLINT
static size_t ParseStringToBytes(const std::string &original_str,
DivisionPlanFileStatus *ret_code) {
std::string str = TrimStringAndToUpperCase(original_str);
if (str.empty()) {
*ret_code = kEOF;
return 0;
}
if (str.back() == 'B') {
str.pop_back();
if (str.empty()) {
*ret_code = kException;
return 0;
}
}
PADDLE_ENFORCE(!ret.empty(), "Wrong format: %s", str);
size_t multiples = 1;
switch (ret.back()) {
case 'g':
switch (str.back()) {
case 'G':
multiples *= (static_cast<size_t>(1) << 30);
break;
case 'm':
case 'M':
multiples *= (static_cast<size_t>(1) << 20);
break;
case 'k':
case 'K':
multiples *= (static_cast<size_t>(1) << 10);
break;
default:
break;
}
if (multiples != 1) ret.pop_back();
ret = TrimStringAndToLowerCase(ret);
double ret_val = 0.0;
std::stringstream ss(ret);
PADDLE_ENFORCE((ss >> ret_val).good(), "Wrong format %s", str);
return static_cast<size_t>(ret_val * multiples);
if (multiples != 1) {
str.pop_back();
if (str.empty()) {
*ret_code = kException;
return 0;
}
}
str = TrimStringAndToUpperCase(str);
double mem_val = -1.0;
std::stringstream ss(str);
if (!(ss >> mem_val) || mem_val < 0) {
*ret_code = kException;
return 0;
}
*ret_code = kNormal;
return static_cast<size_t>(mem_val * multiples);
}
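
A few hedged worked examples of ParseStringToBytes() as defined above, consistent with the division-plan unit test added later in this commit (s is a DivisionPlanFileStatus):

// ParseStringToBytes("100b",   &s) -> 100,                                   s == kNormal
// ParseStringToBytes("300.7K", &s) -> static_cast<size_t>(300.7 * 1024),     s == kNormal
// ParseStringToBytes("1.02gB", &s) -> static_cast<size_t>(1.02 * (1 << 30)), s == kNormal
// ParseStringToBytes("",       &s) -> 0, s == kEOF
// ParseStringToBytes("abc",    &s) -> 0, s == kException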
static std::string GetDebugStringOfPlan(const std::vector<size_t> &plan) {
......@@ -84,16 +129,27 @@ static std::string GetDebugStringOfPlan(const std::vector<size_t> &plan) {
return ret + "]";
}
static std::vector<size_t> ReadDivisionPlanFromFile(
std::vector<size_t> ReadBufferedAllocatorDivisionPlanFromFile(
const std::string &filepath) {
std::ifstream is(filepath.c_str());
PADDLE_ENFORCE(is.good(), "File not exist");
PADDLE_ENFORCE(is.good(), "File %s does not exist", filepath);
std::string str;
std::vector<size_t> plan;
size_t line_num = 1;
while (std::getline(is, str).good()) {
str = TrimStringAndToLowerCase(str);
if (str.empty()) break;
plan.push_back(ParseStringToBytes(str));
DivisionPlanFileStatus status;
size_t ret = ParseStringToBytes(str, &status);
if (status == kEOF) {
break;
}
if (status == kException) {
PADDLE_THROW(
"Invalid format in line %d of file %s: '%s'. Only support B, KB, MB, "
"GB.",
line_num, filepath, str);
}
plan.push_back(ret);
++line_num;
}
return plan;
}
......@@ -110,11 +166,12 @@ static void CheckAndModifyMemoryDivisionPlan(
}
PADDLE_ENFORCE(is_strictly_sorted, "Division plan must be strictly sorted");
// Insert 0 and remove MAX from division plan for clean binary searching code
// Insert 0 into division plan for clean binary searching code
if (division_plan->empty() || division_plan->front() != 0) {
division_plan->insert(division_plan->begin(), 0);
}
// Remove MAX from division plan for clean binary searching code
constexpr auto kSizeTypeMax = std::numeric_limits<size_t>::max();
if (division_plan->back() == kSizeTypeMax) {
division_plan->pop_back();
......@@ -124,21 +181,17 @@ static void CheckAndModifyMemoryDivisionPlan(
}
static std::vector<size_t> GetDefaultDivisionPlan() {
if (!FLAGS_division_plan_path.empty()) {
return ReadDivisionPlanFromFile(FLAGS_division_plan_path);
if (!FLAGS_buffered_allocator_division_plan_path.empty()) {
return ReadBufferedAllocatorDivisionPlanFromFile(
FLAGS_buffered_allocator_division_plan_path);
}
// Default division plan is 4K, 8K, 16K, ..., 512M, 1G
constexpr size_t kMaxLogSize = 30;
std::vector<size_t> plan;
for (size_t i = 12; i <= kMaxLogSize; ++i) {
plan.push_back(static_cast<size_t>(1) << i);
}
/*
for (size_t i = 0; i < sizeof(size_t) * 8; ++i) {
plan.push_back(static_cast<size_t>(1) << i);
}
*/
return plan;
}
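
For intuition about how the plan is consumed, a hedged sketch follows; FindBin is a hypothetical stand-in, and the real lookup is done by FindDivisionPlanBinIndex, which is not part of this hunk. After CheckAndModifyMemoryDivisionPlan has prepended 0, a request of size s belongs to bin i when plan[i] <= s < plan[i + 1], and sizes falling in the last interval are never cached by FreeImpl.

#include <algorithm>
#include <cstddef>
#include <vector>

// Hypothetical stand-in, for illustration only. With plan = {0, 4K, 8K, ..., 1G}:
//   FindBin(plan, 5000)           -> the bin covering [4K, 8K)
//   FindBin(plan, size_t{2} << 30) -> plan.size() - 1, i.e. the last interval
//                                     [1G, +inf), which FreeImpl never caches.
static size_t FindBin(const std::vector<size_t>& plan, size_t size) {
  auto it = std::upper_bound(plan.begin(), plan.end(), size);
  return static_cast<size_t>(it - plan.begin()) - 1;
}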
......@@ -164,6 +217,7 @@ MultiBinBufferedAllocator::MultiBinBufferedAllocator(
division_plan_(division_plan) {
CheckAndModifyMemoryDivisionPlan(&division_plan_);
allocations_.resize(division_plan_.size() - 1);
accumulated_cache_size_.assign(division_plan_.size() - 1, 0UL);
mtx_.resize(division_plan_.size() - 1);
if (underlying_allocator_->IsAllocThreadSafe()) {
for (auto &mtx : mtx_) {
......@@ -182,28 +236,22 @@ void MultiBinBufferedAllocator::FreeImpl(Allocation *allocation) {
platform::LockGuardPtr<std::mutex> guard(mtx_[bin_index]);
allocations_[bin_index].emplace(allocation->size(),
AllocationPtr(allocation));
accumulated_cache_size_[bin_index] += allocation->size();
} else {
underlying_allocator_->Free(allocation);
}
}
// bin_index is not used currently.
// Maybe we can design more flexible FreeCache strategy based on bin_index
// and require size.
size_t MultiBinBufferedAllocator::FreeCache(size_t size, size_t bin_index) {
size_t MultiBinBufferedAllocator::ClearCache() {
size_t accumulated_size = 0;
// FIXME(zjl): free the largest first when there is no extra
for (size_t i = allocations_.size() - 1; i != static_cast<size_t>(-1); --i) {
platform::LockGuardPtr<std::mutex> lock(mtx_[i]);
if (allocations_[i].empty()) continue;
auto it = --allocations_[i].end();
do {
accumulated_size += it->second->size();
underlying_allocator_->Free(it->second.release());
allocations_[i].erase(it--);
if (accumulated_size >= size) {
return accumulated_size;
}
} while (!allocations_[i].empty());
allocations_[i].clear();
accumulated_size += accumulated_cache_size_[i];
accumulated_cache_size_[i] = 0;
}
return accumulated_size;
}
......@@ -212,10 +260,6 @@ Allocation *MultiBinBufferedAllocator::AllocateImpl(size_t size, Attr attr) {
auto bin_index = FindDivisionPlanBinIndex(division_plan_, size);
auto upper_size = TolerantUpperSize(size);
// if (bin_index >= allocations_.size()) {
// VLOG(2) << "Allocate " << size << " from underlying directly";
//}
for (; bin_index < allocations_.size() &&
upper_size >= division_plan_[bin_index];
++bin_index) {
......@@ -226,6 +270,7 @@ Allocation *MultiBinBufferedAllocator::AllocateImpl(size_t size, Attr attr) {
size_t sz = it->second->size();
auto ret = std::move(it->second);
allocation.erase(it);
accumulated_cache_size_[bin_index] -= sz;
VLOG(3) << "Allocate " << sz << "(required " << size
<< ") from cache directly";
return ret.release();
......@@ -239,10 +284,7 @@ Allocation *MultiBinBufferedAllocator::AllocateImpl(size_t size, Attr attr) {
VLOG(2) << "Allocate " << size << " from underlying directly";
return ret;
} catch (BadAlloc &) {
VLOG(1) << retry_time << "-th BadAlloc raises, try to free " << size
<< " bytes caches";
// size_t actual_free_size = FreeCache(size, bin_index);
size_t actual_free_size = FreeCache(-1UL, bin_index);
size_t actual_free_size = ClearCache();
VLOG(1) << retry_time << "-th free " << actual_free_size
<< " bytes caches";
if (actual_free_size == 0) throw;
......@@ -251,6 +293,8 @@ Allocation *MultiBinBufferedAllocator::AllocateImpl(size_t size, Attr attr) {
}
}
void UseMultiBinBufferedAllocatorGFlags() {}
} // namespace allocation
} // namespace memory
} // namespace paddle
......@@ -16,6 +16,8 @@
#include <map>
#include <memory>
#include <mutex> // NOLINT
#include <string>
#include <vector>
#include "paddle/fluid/memory/allocation/allocator.h"
......@@ -24,6 +26,9 @@ namespace paddle {
namespace memory {
namespace allocation {
std::vector<size_t> ReadBufferedAllocatorDivisionPlanFromFile(
const std::string& filepath);
class MultiBinBufferedAllocator : public Allocator {
public:
explicit MultiBinBufferedAllocator(
......@@ -34,21 +39,24 @@ class MultiBinBufferedAllocator : public Allocator {
bool IsAllocThreadSafe() const override { return mtx_.front() != nullptr; }
void ClearCache() { FreeCache(static_cast<size_t>(-1), 0); }
size_t ClearCache();
const std::vector<size_t>& DivisionPlan() const { return division_plan_; }
protected:
Allocation* AllocateImpl(size_t size, Attr attr) override;
void FreeImpl(Allocation* allocation) override;
private:
size_t FreeCache(size_t size, size_t bin_index);
std::shared_ptr<Allocator> underlying_allocator_;
std::vector<std::multimap<size_t, AllocationPtr>> allocations_;
std::vector<size_t> accumulated_cache_size_;
std::vector<size_t> division_plan_;
std::vector<std::unique_ptr<std::mutex>> mtx_;
};
extern void UseMultiBinBufferedAllocatorGFlags();
} // namespace allocation
} // namespace memory
} // namespace paddle
......@@ -14,6 +14,7 @@
#include "paddle/fluid/memory/allocation/multi_bin_buffered_allocator.h"
#include <gtest/gtest.h>
#include <utility>
#include <vector>
#include "paddle/fluid/memory/allocation/best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/cpu_allocator.h"
......@@ -123,10 +124,31 @@ TEST(buffered_allocator, lazy_free) {
{
underlying_allocator->ResetCounter();
allocator->ClearCache();
size_t cache_size = allocator->ClearCache();
ASSERT_EQ(cache_size, static_cast<size_t>(alloc_size + 2048));
ASSERT_EQ(underlying_allocator->GetAllocCount(), kZero);
ASSERT_EQ(underlying_allocator->GetFreeCount(), kTwo);
}
{
underlying_allocator->ResetCounter();
auto p = allocator->Allocate(allocator->DivisionPlan().back(),
allocator->kDefault);
ASSERT_EQ(underlying_allocator->GetAllocCount(), kOne);
ASSERT_EQ(underlying_allocator->GetFreeCount(), kZero);
}
ASSERT_EQ(underlying_allocator->GetFreeCount(), kOne);
{
underlying_allocator->ResetCounter();
auto p = allocator->Allocate(allocator->DivisionPlan().back() - 1,
allocator->kDefault);
ASSERT_EQ(underlying_allocator->GetAllocCount(), kOne);
ASSERT_EQ(underlying_allocator->GetFreeCount(), kZero);
}
ASSERT_EQ(underlying_allocator->GetFreeCount(), kZero);
}
}
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#ifdef PADDLE_WITH_CUDA
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
DECLARE_int64(gpu_allocator_retry_time);
#endif
DECLARE_bool(enable_buffered_allocator);
DECLARE_string(allocator_strategy);
namespace paddle {
namespace memory {
namespace allocation {
TEST(allocator, allocator) {
#ifdef PADDLE_WITH_CUDA
FLAGS_fraction_of_gpu_memory_to_use = 0.01;
FLAGS_gpu_allocator_retry_time = 500;
FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;
#endif
FLAGS_allocator_strategy = "naive_best_fit";
FLAGS_enable_buffered_allocator = true;
auto &instance = AllocatorFacade::Instance();
platform::Place place;
size_t size = 1024;
{
place = platform::CPUPlace();
size = 1024;
auto cpu_allocation = instance.Alloc(place, size);
ASSERT_NE(cpu_allocation, nullptr);
ASSERT_NE(cpu_allocation->ptr(), nullptr);
ASSERT_EQ(cpu_allocation->place(), place);
ASSERT_EQ(cpu_allocation->size(), size);
}
#ifdef PADDLE_WITH_CUDA
{
place = platform::CUDAPlace(0);
size = 1024;
auto gpu_allocation = instance.Alloc(place, size);
ASSERT_NE(gpu_allocation, nullptr);
ASSERT_NE(gpu_allocation->ptr(), nullptr);
ASSERT_EQ(gpu_allocation->place(), place);
ASSERT_GE(gpu_allocation->size(), size);
}
{
// Allocate 2GB gpu memory
place = platform::CUDAPlace(0);
size = 2 * static_cast<size_t>(1 << 30);
auto gpu_allocation = instance.Alloc(place, size);
ASSERT_NE(gpu_allocation, nullptr);
ASSERT_NE(gpu_allocation->ptr(), nullptr);
ASSERT_EQ(gpu_allocation->place(), place);
ASSERT_GE(gpu_allocation->size(), size);
}
{
place = platform::CUDAPinnedPlace();
size = (1 << 20);
auto cuda_pinned_allocation =
instance.Alloc(platform::CUDAPinnedPlace(), 1 << 20);
ASSERT_NE(cuda_pinned_allocation, nullptr);
ASSERT_NE(cuda_pinned_allocation->ptr(), nullptr);
ASSERT_EQ(cuda_pinned_allocation->place(), place);
ASSERT_GE(cuda_pinned_allocation->size(), size);
}
#endif
}
} // namespace allocation
} // namespace memory
} // namespace paddle
......@@ -18,6 +18,7 @@
#include <condition_variable> // NOLINT
#include <memory>
#include <mutex> // NOLINT
#include <utility>
#include "paddle/fluid/memory/allocation/allocator.h"
namespace paddle {
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/multi_bin_buffered_allocator.h"
DECLARE_string(buffered_allocator_division_plan_path);
namespace paddle {
namespace memory {
namespace allocation {
TEST(buffered_allocator, division_plan) {
std::string path = "/tmp/buffered_allocator_divison_plan";
FLAGS_buffered_allocator_division_plan_path = path;
{
std::vector<std::string> plan(
{"100b", "300.7K", "500.3m", "1.02gB", "2g", "4G"});
std::ofstream os(path);
for (auto &p : plan) {
os << p << std::endl;
}
os.close();
}
auto plan = ReadBufferedAllocatorDivisionPlanFromFile(
FLAGS_buffered_allocator_division_plan_path);
ASSERT_EQ(plan.size(), 6UL);
ASSERT_EQ(plan[0], 100UL);
ASSERT_EQ(plan[1], static_cast<size_t>(300.7 * 1024));
ASSERT_EQ(plan[2], static_cast<size_t>(500.3 * 1024 * 1024));
ASSERT_EQ(plan[3], static_cast<size_t>(1.02 * 1024 * 1024 * 1024));
ASSERT_EQ(plan[4], static_cast<size_t>(2.0 * 1024 * 1024 * 1024));
ASSERT_EQ(plan[5], static_cast<size_t>(4.0 * 1024 * 1024 * 1024));
}
} // namespace allocation
} // namespace memory
} // namespace paddle
......@@ -22,21 +22,22 @@ bool ZeroSizeAllocator::IsAllocThreadSafe() const {
return underlying_allocator_->IsAllocThreadSafe();
}
void ZeroSizeAllocator::FreeImpl(Allocation *allocation) {
if (dynamic_cast<ZeroSizeAllocation *>(allocation)) {
delete allocation;
Allocation *ZeroSizeAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
if (size == 0) {
return new Allocation(nullptr, 0, place_);
} else {
underlying_allocator_->Free(allocation);
return underlying_allocator_->Allocate(size, attr).release();
}
}
Allocation *ZeroSizeAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
if (size == 0) {
return new ZeroSizeAllocation(place_);
void ZeroSizeAllocator::FreeImpl(Allocation *allocation) {
if (allocation->size() == 0) {
delete allocation;
} else {
return underlying_allocator_->Allocate(size, attr).release();
underlying_allocator_->Free(allocation);
}
}
} // namespace allocation
} // namespace memory
} // namespace paddle
......@@ -13,6 +13,7 @@
// limitations under the License.
#pragma once
#include <memory>
#include <utility>
#include "paddle/fluid/memory/allocation/allocator.h"
......@@ -23,12 +24,6 @@ namespace allocation {
// The allocator handles requests whose size is zero. It always returns an
// allocation even when the requested size is zero; however, the
// allocation.ptr() is nullptr.
class ZeroSizeAllocation : public Allocation {
public:
explicit ZeroSizeAllocation(const platform::Place& p)
: Allocation(nullptr, 0, p) {}
};
class ZeroSizeAllocator : public Allocator {
public:
ZeroSizeAllocator(std::shared_ptr<Allocator> underlying_allocator,
......
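
A hedged usage sketch (not part of this commit) of the zero-size path described above, going through AllocatorFacade, whose strategies wrap each place's allocator with ZeroSizeAllocator via WrapZeroSizeAllocator():

#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/place.h"

// Requesting zero bytes still yields a live Allocation object; its ptr() is
// nullptr and its size() is 0, so FreeImpl() above deletes the object
// instead of handing it back to the underlying allocator.
void ZeroSizeRequestSketch() {
  auto& facade = paddle::memory::allocation::AllocatorFacade::Instance();
  auto zero = facade.Alloc(paddle::platform::CPUPlace(), 0);
  // zero != nullptr, zero->ptr() == nullptr, zero->size() == 0
}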
......@@ -25,11 +25,9 @@ namespace detail {
BuddyAllocator::BuddyAllocator(
std::unique_ptr<SystemAllocator> system_allocator, size_t min_chunk_size,
size_t first_allocate_chunk_size, size_t reallocate_chunk_size)
size_t max_chunk_size)
: min_chunk_size_(min_chunk_size),
first_allocate_chunk_size_(first_allocate_chunk_size),
reallocate_chunk_size_(reallocate_chunk_size),
max_chunk_size_(first_allocate_chunk_size),
max_chunk_size_(max_chunk_size),
cache_(system_allocator->UseGpu()),
system_allocator_(std::move(system_allocator)) {}
......@@ -38,10 +36,9 @@ BuddyAllocator::~BuddyAllocator() {
"have actually been freed";
while (!pool_.empty()) {
auto block = static_cast<MemoryBlock*>(std::get<2>(*pool_.begin()));
auto desc = cache_.load(block);
VLOG(10) << "Free from block (" << block << ", " << desc.size << ")";
VLOG(10) << "Free from block (" << block << ", " << max_chunk_size_ << ")";
system_allocator_->Free(block, desc.size, desc.index);
system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
cache_.invalidate(block);
pool_.erase(pool_.begin());
}
......@@ -66,7 +63,7 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) {
// if the allocation is huge, send directly to the system allocator
if (size > max_chunk_size_) {
VLOG(10) << "Allocate from system allocator.";
return SystemAlloc(size, false);
return SystemAlloc(size);
}
// query and allocate from the existing chunk
......@@ -75,9 +72,9 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) {
// refill the pool if failure
if (it == pool_.end()) {
it = RefillPool();
// if still failure, try to allocate from SystemAllocator
// if still failure, fail fatally
if (it == pool_.end()) {
return SystemAlloc(size, false);
return nullptr;
}
} else {
VLOG(10) << "Allocation from existing memory block " << std::get<2>(*it)
......@@ -101,7 +98,7 @@ void BuddyAllocator::Free(void* p) {
VLOG(10) << "Free from address " << block;
if (block->type(cache_) == MemoryBlock::UNMANAGED_HUGE_CHUNK) {
if (block->type(cache_) == MemoryBlock::HUGE_CHUNK) {
VLOG(10) << "Free directly from system allocator";
system_allocator_->Free(block, block->total_size(cache_),
block->index(cache_));
......@@ -171,12 +168,9 @@ void BuddyAllocator::Free(void* p) {
size_t BuddyAllocator::Used() { return total_used_; }
size_t BuddyAllocator::GetMinChunkSize() { return min_chunk_size_; }
size_t BuddyAllocator::GetMaxChunkSize() {
std::lock_guard<std::mutex> lock(mutex_);
return max_chunk_size_;
}
size_t BuddyAllocator::GetMaxChunkSize() { return max_chunk_size_; }
void* BuddyAllocator::SystemAlloc(size_t size, bool is_managed) {
void* BuddyAllocator::SystemAlloc(size_t size) {
size_t index = 0;
void* p = system_allocator_->Alloc(&index, size);
......@@ -184,23 +178,25 @@ void* BuddyAllocator::SystemAlloc(size_t size, bool is_managed) {
if (p == nullptr) return nullptr;
static_cast<MemoryBlock*>(p)->init(
&cache_, is_managed ? MemoryBlock::MANAGED_HUGE_CHUNK
: MemoryBlock::UNMANAGED_HUGE_CHUNK,
index, size, nullptr, nullptr);
static_cast<MemoryBlock*>(p)->init(&cache_, MemoryBlock::HUGE_CHUNK, index,
size, nullptr, nullptr);
return static_cast<MemoryBlock*>(p)->data();
}
BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
if (total_used_ + total_free_ > 0) {
max_chunk_size_ = reallocate_chunk_size_;
#ifdef PADDLE_WITH_CUDA
if (system_allocator_->UseGpu()) {
if ((total_used_ + total_free_) == 0) {
// Compute the maximum allocation size for the first allocation.
max_chunk_size_ = platform::GpuMaxChunkSize();
}
}
#endif
// Allocate a new maximum sized block
size_t index = 0;
size_t chunk_size = max_chunk_size_;
void* p = system_allocator_->Alloc(&index, chunk_size);
void* p = system_allocator_->Alloc(&index, max_chunk_size_);
if (p == nullptr) return pool_.end();
......@@ -208,7 +204,7 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
<< " from system allocator";
static_cast<MemoryBlock*>(p)->init(&cache_, MemoryBlock::FREE_CHUNK, index,
chunk_size, nullptr, nullptr);
max_chunk_size_, nullptr, nullptr);
// gpu fallback allocation
if (system_allocator_->UseGpu() &&
......@@ -216,10 +212,10 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
fallback_alloc_count_++;
}
total_free_ += chunk_size;
total_free_ += max_chunk_size_;
// dump the block into pool
return pool_.insert(IndexSizeAddress(index, chunk_size, p)).first;
return pool_.insert(IndexSizeAddress(index, max_chunk_size_, p)).first;
}
BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) {
......@@ -275,24 +271,27 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it,
void BuddyAllocator::CleanIdleFallBackAlloc() {
// If fallback allocation does not exist, return directly
if (!fallback_alloc_count_ || !system_allocator_->UseGpu()) return;
if (!fallback_alloc_count_) return;
for (auto pool = pool_.rbegin(); pool != pool_.rend();) {
// If free memory block less than max_chunk_size_, return directly
if (std::get<1>(*pool) < max_chunk_size_) return;
MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));
auto desc = cache_.load(block);
if (desc.index == 0) {
// If no GPU fallback allocator, return
if (!system_allocator_->UseGpu() || block->index(cache_) == 0) {
return;
}
VLOG(10) << "Return block " << block << " to fallback allocator.";
system_allocator_->Free(block, desc.size, block->index(cache_));
system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
cache_.invalidate(block);
pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
total_free_ -= desc.size;
total_free_ -= max_chunk_size_;
fallback_alloc_count_--;
// If no fall allocation exists, return directly
......@@ -316,21 +315,19 @@ void BuddyAllocator::CleanIdleNormalAlloc() {
if (!shall_free_alloc()) return;
for (auto pool = pool_.rbegin(); pool != pool_.rend();) {
MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));
auto desc = cache_.load(block);
// If free memory block less than max_chunk_size_, return directly
if (std::get<1>(*pool) < max_chunk_size_) return;
if (desc.type != MemoryBlock::MANAGED_HUGE_CHUNK) {
return;
}
MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));
VLOG(10) << "Return block " << block << " to base allocator.";
system_allocator_->Free(block, desc.size, desc.index);
system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
cache_.invalidate(block);
pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
total_free_ -= desc.size;
total_free_ -= max_chunk_size_;
if (!shall_free_alloc()) return;
}
......
......@@ -34,8 +34,7 @@ namespace detail {
class BuddyAllocator {
public:
BuddyAllocator(std::unique_ptr<SystemAllocator> system_allocator,
size_t min_chunk_size, size_t first_allocate_chunk_size,
size_t reallocate_chunk_size);
size_t min_chunk_size, size_t max_chunk_size);
~BuddyAllocator();
......@@ -58,7 +57,7 @@ class BuddyAllocator {
using PoolSet = std::set<IndexSizeAddress>;
/*! \brief Allocate fixed-size memory from system */
void* SystemAlloc(size_t size, bool is_managed = true);
void* SystemAlloc(size_t size);
/*! \brief If existing chunks are not suitable, refill pool */
PoolSet::iterator RefillPool();
......@@ -88,11 +87,7 @@ class BuddyAllocator {
size_t total_free_ = 0; // the total size of free memory
size_t min_chunk_size_; // the minimum size of each chunk
size_t first_allocate_chunk_size_;
size_t reallocate_chunk_size_;
size_t max_chunk_size_;
size_t max_chunk_size_; // the maximum size of each chunk
private:
/**
......
......@@ -27,11 +27,10 @@ class MetadataCache;
// MemoryBlock::Desc and the payload.
struct MemoryBlock {
enum Type {
FREE_CHUNK, // memory is free and idle
ARENA_CHUNK, // memory is being occupied
MANAGED_HUGE_CHUNK, // memory is huge and out of management
UNMANAGED_HUGE_CHUNK, // memory is huge and managed by allocator
INVALID_CHUNK // memory is invalid
FREE_CHUNK, // memory is free and idle
ARENA_CHUNK, // memory is being occupied
HUGE_CHUNK, // memory is out of management
INVALID_CHUNK // memory is invalid
};
// init saves the MemoryBlock::Desc of the memory block in a MetadataCache.
......
......@@ -38,22 +38,6 @@ DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use,
"additional trunks of the same size will be requested from gpu "
"until the gpu has no memory left for another trunk.");
DEFINE_double(
initial_gpu_memory_in_mb, -1.0,
"GPU memory chunk size in MB."
"Allocator would allocate FLAGS_initial_gpu_memory_in_mb size "
"chunk first and reallocate FLAGS_reallocate_gpu_memory_in_mb size "
"chunk when the first chunk is not enough. This flag has higher priority "
"than FLAGS_fraction_of_gpu_memory_to_use. Disable when less than 0.");
DEFINE_double(reallocate_gpu_memory_in_mb, -1.0,
"GPU memory chunk size in MB."
"If FLAGS_initial_gpu_memory_in_mb is set and "
"FLAGS_reallocate_gpu_memory_in_mb "
"is less than 0, it would be replaced by "
"FLAGS_initial_gpu_memory_in_mb. Disable "
"when FLAGS_initial_gpu_memory_in_mb is less than 0.");
DEFINE_bool(
enable_cublas_tensor_op_math, false,
"The enable_cublas_tensor_op_math indicate whether to use Tensor Core, "
......@@ -227,54 +211,13 @@ size_t GpuMaxChunkSize() {
size_t allocating = static_cast<size_t>(FLAGS_fraction_of_gpu_memory_to_use *
(total - reserving));
PADDLE_ENFORCE_LE(allocating, available,
"Insufficient GPU memory to allocation.");
return allocating;
}
size_t GpuFirstAllocateChunkSize() {
if (FLAGS_initial_gpu_memory_in_mb <= 0) {
return GpuMaxChunkSize();
}
size_t total = 0;
size_t available = 0;
GpuMemoryUsage(&available, &total);
VLOG(10) << "GPU Usage " << available / 1024 / 1024 << "M/"
<< total / 1024 / 1024 << "M";
size_t initial_mem =
static_cast<size_t>(FLAGS_initial_gpu_memory_in_mb * (1 << 20));
PADDLE_ENFORCE_LE(initial_mem, available,
"Insufficient GPU memory to allocation.");
return initial_mem;
}
size_t GpuReAllocateChunkSize() {
if (FLAGS_initial_gpu_memory_in_mb <= 0) {
return GpuMaxChunkSize();
}
double reallocate_mem = FLAGS_reallocate_gpu_memory_in_mb;
if (reallocate_mem < 0) {
PADDLE_ENFORCE(FLAGS_initial_gpu_memory_in_mb > 0,
"FLAGS_init_gpu_memory_to_use_mb must be larger than 0");
reallocate_mem = FLAGS_initial_gpu_memory_in_mb;
}
size_t total = 0;
size_t available = 0;
GpuMemoryUsage(&available, &total);
VLOG(10) << "GPU Usage " << available / 1024 / 1024 << "M/"
<< total / 1024 / 1024 << "M";
size_t realloc_mem = static_cast<size_t>(reallocate_mem * (1 << 20));
PADDLE_ENFORCE_LE(realloc_mem, available,
"Insufficient GPU memory to allocation.");
return realloc_mem;
}
void GpuMemcpyAsync(void *dst, const void *src, size_t count,
enum cudaMemcpyKind kind, cudaStream_t stream) {
PADDLE_ENFORCE(cudaMemcpyAsync(dst, src, count, kind, stream),
......
......@@ -66,12 +66,6 @@ size_t GpuMinChunkSize();
//! Get the maximum chunk size for GPU buddy allocator.
size_t GpuMaxChunkSize();
//! Get init chunk size for GPU buddy allocator.
size_t GpuFirstAllocateChunkSize();
//! Get reallocate chunk size for GPU buddy allocator.
size_t GpuReAllocateChunkSize();
//! Copy memory from address src to dst asynchronously.
void GpuMemcpyAsync(void *dst, const void *src, size_t count,
enum cudaMemcpyKind kind, cudaStream_t stream);
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/platform/temporary_allocator.h"
#include <memory>
#include "paddle/fluid/memory/allocation/allocator_facade.h"
DEFINE_int64(limit_of_tmp_allocation, -1,
......
......@@ -16,6 +16,7 @@
#include <condition_variable> // NOLINT
#include <deque>
#include <map>
#include <memory>
#include <mutex> // NOLINT
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/lock_guard_ptr.h"
......
......@@ -39,6 +39,7 @@ limitations under the License. */
#include "paddle/fluid/imperative/profiler.h"
#include "paddle/fluid/memory/allocation/allocator_strategy.h"
#include "paddle/fluid/memory/allocation/legacy_allocator.h"
#include "paddle/fluid/memory/allocation/multi_bin_buffered_allocator.h"
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/py_func_op.h"
#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
......@@ -133,6 +134,9 @@ PYBIND11_MODULE(core, m) {
paddle::platform::CpuTotalPhysicalMemory();
paddle::memory::allocation::UseAllocatorStrategyGFlag();
paddle::memory::allocation::UseMultiBinBufferedAllocatorGFlags();
m.doc() = "C++ core of PaddlePaddle";
// using framework in this function. Since it is inside a function, it will
......
......@@ -105,14 +105,12 @@ void Printf(const char* fmt, const Args&... args) {
Fprintf(std::cout, fmt, args...);
}
template <typename T>
std::string HumanReadableSize(T size) {
inline std::string HumanReadableSize(double f_size) {
size_t i = 0;
double f_size = static_cast<double>(size);
double orig = f_size;
const std::vector<std::string> units(
{"B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"});
while (f_size > 1024) {
while (f_size >= 1024) {
f_size /= 1024;
i++;
}
......
......@@ -130,7 +130,8 @@ def __bootstrap__():
'paddle_num_threads', "dist_threadpool_size", 'eager_delete_tensor_gb',
'fast_eager_deletion_mode', 'memory_fraction_of_eager_deletion',
'allocator_strategy', 'enable_buffered_allocator',
'buffered_allocator_excess_times', 'reader_queue_speed_test_mode',
'buffered_allocator_excess_times',
'buffered_allocator_division_plan_path', 'reader_queue_speed_test_mode',
'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir',
'inner_op_parallelism', 'enable_parallel_graph',
'multiple_of_cupti_buffer_size', 'enable_subgraph_optimize',
......@@ -163,7 +164,6 @@ def __bootstrap__():
if core.is_compiled_with_cuda():
read_env_flags += [
'initial_gpu_memory_in_mb', 'reallocate_gpu_memory_in_mb',
'fraction_of_gpu_memory_to_use', 'cudnn_deterministic',
'enable_cublas_tensor_op_math', 'conv_workspace_size_limit',
'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus',
......