Unverified · Commit 0a73f720 authored by Zeng Jinle, committed by GitHub

Add retry_allocator for gpu (#19409)

* add retry_allocator for gpu, test=develop

* follow chengduoZH's comments, test=develop

* follow huihuang's comments, test=develop

* change f,l in enforce.h to be file,line, test=develop

* increase code coverage by adding unittests, test=develop

* fix CMakeLists.txt, test=develop
Parent c756b5d2
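
Note: the core idea of the change described in the bullets above is a decorator allocator that catches `BadAlloc` from an underlying allocator and keeps retrying until a deadline, woken up whenever memory is freed. The following is a minimal, self-contained sketch of that pattern only; the class and member names (`SimpleRetryingAllocator`, `NotifyFree`) are illustrative and are not the types introduced by this PR.

```cpp
#include <chrono>
#include <condition_variable>
#include <cstddef>
#include <functional>
#include <mutex>
#include <new>

// Illustrative stand-in for Paddle's allocator interface.
// alloc_fn throws std::bad_alloc when memory is exhausted.
class SimpleRetryingAllocator {
 public:
  using AllocFn = std::function<void*(std::size_t)>;

  SimpleRetryingAllocator(AllocFn alloc_fn, std::chrono::milliseconds retry_time)
      : alloc_fn_(std::move(alloc_fn)), retry_time_(retry_time) {}

  void* Allocate(std::size_t size) {
    try {
      return alloc_fn_(size);  // fast path: no lock taken on success
    } catch (const std::bad_alloc&) {
      auto deadline = std::chrono::high_resolution_clock::now() + retry_time_;
      std::unique_lock<std::mutex> lock(mutex_);
      // Every free notifies the condition variable; retry until the deadline
      // passes. (The real RetryAllocator holds the lock only while waiting,
      // not while allocating.)
      while (cv_.wait_until(lock, deadline) != std::cv_status::timeout) {
        try {
          return alloc_fn_(size);
        } catch (const std::bad_alloc&) {
          // Still out of memory; keep waiting for more frees.
        }
      }
      return alloc_fn_(size);  // last attempt after timeout; may rethrow
    }
  }

  // The owner calls this after returning memory to the underlying allocator.
  void NotifyFree() { cv_.notify_all(); }

 private:
  AllocFn alloc_fn_;
  std::chrono::milliseconds retry_time_;
  std::mutex mutex_;
  std::condition_variable cv_;
};
```

Paddle's `RetryAllocator` in this diff follows the same shape, but uses its own `Allocator`/`Allocation` types and throws `BadAlloc` instead of `std::bad_alloc`.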
......@@ -46,7 +46,12 @@ cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy)
cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator locked_allocator cpu_allocator)
if (WITH_TESTING)
if (WITH_GPU)
target_link_libraries(retry_allocator_test cuda_allocator)
endif()
set_tests_properties(retry_allocator_test PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
set_property(TEST retry_allocator_test PROPERTY ENVIRONMENT GLOG_vmodule=retry_allocator=10)
endif()
cc_test(allocator_facade_abs_flags_test SRCS allocator_facade_abs_flags_test.cc DEPS allocator_facade)
......
......@@ -19,6 +19,7 @@
#include <utility>
#include <vector>
#include "paddle/fluid/framework/inlined_vector.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
......@@ -26,14 +27,14 @@ namespace memory {
namespace allocation {
// Exception when `Alloc`/`AllocShared` failed
class BadAlloc : public std::exception {
public:
inline explicit BadAlloc(std::string msg) : msg_(std::move(msg)) {}
struct BadAlloc : public std::exception {
inline explicit BadAlloc(std::string err_msg, const char* file, int line)
: err_str_(platform::GetTraceBackString(std::move(err_msg), file, line)) {
}
inline const char* what() const noexcept override { return msg_.c_str(); }
const char* what() const noexcept override { return err_str_.c_str(); }
private:
std::string msg_;
std::string err_str_;
};
class Allocator;
......
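
For context, the refactored `BadAlloc` above now records the throw site by building its message with `GetTraceBackString(msg, file, line)`. Below is a self-contained sketch of the same idea with illustrative names (`SimpleBadAlloc`, `THROW_SIMPLE_BAD_ALLOC`), not the real Paddle types:

```cpp
#include <exception>
#include <sstream>
#include <string>

// Exception type that bakes the throw site (file:line) into what(),
// mirroring how BadAlloc now delegates to GetTraceBackString.
struct SimpleBadAlloc : public std::exception {
  SimpleBadAlloc(const std::string& msg, const char* file, int line) {
    std::ostringstream os;
    os << msg << " at [" << file << ":" << line << "]";
    err_str_ = os.str();
  }
  const char* what() const noexcept override { return err_str_.c_str(); }

 private:
  std::string err_str_;
};

// A throw-site macro (like PADDLE_THROW_BAD_ALLOC added later in this diff)
// captures __FILE__ and __LINE__ where the allocation actually failed.
#define THROW_SIMPLE_BAD_ALLOC(msg) \
  throw SimpleBadAlloc((msg), __FILE__, __LINE__)
```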
......@@ -37,7 +37,7 @@
#endif
DEFINE_int64(
gpu_allocator_retry_time, 0,
gpu_allocator_retry_time, 100,
"The retry time (milliseconds) when allocator fails "
"to allocate memory. No retry if this value is not greater than 0");
......@@ -80,6 +80,12 @@ class AllocatorFacadePrivate {
}
}
InitZeroSizeAllocators();
if (FLAGS_gpu_allocator_retry_time > 0) {
WrapCUDARetryAllocator(FLAGS_gpu_allocator_retry_time);
}
CheckAllocThreadSafe();
}
inline const std::shared_ptr<Allocator>& GetAllocator(
......@@ -118,6 +124,8 @@ class AllocatorFacadePrivate {
public:
explicit ZeroSizeAllocator(platform::Place place) : place_(place) {}
bool IsAllocThreadSafe() const override { return true; }
protected:
Allocation* AllocateImpl(size_t size) override {
return new Allocation(nullptr, 0, place_);
......@@ -145,6 +153,25 @@ class AllocatorFacadePrivate {
}
}
void CheckAllocThreadSafe() const {
for (auto& pair : allocators_) {
PADDLE_ENFORCE_EQ(pair.second->IsAllocThreadSafe(), true);
}
for (auto& pair : zero_size_allocators_) {
PADDLE_ENFORCE_EQ(pair.second->IsAllocThreadSafe(), true);
}
}
void WrapCUDARetryAllocator(size_t retry_time) {
PADDLE_ENFORCE_GT(retry_time, 0, "Retry time must be larger than 0");
for (auto& pair : allocators_) {
if (platform::is_gpu_place(pair.first)) {
pair.second = std::make_shared<RetryAllocator>(pair.second, retry_time);
}
}
}
private:
std::map<platform::Place, std::shared_ptr<Allocator>> allocators_;
std::map<platform::Place, std::shared_ptr<Allocator>> zero_size_allocators_;
......
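
Since `FLAGS_gpu_allocator_retry_time` now defaults to 100 ms and controls whether GPU allocators get wrapped by `WrapCUDARetryAllocator`, callers can tune it through gflags. A hedged sketch (the `DECLARE_int64` and the `main()` driver are illustrative; the flag itself is `DEFINE`d in allocator_facade.cc as shown above):

```cpp
#include "gflags/gflags.h"

// Illustrative declaration; the flag is DEFINE_int64'd in allocator_facade.cc.
DECLARE_int64(gpu_allocator_retry_time);

int main(int argc, char** argv) {
  // Pick it up from the command line (e.g. --gpu_allocator_retry_time=500) ...
  gflags::ParseCommandLineFlags(&argc, &argv, true);
  // ... or set it programmatically before the allocator facade is first used.
  // A value <= 0 disables the RetryAllocator wrapping entirely.
  FLAGS_gpu_allocator_retry_time = 500;
  return 0;
}
```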
......@@ -150,8 +150,8 @@ Allocation* BestFitAllocator::AllocateImpl(size_t size) {
}
}
if (UNLIKELY(highest_set_bit == free_chunks_.size())) {
throw BadAlloc(string::Sprintf(
"Cannot allocate %d, All fragments size is %d", size, FreeSize()));
PADDLE_THROW_BAD_ALLOC("Cannot allocate %d, All fragments size is %d", size,
FreeSize());
}
auto chunk_it = SplitChunk(size, highest_set_bit, map_it);
return new BestFitAllocation(this, chunk_it);
......
......@@ -17,6 +17,7 @@
#include <cuda_runtime.h>
#include <string>
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gpu_info.h"
namespace paddle {
......@@ -36,9 +37,9 @@ Allocation* CUDAAllocator::AllocateImpl(size_t size) {
void* ptr;
auto status = cudaMalloc(&ptr, size);
if (UNLIKELY(status != cudaSuccess)) {
throw BadAlloc(string::Sprintf(
"Cannot allocate %d on GPU %d, cuda status %d, %s", size, place_.device,
status, cudaGetErrorString(status)));
PADDLE_THROW_BAD_ALLOC("Cannot allocate %d on GPU %d, cuda status %d, %s",
size, place_.device, status,
cudaGetErrorString(status));
}
return new Allocation(ptr, size, platform::Place(place_));
}
......
......@@ -28,6 +28,8 @@ class NaiveBestFitAllocator : public Allocator {
public:
explicit NaiveBestFitAllocator(const platform::Place &p) : place_(p) {}
bool IsAllocThreadSafe() const override { return true; }
protected:
Allocation *AllocateImpl(size_t size) override;
void FreeImpl(Allocation *allocation) override;
......
......@@ -13,14 +13,40 @@
// limitations under the License.
#include "paddle/fluid/memory/allocation/retry_allocator.h"
namespace paddle {
namespace memory {
namespace allocation {
class WaitedAllocateSizeGuard {
public:
WaitedAllocateSizeGuard(std::atomic<size_t>* waited_size,
size_t requested_size)
: waited_size_(waited_size), requested_size_(requested_size) {
waited_size_->fetch_add(requested_size_,
std::memory_order::memory_order_relaxed);
}
~WaitedAllocateSizeGuard() {
waited_size_->fetch_sub(requested_size_,
std::memory_order::memory_order_relaxed);
}
private:
std::atomic<size_t>* waited_size_;
size_t requested_size_;
};
void RetryAllocator::FreeImpl(Allocation* allocation) {
// Delete underlying allocation first.
size_t size = allocation->size();
underlying_allocator_->Free(allocation);
cv_.notify_all();
if (UNLIKELY(waited_allocate_size_)) {
VLOG(10) << "Free " << size << " bytes and notify all waited threads, "
"where waited_allocate_size_ = "
<< waited_allocate_size_;
cv_.notify_all();
}
}
Allocation* RetryAllocator::AllocateImpl(size_t size) {
......@@ -31,29 +57,38 @@ Allocation* RetryAllocator::AllocateImpl(size_t size) {
// But it would add lock even when allocation success at the first time
try {
return alloc_func();
} catch (BadAlloc& bad_alloc) {
} catch (BadAlloc&) {
{
WaitedAllocateSizeGuard guard(&waited_allocate_size_, size);
VLOG(10) << "Allocation failed when allocating " << size
<< " bytes, waited_allocate_size_ = " << waited_allocate_size_;
// We can just write allocation retry inside the predicate function of
// wait_until
// But it needs to acquire the lock when executing predicate function
// For better performance, we use loop here
// wait_until. But it needs to acquire the lock when executing predicate
// function. For better performance, we use loop here
auto end_time = std::chrono::high_resolution_clock::now() + retry_time_;
auto wait_until = [&, this] {
std::unique_lock<std::mutex> lock(mutex_);
return cv_.wait_until(lock, end_time);
};
size_t retry_time = 0;
while (wait_until() != std::cv_status::timeout) {
try {
return alloc_func();
} catch (BadAlloc& ex) {
bad_alloc = ex;
} catch (BadAlloc&) {
// do nothing when it is not timeout
++retry_time;
VLOG(10) << "Allocation failed when retrying " << retry_time
<< " times when allocating " << size
<< " bytes. Wait still.";
} catch (...) {
throw;
}
}
throw; // rethrow the original exception or throw the internal bad_alloc
}
VLOG(10) << "Allocation failed because of timeout when allocating " << size
<< " bytes.";
return alloc_func(); // If timeout, try last allocation request.
} catch (...) {
throw;
}
......
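
`WaitedAllocateSizeGuard` above is a small RAII counter: it adds the requested size to an atomic count of "bytes currently waiting" on construction and subtracts it on destruction, which lets `FreeImpl` skip `cv_.notify_all()` when no thread is waiting. A minimal standalone sketch of that pattern (names are illustrative, not from the PR):

```cpp
#include <atomic>
#include <cstddef>
#include <cstdio>

// RAII guard: tracks how many bytes are currently waiting to be allocated.
class ScopedWaitedBytes {
 public:
  ScopedWaitedBytes(std::atomic<std::size_t>* counter, std::size_t bytes)
      : counter_(counter), bytes_(bytes) {
    counter_->fetch_add(bytes_, std::memory_order_relaxed);
  }
  ~ScopedWaitedBytes() {
    counter_->fetch_sub(bytes_, std::memory_order_relaxed);
  }

 private:
  std::atomic<std::size_t>* counter_;
  std::size_t bytes_;
};

std::atomic<std::size_t> g_waited_bytes{0};

void OnFree() {
  // Mirror of FreeImpl above: only wake waiters if someone is waiting.
  if (g_waited_bytes.load(std::memory_order_relaxed) != 0) {
    std::puts("notify waiting threads");
  }
}
```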
......@@ -14,12 +14,14 @@
#pragma once
#include <atomic> // NOLINT
#include <chrono> // NOLINT
#include <condition_variable> // NOLINT
#include <memory>
#include <mutex> // NOLINT
#include <utility>
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace memory {
......@@ -48,9 +50,7 @@ class RetryAllocator : public Allocator {
std::mutex mutex_;
std::condition_variable cv_;
// For debug, We can add an atomic integer to record how many memory sizes are
// waited to allocate
// std::atomic<size_t> waited_allocate_size_{0};
std::atomic<size_t> waited_allocate_size_{0};
};
} // namespace allocation
......
......@@ -17,12 +17,16 @@
#include <chrono> // NOLINT
#include <condition_variable> // NOLINT
#include <mutex> // NOLINT
#include <thread> // NOLINT
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/cpu_allocator.h"
#include "paddle/fluid/memory/allocation/locked_allocator.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/memory/allocation/cuda_allocator.h"
#endif
namespace paddle {
namespace memory {
......@@ -93,6 +97,51 @@ TEST(RetryAllocator, RetryAllocator) {
}
}
class DummyAllocator : public Allocator {
public:
bool IsAllocThreadSafe() const override { return true; }
protected:
Allocation *AllocateImpl(size_t size) override {
PADDLE_THROW_BAD_ALLOC("Always BadAlloc");
}
void FreeImpl(Allocation *) override {}
};
TEST(RetryAllocator, RetryAllocatorLastAllocFailure) {
size_t retry_ms = 10;
{
RetryAllocator allocator(std::make_shared<DummyAllocator>(), retry_ms);
try {
auto allocation = allocator.Allocate(100);
ASSERT_TRUE(false);
allocation.reset();
} catch (BadAlloc &ex) {
ASSERT_TRUE(std::string(ex.what()).find("Always BadAlloc") !=
std::string::npos);
}
}
#ifdef PADDLE_WITH_CUDA
{
platform::CUDAPlace p(0);
RetryAllocator allocator(std::make_shared<CUDAAllocator>(p), retry_ms);
size_t allocate_size = -1UL; // Very large number
try {
auto allocation = allocator.Allocate(allocate_size);
ASSERT_TRUE(false);
allocation.reset();
} catch (BadAlloc &ex) {
ASSERT_TRUE(std::string(ex.what()).find(
"Cannot allocate " + std::to_string(allocate_size) +
" on GPU " + std::to_string(p.device)) !=
std::string::npos);
}
}
#endif
}
} // namespace allocation
} // namespace memory
} // namespace paddle
......@@ -164,15 +164,6 @@ void BuddyAllocator::Free(void* p) {
<< block->total_size(cache_) << ")";
pool_.insert(
IndexSizeAddress(block->index(cache_), block->total_size(cache_), block));
if (FLAGS_free_idle_memory) {
// Clean up if existing too much free memory
// Prefer freeing fallback allocation first
CleanIdleFallBackAlloc();
// Free normal allocation
CleanIdleNormalAlloc();
}
}
size_t BuddyAllocator::Used() { return total_used_; }
......@@ -225,12 +216,6 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool(
static_cast<MemoryBlock*>(p)->init(&cache_, MemoryBlock::FREE_CHUNK, index,
allocate_bytes, nullptr, nullptr);
// gpu fallback allocation
if (system_allocator_->UseGpu() &&
static_cast<MemoryBlock*>(p)->index(cache_) == 1) {
fallback_alloc_count_++;
}
total_free_ += allocate_bytes;
// dump the block into pool
......@@ -288,70 +273,6 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it,
return block;
}
void BuddyAllocator::CleanIdleFallBackAlloc() {
// If fallback allocation does not exist, return directly
if (!fallback_alloc_count_) return;
for (auto pool = pool_.rbegin(); pool != pool_.rend();) {
// If free memory block less than max_chunk_size_, return directly
if (std::get<1>(*pool) < max_chunk_size_) return;
MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));
// If no GPU fallback allocator, return
if (!system_allocator_->UseGpu() || block->index(cache_) == 0) {
return;
}
VLOG(10) << "Return block " << block << " to fallback allocator.";
system_allocator_->Free(block, block->size(cache_), block->index(cache_));
cache_.invalidate(block);
pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
total_free_ -= block->size(cache_);
fallback_alloc_count_--;
// If no fallback allocation exists, return directly
if (!fallback_alloc_count_) return;
}
}
void BuddyAllocator::CleanIdleNormalAlloc() {
auto shall_free_alloc = [&]() -> bool {
// free all fallback allocations
if (fallback_alloc_count_ > 0) {
return true;
}
// keep 2x overhead if we haven't fallen back
if ((total_used_ + max_chunk_size_) * 2 < total_free_) {
return true;
}
return false;
};
if (!shall_free_alloc()) return;
for (auto pool = pool_.rbegin(); pool != pool_.rend();) {
// If free memory block less than max_chunk_size_, return directly
if (std::get<1>(*pool) < max_chunk_size_) return;
MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));
VLOG(10) << "Return block " << block << " to base allocator.";
system_allocator_->Free(block, block->size(cache_), block->index(cache_));
cache_.invalidate(block);
pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
total_free_ -= block->size(cache_);
if (!shall_free_alloc()) return;
}
}
} // namespace detail
} // namespace memory
} // namespace paddle
......@@ -75,12 +75,6 @@ class BuddyAllocator {
/*! \brief Find the existing chunk which used to allocation */
PoolSet::iterator FindExistChunk(size_t size);
/*! \brief Clean idle fallback allocation */
void CleanIdleFallBackAlloc();
/*! \brief Clean idle normal allocation */
void CleanIdleNormalAlloc();
private:
size_t total_used_ = 0; // the total size of used memory
size_t total_free_ = 0; // the total size of free memory
......@@ -98,9 +92,6 @@ class BuddyAllocator {
*/
PoolSet pool_;
/*! Record fallback allocation count for auto-scaling */
size_t fallback_alloc_count_ = 0;
private:
/*! Unify the metadata format between GPU and CPU allocations */
MetadataCache cache_;
......
......@@ -25,6 +25,7 @@ limitations under the License. */
#include <algorithm> // for std::max
#include "gflags/gflags.h"
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gpu_info.h"
......@@ -117,19 +118,19 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) {
gpu_alloc_size_ += size;
return p;
} else {
LOG(WARNING) << "Cannot malloc " << size / 1024.0 / 1024.0
<< " MB GPU memory. Please shrink "
"FLAGS_fraction_of_gpu_memory_to_use or "
"FLAGS_initial_gpu_memory_in_mb or "
"FLAGS_reallocate_gpu_memory_in_mb"
"environment variable to a lower value. "
<< "Current FLAGS_fraction_of_gpu_memory_to_use value is "
<< FLAGS_fraction_of_gpu_memory_to_use
<< ". Current FLAGS_initial_gpu_memory_in_mb value is "
<< FLAGS_initial_gpu_memory_in_mb
<< ". Current FLAGS_reallocate_gpu_memory_in_mb value is "
<< FLAGS_reallocate_gpu_memory_in_mb;
return nullptr;
PADDLE_THROW_BAD_ALLOC(
"Cannot malloc " + std::to_string(size / 1024.0 / 1024.0) +
" MB GPU memory. Please shrink "
"FLAGS_fraction_of_gpu_memory_to_use or "
"FLAGS_initial_gpu_memory_in_mb or "
"FLAGS_reallocate_gpu_memory_in_mb"
"environment variable to a lower value. " +
"Current FLAGS_fraction_of_gpu_memory_to_use value is " +
std::to_string(FLAGS_fraction_of_gpu_memory_to_use) +
". Current FLAGS_initial_gpu_memory_in_mb value is " +
std::to_string(FLAGS_initial_gpu_memory_in_mb) +
". Current FLAGS_reallocate_gpu_memory_in_mb value is " +
std::to_string(FLAGS_reallocate_gpu_memory_in_mb));
}
}
......
......@@ -66,25 +66,61 @@ inline std::string demangle(std::string name) {
inline std::string demangle(std::string name) { return name; }
#endif
template <typename StrType>
inline std::string GetTraceBackString(StrType&& what, const char* file,
int line) {
static constexpr int TRACE_STACK_LIMIT = 100;
std::ostringstream sout;
sout << string::Sprintf("%s at [%s:%d]", std::forward<StrType>(what), file,
line)
<< std::endl;
sout << "PaddlePaddle Call Stacks: " << std::endl;
#if !defined(_WIN32)
void* call_stack[TRACE_STACK_LIMIT];
auto size = backtrace(call_stack, TRACE_STACK_LIMIT);
auto symbols = backtrace_symbols(call_stack, size);
Dl_info info;
for (int i = 0; i < size; ++i) {
if (dladdr(call_stack[i], &info) && info.dli_sname) {
auto demangled = demangle(info.dli_sname);
auto addr_offset = static_cast<char*>(call_stack[i]) -
static_cast<char*>(info.dli_saddr);
sout << string::Sprintf("%-3d %*0p %s + %zd\n", i, 2 + sizeof(void*) * 2,
call_stack[i], demangled, addr_offset);
} else {
sout << string::Sprintf("%-3d %*0p\n", i, 2 + sizeof(void*) * 2,
call_stack[i]);
}
}
free(symbols);
#else
sout << "Windows not support stack backtrace yet.";
#endif
return sout.str();
}
struct EnforceNotMet : public std::exception {
std::string err_str_;
EnforceNotMet(std::exception_ptr e, const char* f, int l) {
EnforceNotMet(std::exception_ptr e, const char* file, int line) {
try {
std::rethrow_exception(e);
} catch (std::exception& e) {
Init(e.what(), f, l);
err_str_ = GetTraceBackString(e.what(), file, line);
SaveErrorInformation(err_str_);
}
}
EnforceNotMet(const std::string& str, const char* f, int l) {
Init(str, f, l);
EnforceNotMet(const std::string& str, const char* file, int line)
: err_str_(GetTraceBackString(str, file, line)) {
SaveErrorInformation(err_str_);
}
const char* what() const noexcept override { return err_str_.c_str(); }
private:
const std::string output_file_name{"paddle_err_info"};
void saveErrorInformation(const std::string& err) {
static void SaveErrorInformation(const std::string& err) {
const std::string output_file_name{"paddle_err_info"};
std::stringstream ss;
ss << output_file_name;
std::time_t t = std::time(nullptr);
......@@ -98,49 +134,15 @@ struct EnforceNotMet : public std::exception {
err_file.close();
}
}
template <typename StrType>
inline void Init(StrType what, const char* f, int l) {
static constexpr int TRACE_STACK_LIMIT = 100;
std::ostringstream sout;
sout << string::Sprintf("%s at [%s:%d]", what, f, l) << std::endl;
sout << "PaddlePaddle Call Stacks: " << std::endl;
#if !defined(_WIN32)
void* call_stack[TRACE_STACK_LIMIT];
auto size = backtrace(call_stack, TRACE_STACK_LIMIT);
auto symbols = backtrace_symbols(call_stack, size);
Dl_info info;
for (int i = 0; i < size; ++i) {
if (dladdr(call_stack[i], &info) && info.dli_sname) {
auto demangled = demangle(info.dli_sname);
auto addr_offset = static_cast<char*>(call_stack[i]) -
static_cast<char*>(info.dli_saddr);
sout << string::Sprintf("%-3d %*0p %s + %zd\n", i,
2 + sizeof(void*) * 2, call_stack[i], demangled,
addr_offset);
} else {
sout << string::Sprintf("%-3d %*0p\n", i, 2 + sizeof(void*) * 2,
call_stack[i]);
}
}
free(symbols);
#else
sout << "Windows not support stack backtrace yet.";
#endif
err_str_ = sout.str();
saveErrorInformation(err_str_);
}
};
struct EOFException : public std::exception {
std::string err_str_;
EOFException(const char* err_msg, const char* f, int l) {
err_str_ = string::Sprintf("%s at [%s:%d]", err_msg, f, l);
EOFException(const char* err_msg, const char* file, int line) {
err_str_ = string::Sprintf("%s at [%s:%d]", err_msg, file, line);
}
const char* what() const noexcept { return err_str_.c_str(); }
const char* what() const noexcept override { return err_str_.c_str(); }
};
// Because most enforce conditions would evaluate to true, we can use
......@@ -329,6 +331,12 @@ DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess);
__LINE__); \
} while (0)
#define PADDLE_THROW_BAD_ALLOC(...) \
do { \
throw ::paddle::memory::allocation::BadAlloc( \
::paddle::string::Sprintf(__VA_ARGS__), __FILE__, __LINE__); \
} while (0)
/*
* Some enforce helpers here, usage:
* int a = 1;
......
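
A hedged usage sketch of the new `PADDLE_THROW_BAD_ALLOC` macro (assuming the Paddle headers from this diff are available at these include paths): the thrown `BadAlloc`'s `what()` carries the formatted message, the throw site, and a captured call stack on non-Windows platforms.

```cpp
#include <cstddef>
#include <cstdio>
#include <cstdlib>

#include "paddle/fluid/memory/allocation/allocator.h"  // BadAlloc
#include "paddle/fluid/platform/enforce.h"             // PADDLE_THROW_BAD_ALLOC

void* TryHostAlloc(std::size_t size) {
  void* p = std::malloc(size);
  if (p == nullptr) {
    // The macro formats the message with string::Sprintf and captures
    // __FILE__/__LINE__ at this throw site.
    PADDLE_THROW_BAD_ALLOC("Cannot allocate %d bytes on host", size);
  }
  return p;
}

int main() {
  try {
    TryHostAlloc(~static_cast<std::size_t>(0));  // absurdly large request
  } catch (paddle::memory::allocation::BadAlloc& e) {
    // what() contains the message, the throw site "[file:line]", and a
    // "PaddlePaddle Call Stacks:" section on non-Windows builds.
    std::fprintf(stderr, "%s\n", e.what());
  }
  return 0;
}
```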