From 64d94596abfa6ff449f23a09f1c985b51c04eae7 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Mon, 15 Oct 2018 12:09:29 +0000 Subject: [PATCH] fix allocator_facade bug --- .../memory/allocation/allocator_facade.cc | 24 ++++++-- .../allocation/auto_increment_allocator.h | 60 ++++++++++++------- .../memory/allocation/best_fit_allocator.cc | 7 ++- 3 files changed, 62 insertions(+), 29 deletions(-) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 052e1646de6..4f07c1610dc 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -74,10 +74,24 @@ class CUDAManagedAllocator : public ManagedAllocator { explicit CUDAManagedAllocator(int dev_id) { platform::CUDADeviceGuard guard(dev_id); max_chunk_size_ = platform::GpuMaxChunkSize(); + raw_allocator_ = NaiveManagedAllocator::Create(std::unique_ptr( new CUDAAllocator(platform::CUDAPlace(dev_id)))); - default_allocator_ = std::make_shared( - [this] { return std::move(BestFitAllocatorCreator()); }); + + if (max_chunk_size_ == 0) { + default_allocator_ = raw_allocator_; + } else { + size_t available, total; + platform::GpuMemoryUsage(&available, &total); + size_t capacity = available / max_chunk_size_; + + if (capacity == 1) { + default_allocator_ = BestFitAllocatorCreator(); + } else { + default_allocator_ = std::make_shared( + [this] { return std::move(BestFitAllocatorCreator()); }, capacity); + } + } auto* cond_allocator = new ConditionalAllocator(); cond_allocator @@ -110,9 +124,11 @@ class CUDAManagedAllocator : public ManagedAllocator { chunks_.emplace_back(raw_allocator_->Allocate(max_chunk_size_)); auto* allocation = chunks_.back().get(); return std::make_shared>( - NaiveManagedAllocator::Create( - std::unique_ptr(new BestFitAllocator(allocation)))); + NaiveManagedAllocator::Create(std::unique_ptr( + new LockedAllocator(std::unique_ptr( + new BestFitAllocator(allocation)))))); } + bool 
IsAllocThreadSafe() const override { return true; } private: diff --git a/paddle/fluid/memory/allocation/auto_increment_allocator.h b/paddle/fluid/memory/allocation/auto_increment_allocator.h index 650f1d1cc6c..f026c413d4b 100644 --- a/paddle/fluid/memory/allocation/auto_increment_allocator.h +++ b/paddle/fluid/memory/allocation/auto_increment_allocator.h @@ -40,13 +40,18 @@ namespace allocation { // allocator. The allocation requests from many threads may be dispatched // to the same underlying allocator. So the underlying allocator must be // thread safe. +// +// NOTE(zjl): Add capacity parameters to constructor. A high-performance +// thread-safe std::vector with varying size is hard to implement. +// Fortunately, we can get the total GPU memory and each chunk size. +// Therefore, we can get the suitable capacity of AutoIncrementAllocator. class AutoIncrementAllocator : public ManagedAllocator { public: // Creator is the method to create ManagedAllocator using AllocatorCreator = std::function()>; - explicit AutoIncrementAllocator(AllocatorCreator&& creator) - : creator_(std::move(creator)), prev_success_allocator_{0} {} + explicit AutoIncrementAllocator(AllocatorCreator&& creator, size_t capacity) + : creator_(std::move(creator)), underlying_allocators_(capacity) {} std::unique_ptr Allocate(size_t size, Attr attr) override; std::shared_ptr AllocateShared(size_t size, Attr attr) override; bool IsAllocThreadSafe() const override; @@ -56,15 +61,13 @@ class AutoIncrementAllocator : public ManagedAllocator { template inline typename std::result_of::type InvokeOrCreateUnderlyingAllocator(Callback callback) { - std::shared_ptr> - underlying_allocators = underlying_allocators_; - size_t retry_count = underlying_allocators->size(); - size_t allocator_num = retry_count; auto cur = prev_success_allocator_.load(); + size_t retry_count = allocator_num_.load(); + size_t allocator_num = retry_count; while (retry_count-- > 0) { // until there retry count is zero try { - auto 
res = callback(*((*underlying_allocators)[cur])); - prev_success_allocator_.store(cur); + auto res = callback(*underlying_allocators_[cur]); + prev_success_allocator_ = cur; return std::move(res); } catch (BadAlloc&) { if (++cur >= allocator_num) { @@ -77,20 +80,34 @@ class AutoIncrementAllocator : public ManagedAllocator { } // No suitable allocator + // This happens when the first allocator is exhausted and + // there is more than one concurrent allocation request. + // In this situation, the first allocation request would succeed + // and the second allocation request would fail if we do not use + // the allocator newly created by the first allocation request. + for (size_t new_allocator_num = allocator_num_.load(); + allocator_num < new_allocator_num; ++allocator_num) { + try { + auto ret = callback(*underlying_allocators_[allocator_num]); + prev_success_allocator_ = allocator_num; + return std::move(ret); + } catch (BadAlloc&) { + } catch (...) { + std::rethrow_exception(std::current_exception()); + } + } + ManagedAllocator* new_allocator; { std::lock_guard guard(mtx_); - auto old_size = underlying_allocators_->size(); - decltype(underlying_allocators_) new_allocators( - new std::vector(old_size + 1)); - for (size_t i = 0; i < old_size; ++i) { - (*new_allocators)[i] = (*underlying_allocators_)[i]; - } - - (*new_allocators)[old_size] = creator_(); - new_allocator = (*new_allocators)[old_size].get(); - underlying_allocators_ = new_allocators; - prev_success_allocator_.store(old_size); + auto old_size = allocator_num_.load(); + PADDLE_ENFORCE_LT(old_size, underlying_allocators_.size(), + "Allocator number exceeds capacity %d", + underlying_allocators_.size()); + underlying_allocators_[old_size] = creator_(); + new_allocator = underlying_allocators_[old_size].get(); + prev_success_allocator_ = old_size; + allocator_num_.fetch_add(1); } PADDLE_ENFORCE( @@ -102,9 +119,8 @@ class AutoIncrementAllocator : public ManagedAllocator { AllocatorCreator creator_; - // Use 
std::shared_ptr to ensure thread-safety - std::shared_ptr> - underlying_allocators_; + std::vector underlying_allocators_; + std::atomic allocator_num_{0}; // Use std::atomic rather than std::mutex, since std::atomic is usually // lock-free diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.cc b/paddle/fluid/memory/allocation/best_fit_allocator.cc index aa338f46756..1d9e7177f95 100644 --- a/paddle/fluid/memory/allocation/best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/best_fit_allocator.cc @@ -26,10 +26,12 @@ static int HighestBitPos(size_t N) { if (UNLIKELY(N == 0)) { return 0; } else { - // NOTE: here we can use __builtin_clz in GCC. - // However, let's use std::log2 for better readability - // and trust std::log2's performance. +#ifdef __GNUC__ + // N is size_t: use the 64-bit builtin so bits above 31 are not truncated. + return sizeof(unsigned long long) * 8 - __builtin_clzll(N); +#else return static_cast(std::log2(N) + 1); +#endif } } -- GitLab