From 6d353aa524770279a9b216e011d6623b7be0ea35 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Mon, 11 Oct 2021 20:59:49 +0800 Subject: [PATCH] refine auto_growth allocator (#35732) * do not use alignedAllocator when cuda has alignment * update test * fix error during multiple process --- .../memory/allocation/aligned_allocator.cc | 1 + .../memory/allocation/allocator_facade.cc | 36 ++++++++++++++++++- .../auto_growth_best_fit_allocator.cc | 15 ++++---- .../auto_growth_best_fit_allocator_test.cc | 14 +++++--- 4 files changed, 55 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/memory/allocation/aligned_allocator.cc b/paddle/fluid/memory/allocation/aligned_allocator.cc index 1d89918bfe..f0b7f1a4b0 100644 --- a/paddle/fluid/memory/allocation/aligned_allocator.cc +++ b/paddle/fluid/memory/allocation/aligned_allocator.cc @@ -20,6 +20,7 @@ namespace paddle { namespace memory { namespace allocation { +// For memory address alignment class AlignedAllocation : public Allocation { public: AlignedAllocation(AllocationPtr underlying_allocation, size_t offset) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 0388e2d13a..281902f3a2 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -23,6 +23,7 @@ #ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" #endif +#include "paddle/fluid/memory/allocation/aligned_allocator.h" #include "paddle/fluid/memory/allocation/retry_allocator.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" @@ -201,6 +202,8 @@ class AllocatorFacadePrivate { inline const std::shared_ptr& GetAllocator( const platform::Place& place, size_t size) { + VLOG(4) << "GetAllocator" + << " " << place << " " << size; const auto& allocators = (size > 0 ? (UNLIKELY(FLAGS_use_system_allocator) ? 
system_allocators_ : GetAllocatorMap())
@@ -256,8 +259,39 @@ class AllocatorFacadePrivate {
   void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p,
                                    bool allow_free_idle_chunk) {
     auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
+    auto alignment = platform::GpuMinChunkSize();
+    bool need_addr_align = true;
+    // NOTE: sometimes, since cuda runtime can not be forked, calling any cuda
+    // API in that case may get cuda error(3), i.e.,
+    // cudaErrorInitializationError. And, the CUDAAllocator is only initialized
+    // but not really used.
+    // Here, the try-catch block is added to handle the case that
+    // GetDeviceProperties() may fail in multiple processes (for example, in
+    // dataloader with num_worker > 0)
+    try {
+      const auto& prop = platform::GetDeviceProperties(p.GetDeviceId());
+      need_addr_align = prop.textureAlignment < alignment;
+      VLOG(4) << "GetDeviceProperties ok, textureAlignment: "
+              << prop.textureAlignment
+              << ", set need_addr_align=" << need_addr_align;
+    } catch (...) {
+      need_addr_align = true;
+      VLOG(4) << "GetDeviceProperties failed, set need_addr_align=true";
+    }
+    // Wrap cuda_allocator in AlignedAllocator only when needed; otherwise the
+    // address returned is aligned already, ref:
+    // https://stackoverflow.com/questions/14082964/cuda-alignment-256bytes-seriously/14083295#14083295
+    std::shared_ptr<Allocator> underlying_allocator{nullptr};
+    if (need_addr_align) {
+      VLOG(10) << "use AlignedAllocator with alignment: " << alignment;
+      underlying_allocator =
+          std::make_shared<AlignedAllocator>(cuda_allocator, alignment);
+    } else {
+      VLOG(10) << "not use AlignedAllocator with alignment: " << alignment;
+      underlying_allocator = cuda_allocator;
+    }
     allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
-        cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk);
+        underlying_allocator, alignment, 0, allow_free_idle_chunk);
   }
 #endif
 
diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc
index f36d589f90..9f34f5198a 100644
---
a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc @@ -40,14 +40,14 @@ namespace allocation { AutoGrowthBestFitAllocator::AutoGrowthBestFitAllocator( const std::shared_ptr &underlying_allocator, size_t alignment, size_t chunk_size, bool allow_free_idle_chunk) - : underlying_allocator_( - std::make_shared(underlying_allocator, alignment)), + : underlying_allocator_(underlying_allocator), alignment_(alignment), chunk_size_(std::max(AlignedSize(chunk_size, alignment), alignment)), allow_free_idle_chunk_(allow_free_idle_chunk) {} -Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t size) { - size = AlignedSize(size, alignment_); +Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t unaligned_size) { + size_t size = AlignedSize(unaligned_size, alignment_); + VLOG(10) << "Allocate " << unaligned_size << " bytes, aligned to " << size; std::lock_guard guard(spinlock_); auto iter = free_blocks_.lower_bound(std::make_pair(size, nullptr)); @@ -57,6 +57,8 @@ Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t size) { free_blocks_.erase(iter); auto *chunk = block_it->chunk_; size_t remaining_size = block_it->size_ - size; + VLOG(10) << "Allocate " << size << " bytes from chunk size " + << block_it->size_ << ", remaining " << remaining_size; if (remaining_size == 0) { block_it->is_free_ = false; } else { @@ -95,13 +97,14 @@ Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t size) { } blocks.emplace_back(p + remaining_size, size, false, chunk); block_it = --(blocks.end()); - VLOG(2) << "Not found and reallocate " << realloc_size << ", and remaining " - << remaining_size; + VLOG(2) << "Not found and reallocate " << realloc_size << "(" + << static_cast(p) << "), and remaining " << remaining_size; } return new BlockAllocation(block_it); } void AutoGrowthBestFitAllocator::FreeImpl(Allocation *allocation) { + VLOG(10) << "Free " << allocation->size() << " bytes"; 
std::lock_guard guard(spinlock_); auto block_it = static_cast(allocation)->block_it_; auto &blocks = block_it->chunk_->blocks_; diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc index 6f2591c8b1..926af8292d 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc @@ -12,10 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h" - #include +#include "paddle/fluid/memory/allocation/aligned_allocator.h" +#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h" + #include "gtest/gtest.h" DECLARE_bool(free_idle_chunk); @@ -50,10 +51,13 @@ static void TestFreeIdleChunk(bool free_idle_chunk, FLAGS_free_idle_chunk = free_idle_chunk; FLAGS_free_when_no_cache_hit = free_when_no_cache_hit; auto recorded_allocator = std::make_shared(); + size_t alignment = 4096; size_t memory_size = 8192; + auto underlying_allocator = + std::make_shared(recorded_allocator, alignment); auto ag_allocator = std::make_shared( - recorded_allocator, alignment); + underlying_allocator, alignment); for (size_t i = 0; i < 10; ++i) { auto allocation = ag_allocator->Allocate(memory_size); @@ -131,8 +135,10 @@ static void TestFreeWhenNoCacheHit(bool free_when_no_cache_hit) { auto underlying_allocator = std::make_shared(memory_capacity); + auto aligned_allocator = + std::make_shared(underlying_allocator, alignment); auto ag_allocator = std::make_shared( - underlying_allocator, alignment); + aligned_allocator, alignment); ag_allocator->Allocate(allocate_size[0]); ASSERT_EQ(underlying_allocator->AllocatedSize(), -- GitLab