From d8bff9883edd7e08bb68c8d174b586605e9d407b Mon Sep 17 00:00:00 2001
From: From00
Date: Wed, 23 Mar 2022 12:45:07 +0800
Subject: [PATCH] Performance optimization for StreamSafeCudaAllocator (#40718)

* Performance optimization

* Optimize GetAllocator, RWLock and ProcessUnfreedAllocation

* Remove test file

* Fix CI error

* Fix CI errors

* Fix CI errors
---
 .../memory/allocation/allocator_facade.cc     | 158 ++++++++++++------
 .../memory/allocation/allocator_facade.h      |   9 +-
 .../allocation/stream_safe_cuda_allocator.cc  |  22 ++-
 .../allocation/stream_safe_cuda_allocator.h   |   3 +
 paddle/fluid/platform/device_context.cc       |  14 +-
 5 files changed, 136 insertions(+), 70 deletions(-)

diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index 4a44448dc84..abf72564753 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -34,6 +34,7 @@
 #include "paddle/fluid/memory/allocation/thread_local_allocator.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
 
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h"
@@ -210,13 +211,28 @@ class AllocatorFacadePrivate {
       InitNaiveBestFitCPUAllocator();
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       allow_free_idle_chunk_ = allow_free_idle_chunk;
-      if (!FLAGS_use_stream_safe_cuda_allocator) {
-        for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount();
-             ++dev_id) {
-          InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id),
-                                      allow_free_idle_chunk_);
-        }
+      for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) {
+        InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id),
+                                    allow_free_idle_chunk_);
+      }
+
+      // Note(Ruibiao): In the GPU multi-stream case, the 'allocators_' map
+      // (place -> Allocator) holds the StreamSafeCUDAAllocator related to the
+      // default stream (i.e., the stream obtained directly from the
+      // DeviceContext), while the 'cuda_allocators_' map (place ->
+      // map(stream -> Allocator)) holds the StreamSafeCUDAAllocators related
+      // to non-default streams (i.e., streams that users pass in). The
+      // default-stream Allocator is built when AllocatorFacadePrivate is
+      // constructed, while the non-default-stream Allocators are built
+      // lazily in the GetAllocator function with
+      // 'create_if_not_found = true'. We treat the default stream specially
+      // for performance reasons: since most Alloc calls in applications are
+      // for the default stream, handling it separately avoids much of the
+      // overhead of acquiring the default stream and taking the read-write
+      // lock.
+      if (FLAGS_use_stream_safe_cuda_allocator) {
+        WrapStreamSafeCUDAAllocatorForDefault();
       }
+
       InitNaiveBestFitCUDAPinnedAllocator();
 #endif
 
 #ifdef PADDLE_WITH_ASCEND_CL
@@ -301,7 +317,8 @@ class AllocatorFacadePrivate {
     CheckAllocThreadSafe();
 
 #ifdef PADDLE_WITH_CUDA
-    if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
+    if (FLAGS_use_stream_safe_cuda_allocator == false &&
+        UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
       WrapCUDAGraphAllocator();
     }
 #endif
@@ -341,7 +358,12 @@ class AllocatorFacadePrivate {
   const std::shared_ptr<Allocator>& GetAllocator(
       const platform::CUDAPlace& place, const gpuStream_t& stream,
       bool create_if_not_found = false) {
-    {  // shared_lock_guard
+    if (stream == GetDefaultStream(place)) {
+      VLOG(7) << "Get Allocator by passing in a default stream";
+      return GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
+    }
+
+    /* shared_lock_guard */ {
       std::shared_lock<std::shared_timed_mutex> lock_guard(
           cuda_allocator_mutex_);
       if (LIKELY(HasCUDAAllocator(place, stream))) {
@@ -355,7 +377,7 @@ class AllocatorFacadePrivate {
       }
     }
 
-    {  // unique_lock_guard
+    /* unique_lock_guard */ {
       std::unique_lock<std::shared_timed_mutex> lock_guard(
           cuda_allocator_mutex_);
       InitStreamSafeCUDAAllocator(place, stream);
@@ -363,9 +385,40 @@ class AllocatorFacadePrivate {
     }
   }
 
-  gpuStream_t GetDefaultStream(const platform::CUDAPlace& place) {
-    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-    return static_cast<platform::CUDADeviceContext*>(pool.Get(place))->stream();
+  const std::shared_ptr<StreamSafeCUDAAllocator>
+  GetDefaultStreamSafeCUDAAllocator(const platform::CUDAPlace& place) const {
+    const auto iter = default_stream_safe_cuda_allocators_.find(place);
+    PADDLE_ENFORCE_NE(
+        iter, default_stream_safe_cuda_allocators_.end(),
+        platform::errors::NotFound(
+            "No StreamSafeCUDAAllocator found for the place, %s", place));
+    return iter->second;
+  }
+
+  const gpuStream_t& GetDefaultStream(const platform::CUDAPlace& place) const {
+    const std::shared_ptr<StreamSafeCUDAAllocator>& allocator =
+        GetDefaultStreamSafeCUDAAllocator(place);
+    return allocator->GetDefaultStream();
+  }
+
+  void SetDefaultStream(const platform::CUDAPlace& place,
+                        const gpuStream_t& stream) {
+    const std::shared_ptr<StreamSafeCUDAAllocator>& allocator =
+        GetDefaultStreamSafeCUDAAllocator(place);
+    allocator->SetDefaultStream(stream);
+    VLOG(8) << "Set default stream to " << stream
+            << " for StreamSafeCUDAAllocator(" << allocator.get() << ") in "
+            << place;
+  }
+
+  void SetDefaultStreamFromDeviceContext() {
+    VLOG(8) << "Set default stream from DeviceContext";
+    for (auto& pair : default_stream_safe_cuda_allocators_) {
+      platform::DeviceContextPool& pool =
+          platform::DeviceContextPool::Instance();
+      pair.second->SetDefaultStream(
+          static_cast<phi::GPUContext*>(pool.Get(pair.first))->stream());
+    }
   }
 
   void RecordStream(std::shared_ptr<phi::Allocation> allocation,
@@ -635,6 +688,26 @@ class AllocatorFacadePrivate {
         /* in_cuda_graph_capturing = */ !allow_free_idle_chunk_);
   }
 
+  void WrapStreamSafeCUDAAllocatorForDefault() {
+    for (auto& pair : allocators_) {
+      auto& place = pair.first;
+      if (platform::is_gpu_place(place)) {
+        std::shared_ptr<StreamSafeCUDAAllocator>&& allocator =
+            std::make_shared<StreamSafeCUDAAllocator>(
+                pair.second, place, /* default_stream = */ nullptr,
+                /* in_cuda_graph_capturing = */ !allow_free_idle_chunk_);
+        pair.second = allocator;
+
+        // NOTE(Ruibiao): A tricky implementation that gives
+        // StreamSafeCUDAAllocator the ability to interact with the outside
+        // world, i.e., to have its default stream changed from outside
+        default_stream_safe_cuda_allocators_[place] = allocator;
+        VLOG(8) << "WrapStreamSafeCUDAAllocator for " << place
+                << ", allocator address = " << pair.second.get();
+      }
+    }
+  }
+
   void WrapCUDARetryAllocator(platform::CUDAPlace p, gpuStream_t stream,
                               size_t retry_time) {
     PADDLE_ENFORCE_GT(
@@ -813,7 +886,6 @@ class AllocatorFacadePrivate {
 #endif
   }
 
-  // NOTE(Ruibiao): Old single-stream version, will be removed later
   void WrapCUDARetryAllocator(size_t retry_time) {
     PADDLE_ENFORCE_GT(
         retry_time, 0,
@@ -828,6 +900,8 @@ class AllocatorFacadePrivate {
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   // a standalone CUDA allocator to support multi-stream GC in new executor
+  std::map<platform::Place, std::shared_ptr<StreamSafeCUDAAllocator>>
+      default_stream_safe_cuda_allocators_;
   CUDAAllocatorMap cuda_allocators_;
   std::shared_timed_mutex cuda_allocator_mutex_;
 #endif
@@ -870,15 +944,6 @@ AllocatorFacadePrivate* AllocatorFacade::GetPrivate() const {
 
 const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
     const platform::Place& place) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
-      FLAGS_use_system_allocator == false) {
-    AllocatorFacadePrivate* m = GetPrivate();
-    platform::CUDAPlace cuda_place(place.GetDeviceId());
-    return m->GetAllocator(cuda_place, m->GetDefaultStream(cuda_place));
-  }
-#endif
-
   return GetPrivate()->GetAllocator(
       place, /* A non-zero num to choose allocator_ */ 1);
 }
@@ -898,19 +963,6 @@ void* AllocatorFacade::GetBasePtr(
   return GetPrivate()->GetBasePtr(allocation);
 }
 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
-    const platform::Place& place, const gpuStream_t& stream) {
-  if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
-      FLAGS_use_system_allocator == false) {
-    return GetPrivate()->GetAllocator(place, stream,
-                                      /*create_if_not_found=*/true);
-  }
-  return GetPrivate()->GetAllocator(
-      place, /* A non-zero num to choose allocator_ */ 1);
-}
-#endif
-
 const std::shared_ptr<Allocator>& AllocatorFacade::GetZeroAllocator(
     const platform::Place& place) {
   return GetPrivate()->GetAllocator(place, /* zero size */ 0);
 }
@@ -923,26 +975,10 @@ std::shared_ptr<phi::Allocation> AllocatorFacade::AllocShared(
 
 AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
                                      size_t size) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
-      size > 0 && FLAGS_use_system_allocator == false) {
-    platform::CUDAPlace cuda_place(place.GetDeviceId());
-    phi::Stream default_stream = phi::Stream(reinterpret_cast<phi::StreamId>(
-        GetPrivate()->GetDefaultStream(cuda_place)));
-    return Alloc(cuda_place, size, default_stream);
-  }
-#endif
   return GetPrivate()->GetAllocator(place, size)->Allocate(size);
 }
 
 uint64_t AllocatorFacade::Release(const platform::Place& place) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
-      FLAGS_use_system_allocator == false) {
-    platform::CUDAPlace cuda_place(place.GetDeviceId());
-    return Release(cuda_place, GetPrivate()->GetDefaultStream(cuda_place));
-  }
-#endif
   return GetPrivate()
       ->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1)
       ->Release(place);
@@ -1028,6 +1064,17 @@ void AllocatorFacade::RecordStream(std::shared_ptr<phi::Allocation> allocation,
   GetPrivate()->RecordStream(allocation, stream);
 }
 
+const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
+    const platform::Place& place, const gpuStream_t& stream) {
+  if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
+      FLAGS_use_system_allocator == false) {
+    return GetPrivate()->GetAllocator(place, stream,
+                                      /*create_if_not_found=*/true);
+  }
+  return GetPrivate()->GetAllocator(
+      place, /* A non-zero num to choose allocator_ */ 1);
+}
+
 const gpuStream_t& AllocatorFacade::GetStream(
     const std::shared_ptr<phi::Allocation>& allocation) const {
   PADDLE_ENFORCE_EQ(
@@ -1040,6 +1087,13 @@ const gpuStream_t& AllocatorFacade::GetStream(
   return GetPrivate()->GetStream(allocation);
 }
 
+void AllocatorFacade::SetDefaultStream(const platform::CUDAPlace& place,
+                                       const gpuStream_t& stream) {
+  if (FLAGS_use_stream_safe_cuda_allocator) {
+    GetPrivate()->SetDefaultStream(place, stream);
+  }
+}
+
 #ifdef PADDLE_WITH_CUDA
 void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) {
   PADDLE_ENFORCE_EQ(GetAllocatorStrategy(), AllocatorStrategy::kAutoGrowth,
@@ -1055,6 +1109,8 @@ void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) {
           "The memory pool of the CUDA Graph with ID %d have been prepared.",
           id));
   allocator.reset(new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false));
+  allocator->SetDefaultStreamFromDeviceContext();
+
   VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id;
 }
 
diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h
index 9066bb284e2..1ea872f7eca 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.h
+++ b/paddle/fluid/memory/allocation/allocator_facade.h
@@ -55,11 +55,6 @@ class AllocatorFacade {
 
   void* GetBasePtr(const std::shared_ptr<Allocation>& allocation);
 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  const std::shared_ptr<Allocator>& GetAllocator(const platform::Place& place,
-                                                 const gpuStream_t& stream);
-#endif
-
   const std::shared_ptr<Allocator>& GetZeroAllocator(
       const platform::Place& place);
 
@@ -86,8 +81,12 @@ class AllocatorFacade {
   uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream);
   void RecordStream(std::shared_ptr<Allocation> allocation,
                     const gpuStream_t& stream);
+  const std::shared_ptr<Allocator>& GetAllocator(const platform::Place& place,
+                                                 const gpuStream_t& stream);
   const gpuStream_t& GetStream(
       const std::shared_ptr<Allocation>& allocation) const;
+  void SetDefaultStream(const platform::CUDAPlace& place,
+                        const gpuStream_t& stream);
 #endif
 
 #ifdef PADDLE_WITH_CUDA
diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc
index 072c4dee3bc..7e47d35176b 100644
--- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc
+++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc
@@ -154,6 +154,14 @@ StreamSafeCUDAAllocator::~StreamSafeCUDAAllocator() {
 
 bool StreamSafeCUDAAllocator::IsAllocThreadSafe() const { return true; }
 
+const gpuStream_t& StreamSafeCUDAAllocator::GetDefaultStream() const {
+  return default_stream_;
+}
+
+void StreamSafeCUDAAllocator::SetDefaultStream(const gpuStream_t& stream) {
+  default_stream_ = stream;
+}
+
 phi::Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) {
   platform::RecordEvent("StreamSafeCUDAAllocator::Allocate",
                         platform::TracerEventType::UserDefined, 9 /*level*/);
@@ -187,12 +195,8 @@ void StreamSafeCUDAAllocator::FreeImpl(phi::Allocation* allocation) {
   platform::RecordEvent("StreamSafeCUDAAllocator::Free",
                         platform::TracerEventType::UserDefined, 9 /*level*/);
   StreamSafeCUDAAllocation* stream_safe_cuda_allocation =
-      dynamic_cast<StreamSafeCUDAAllocation*>(allocation);
-  PADDLE_ENFORCE_NOT_NULL(stream_safe_cuda_allocation,
-                          platform::errors::InvalidArgument(
-                              "Failed to dynamic cast %p from Allocation* to "
-                              "StreamSafeCUDAAllocation*",
-                              allocation));
+      static_cast<StreamSafeCUDAAllocation*>(allocation);
+
   VLOG(8) << "Try free allocation " << stream_safe_cuda_allocation->ptr();
   if (stream_safe_cuda_allocation->CanBeFreed()) {
     VLOG(9) << "Directly delete allocation";
@@ -221,6 +225,12 @@ uint64_t StreamSafeCUDAAllocator::ReleaseImpl(const platform::Place& place) {
 }
 
 void StreamSafeCUDAAllocator::ProcessUnfreedAllocations() {
+  // NOTE(Ruibiao): This condition is a heuristic to reduce lock contention.
+  // It does not need to be thread-safe, since occasional misjudgments here
+  // are permissible.
+  if (unfreed_allocations_.empty()) {
+    return;
+  }
+
   std::lock_guard<SpinLock> lock_guard(unfreed_allocation_lock_);
   for (auto it = unfreed_allocations_.begin();
        it != unfreed_allocations_.end();) {
diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h
index ecddff97c20..65af32c701b 100644
--- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h
+++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h
@@ -64,7 +64,10 @@ class StreamSafeCUDAAllocator
                           platform::CUDAPlace place, gpuStream_t default_stream,
                           bool in_cuda_graph_capturing = false);
   ~StreamSafeCUDAAllocator();
+
   bool IsAllocThreadSafe() const override;
+  const gpuStream_t &GetDefaultStream() const;
+  void SetDefaultStream(const gpuStream_t &stream);
 
  protected:
   phi::Allocation *AllocateImpl(size_t size) override;
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 18ac979b48e..5605d326f2c 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -159,10 +159,8 @@ inline void EmplaceDeviceContext(
             cuda_ctx,
             platform::errors::InvalidArgument(
                 "Failed to dynamic_cast dev_ctx into CUDADeviceContext."));
-        // Note: A trick method to init context, why GetAllocator interface
-        // needs a stream parameter?
         dev_ctx->SetAllocator(memory::allocation::AllocatorFacade::Instance()
-                                  .GetAllocator(p, cuda_ctx->stream())
+                                  .GetAllocator(p)
                                  .get());
         cuda_ctx->PartialInitWithAllocator();
         dev_ctx->SetGenerator(
@@ -517,10 +515,10 @@ CUDAContext::~CUDAContext() {
 CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : phi::GPUContext(place) {
   phi::GPUContext::PartialInitWithoutAllocator();
   cuda_stream_.reset(new stream::CUDAStream(phi::GPUContext::stream(), place));
-  workspace_.reset(new phi::DnnWorkspaceHandle(
-      memory::allocation::AllocatorFacade::Instance()
-          .GetAllocator(place, phi::GPUContext::stream())
-          .get()));
+  auto& instance = memory::allocation::AllocatorFacade::Instance();
+  instance.SetDefaultStream(place, phi::GPUContext::stream());
+  workspace_.reset(
+      new phi::DnnWorkspaceHandle(instance.GetAllocator(place).get()));
 }
 
 CUDADeviceContext::~CUDADeviceContext() = default;
@@ -618,7 +616,7 @@ phi::DnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const {
     // return workspace_.get();
     return phi::DnnWorkspaceHandle(
         memory::allocation::AllocatorFacade::Instance()
-            .GetAllocator(GetPlace(), phi::GPUContext::stream())
+            .GetAllocator(GetPlace())
            .get());
   }
  return phi::GPUContext::cudnn_workspace_handle();
-- 
GitLab
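
A note on the GetAllocator locking scheme above: the patch combines two
techniques, a fast path that returns the default-stream allocator without
touching the per-stream map, and read-mostly double-checked locking (a shared
lock for lookups, upgraded to a unique lock only when a per-stream allocator
must be created). The standalone C++ sketch below illustrates the same
pattern; StreamId, Allocator, and Registry are hypothetical stand-ins, not
Paddle's actual types.

    #include <map>
    #include <memory>
    #include <mutex>
    #include <shared_mutex>

    using StreamId = int;   // stand-in for gpuStream_t
    struct Allocator {};    // stand-in for StreamSafeCUDAAllocator

    class Registry {
     public:
      explicit Registry(StreamId default_stream)
          : default_stream_(default_stream),
            default_allocator_(std::make_shared<Allocator>()) {}

      std::shared_ptr<Allocator> Get(StreamId stream) {
        // Fast path: most Alloc calls target the default stream, so they
        // never touch the map or the read-write lock at all.
        if (stream == default_stream_) {
          return default_allocator_;
        }

        /* shared_lock_guard */ {
          std::shared_lock<std::shared_timed_mutex> guard(mutex_);
          auto it = allocators_.find(stream);
          if (it != allocators_.end()) {
            return it->second;
          }
        }

        /* unique_lock_guard */ {
          std::unique_lock<std::shared_timed_mutex> guard(mutex_);
          // Re-check under the exclusive lock: another thread may have
          // created the entry between the two locking phases.
          auto it = allocators_.find(stream);
          if (it == allocators_.end()) {
            it = allocators_.emplace(stream,
                                     std::make_shared<Allocator>()).first;
          }
          return it->second;
        }
      }

     private:
      StreamId default_stream_;
      std::shared_ptr<Allocator> default_allocator_;
      std::map<StreamId, std::shared_ptr<Allocator>> allocators_;
      std::shared_timed_mutex mutex_;
    };

As in the patch, the re-check under the unique lock is what makes the lazy
creation safe: two threads can both miss during the shared-lock phase, but
only the first one to acquire the exclusive lock creates the allocator.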
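The ProcessUnfreedAllocations change uses a different contention-reduction
trick: read the (possibly stale) empty state before taking the lock at all.
Below is a minimal sketch of that shape, assuming a hypothetical
DeferredFreeList type with std::mutex standing in for Paddle's SpinLock.

    #include <list>
    #include <mutex>

    class DeferredFreeList {
     public:
      void Add(void* ptr) {
        std::lock_guard<std::mutex> guard(lock_);
        unfreed_.push_back(ptr);
      }

      void ProcessUnfreed() {
        // Unlocked fast path: most calls see an empty list and return
        // without touching the lock. A stale read is tolerated here; a
        // missed entry is simply handled by a later call.
        if (unfreed_.empty()) {
          return;
        }
        std::lock_guard<std::mutex> guard(lock_);
        for (auto it = unfreed_.begin(); it != unfreed_.end();) {
          if (CanBeFreed(*it)) {
            it = unfreed_.erase(it);  // drop entries whose GPU work finished
          } else {
            ++it;
          }
        }
      }

     private:
      // Stand-in for the real per-allocation readiness check (in the patch,
      // StreamSafeCUDAAllocation::CanBeFreed queries recorded GPU events).
      static bool CanBeFreed(void* /*ptr*/) { return true; }

      std::list<void*> unfreed_;
      std::mutex lock_;
    };

Strictly speaking, the unlocked empty() read is a data race in the C++ memory
model; the patch tolerates it because, as its NOTE says, a misjudgment only
defers cleanup to a later call and the list is only mutated under the lock.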