From 4c0511faa406cde9db59f1233f6791e0e7c4098d Mon Sep 17 00:00:00 2001 From: From00 Date: Thu, 3 Mar 2022 15:17:01 +0800 Subject: [PATCH] Support cuda graph in StreamSafeCudaAllocator (#39594) * Support cuda graph in StreamSafeCudaAllocator * Fix CI error * Arrange AllocatorFacade * Fix CI error * Fix CI error * Fix ROCM Compile error * Fix ROCM Compile error --- paddle/fluid/memory/allocation/CMakeLists.txt | 2 +- .../memory/allocation/allocator_facade.cc | 276 +++++------- .../memory/allocation/allocator_facade.h | 11 +- .../allocation/stream_safe_cuda_allocator.cc | 120 +++-- .../allocation/stream_safe_cuda_allocator.h | 28 +- paddle/fluid/memory/malloc.cc | 10 +- paddle/fluid/memory/malloc.h | 6 +- .../memory/stream_safe_cuda_alloc_test.cu | 409 +++++++++--------- 8 files changed, 436 insertions(+), 426 deletions(-) diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 6cd7d873323..a7a417c29a7 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -17,7 +17,7 @@ if (WITH_GPU) nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard) nv_library(cuda_managed_allocator SRCS cuda_managed_allocator.cc DEPS allocator cuda_device_guard gpu_info) nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator) - nv_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator) + nv_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator cuda_graph) nv_library(thread_local_allocator SRCS thread_local_allocator.cc DEPS allocator) cc_test(thread_local_allocator_test SRCS thread_local_allocator_test.cc DEPS thread_local_allocator) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 6b7828236a8..61e292a922f 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -210,12 +210,7 @@ class AllocatorFacadePrivate { InitNaiveBestFitCPUAllocator(); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) allow_free_idle_chunk_ = allow_free_idle_chunk; - if (FLAGS_use_stream_safe_cuda_allocator) { - for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); - ++dev_id) { - InitStreamSafeCUDAAllocator(platform::CUDAPlace(dev_id), nullptr); - } - } else { + if (!FLAGS_use_stream_safe_cuda_allocator) { for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id), @@ -298,6 +293,12 @@ class AllocatorFacadePrivate { } CheckAllocThreadSafe(); + +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) { + WrapCUDAGraphAllocator(); + } +#endif } inline const std::shared_ptr& GetAllocator( @@ -388,39 +389,6 @@ class AllocatorFacadePrivate { allocation.get())); return stream_safe_cuda_allocation->GetOwningStream(); } - -#ifdef PADDLE_WITH_CUDA - void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) { - PADDLE_ENFORCE_EQ(strategy_, AllocatorStrategy::kAutoGrowth, - platform::errors::InvalidArgument( - "CUDA Graph is only supported when the " - "FLAGS_allocator_strategy=\"auto_growth\", but got " - "FLAGS_allocator_strategy=\"%s\"", - FLAGS_allocator_strategy)); - auto& allocator = cuda_graph_allocator_map_[id]; - PADDLE_ENFORCE_EQ( - allocator.get(), nullptr, - platform::errors::InvalidArgument( - "The memory pool of the CUDA Graph with ID %d have been prepared.", - id)); - 
allocator.reset( - new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false)); - for (auto& item : allocator->allocators_) { - auto& old_allocator = item.second; - old_allocator = CUDAGraphAllocator::Create(old_allocator); - } - VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id; - } - - void RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) { - auto iter = cuda_graph_allocator_map_.find(id); - PADDLE_ENFORCE_NE(iter, cuda_graph_allocator_map_.end(), - platform::errors::InvalidArgument( - "Cannot find CUDA Graph with ID = %d", id)); - cuda_graph_allocator_map_.erase(iter); - VLOG(10) << "Remove memory pool of CUDA Graph with ID " << id; - } -#endif #endif private: @@ -439,24 +407,7 @@ class AllocatorFacadePrivate { platform::Place place_; }; - const AllocatorMap& GetAllocatorMap() { -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) { - auto id = platform::CUDAGraph::CapturingID(); - auto iter = cuda_graph_allocator_map_.find(id); - PADDLE_ENFORCE_NE( - iter, cuda_graph_allocator_map_.end(), - platform::errors::PermissionDenied( - "No memory pool is prepared for CUDA Graph capturing.")); - VLOG(10) << "Choose CUDA Graph memory pool to allocate memory"; - return iter->second->allocators_; - } else { - return allocators_; - } -#else - return allocators_; -#endif - } + const AllocatorMap& GetAllocatorMap() { return allocators_; } void InitNaiveBestFitCPUAllocator() { allocators_[platform::CPUPlace()] = @@ -672,10 +623,10 @@ class AllocatorFacadePrivate { } void WrapStreamSafeCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) { - const std::shared_ptr& underlying_allocator = - cuda_allocators_[p][stream]; - cuda_allocators_[p][stream] = std::make_shared( - underlying_allocator, p, stream); + std::shared_ptr& allocator = cuda_allocators_[p][stream]; + allocator = std::make_shared( + allocator, p, stream, + /* in_cuda_graph_capturing = */ !allow_free_idle_chunk_); } void WrapCUDARetryAllocator(platform::CUDAPlace p, gpuStream_t stream, @@ -684,10 +635,19 @@ class AllocatorFacadePrivate { retry_time, 0, platform::errors::InvalidArgument( "Retry time should be larger than 0, but got %d", retry_time)); - std::shared_ptr allocator = cuda_allocators_[p][stream]; + std::shared_ptr& allocator = cuda_allocators_[p][stream]; allocator = std::make_shared(allocator, retry_time); } +#ifdef PADDLE_WITH_CUDA + void WrapCUDAGraphAllocator() { + for (auto& item : allocators_) { + auto& allocator = item.second; + allocator = CUDAGraphAllocator::Create(allocator); + } + } +#endif + static void CheckCUDAAllocThreadSafe(const CUDAAllocatorMap& allocators) { for (auto& place_pair : allocators) { for (auto& stream_pair : place_pair.second) { @@ -864,10 +824,6 @@ class AllocatorFacadePrivate { // a standalone CUDA allocator to support multi-stream GC in new executor CUDAAllocatorMap cuda_allocators_; std::shared_timed_mutex cuda_allocator_mutex_; -#ifdef PADDLE_WITH_CUDA - std::unordered_map> - cuda_graph_allocator_map_; -#endif #endif AllocatorStrategy strategy_; AllocatorMap allocators_; @@ -886,8 +842,24 @@ AllocatorFacade::AllocatorFacade() : m_(new AllocatorFacadePrivate()) {} AllocatorFacade::~AllocatorFacade() {} AllocatorFacade& AllocatorFacade::Instance() { - static AllocatorFacade instance; - return instance; + static AllocatorFacade* instance = new AllocatorFacade; + return *instance; +} + +AllocatorFacadePrivate* AllocatorFacade::GetPrivate() const { +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) { + auto id = 
platform::CUDAGraph::CapturingID(); + auto iter = cuda_graph_map_.find(id); + PADDLE_ENFORCE_NE( + iter, cuda_graph_map_.end(), + platform::errors::PermissionDenied( + "No memory pool is prepared for CUDA Graph capturing.")); + VLOG(10) << "Choose CUDA Graph memory pool"; + return iter->second.get(); + } +#endif + return m_; } const std::shared_ptr& AllocatorFacade::GetAllocator( @@ -895,19 +867,14 @@ const std::shared_ptr& AllocatorFacade::GetAllocator( #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) && FLAGS_use_system_allocator == false) { -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { - return m_->GetAllocator(place, - /* A non-zero num to choose allocator_ */ 1); - } -#endif - + AllocatorFacadePrivate* m = GetPrivate(); platform::CUDAPlace cuda_place(place.GetDeviceId()); - return m_->GetAllocator(cuda_place, m_->GetDefaultStream(cuda_place)); + return m->GetAllocator(cuda_place, m->GetDefaultStream(cuda_place)); } #endif - return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1); + return GetPrivate()->GetAllocator( + place, /* A non-zero num to choose allocator_ */ 1); } void* AllocatorFacade::GetBasePtr( @@ -922,7 +889,7 @@ void* AllocatorFacade::GetBasePtr( "GetBasePtr() is only implemented for CUDAPlace(), not " "suppot place: %s", allocation->place())); - return m_->GetBasePtr(allocation); + return GetPrivate()->GetBasePtr(allocation); } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -930,21 +897,17 @@ const std::shared_ptr& AllocatorFacade::GetAllocator( const platform::Place& place, const gpuStream_t& stream) { if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) && FLAGS_use_system_allocator == false) { -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { - return m_->GetAllocator(place, - /* A non-zero num to choose allocator_ */ 1); - } -#endif - return m_->GetAllocator(place, stream, /*create_if_not_found=*/true); + return GetPrivate()->GetAllocator(place, stream, + /*create_if_not_found=*/true); } - return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1); + return GetPrivate()->GetAllocator( + place, /* A non-zero num to choose allocator_ */ 1); } #endif const std::shared_ptr& AllocatorFacade::GetZeroAllocator( const platform::Place& place) { - return m_->GetAllocator(place, /* zero size */ 0); + return GetPrivate()->GetAllocator(place, /* zero size */ 0); } std::shared_ptr AllocatorFacade::AllocShared( @@ -957,43 +920,30 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) && size > 0 && FLAGS_use_system_allocator == false) { -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { - return m_->GetAllocator(place, size)->Allocate(size); - } -#endif - platform::CUDAPlace cuda_place(place.GetDeviceId()); - return Alloc(cuda_place, size, m_->GetDefaultStream(cuda_place)); + phi::Stream default_stream = phi::Stream(reinterpret_cast( + GetPrivate()->GetDefaultStream(cuda_place))); + return Alloc(cuda_place, size, default_stream); } #endif - - return m_->GetAllocator(place, size)->Allocate(size); + return GetPrivate()->GetAllocator(place, size)->Allocate(size); } uint64_t AllocatorFacade::Release(const platform::Place& place) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if 
(FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) && FLAGS_use_system_allocator == false) { -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { - return m_ - ->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1) - ->Release(place); - } -#endif - platform::CUDAPlace cuda_place(place.GetDeviceId()); - return Release(cuda_place, m_->GetDefaultStream(cuda_place)); + return Release(cuda_place, GetPrivate()->GetDefaultStream(cuda_place)); } #endif - return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1) + return GetPrivate() + ->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1) ->Release(place); } std::shared_ptr AllocatorFacade::AllocShared( const platform::Place& place, size_t size, const phi::Stream& stream) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_EQ( FLAGS_use_stream_safe_cuda_allocator, true, platform::errors::Unimplemented( @@ -1001,71 +951,53 @@ std::shared_ptr AllocatorFacade::AllocShared( "multi-stream 'AllocaShared' function. To enable it, you can enter" "'export FLAGS_use_stream_safe_cuda_allocator=true' in the " "terminal.")); - -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { - PADDLE_THROW(platform::errors::Unavailable( - "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); - } -#endif - gpuStream_t s = reinterpret_cast(stream.id()); - return std::shared_ptr(Alloc(place, size, s)); -#else - PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU.")); -#endif + return std::shared_ptr(Alloc(place, size, stream)); } -bool AllocatorFacade::InSameStream( - const std::shared_ptr& allocation, - const phi::Stream& stream) { +AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size, + const phi::Stream& stream) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_EQ( FLAGS_use_stream_safe_cuda_allocator, true, platform::errors::Unimplemented( "StreamSafeCUDAAllocator is disabled, you should not call this " - "multi-stream 'InSameStream' function. To enable it, you can enter" + "multi-stream 'Alloc' function. To enable it, you can enter" "'export FLAGS_use_stream_safe_cuda_allocator=true' in the " "terminal.")); -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { - PADDLE_THROW(platform::errors::Unavailable( - "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); + platform::CUDAPlace p(place.GetDeviceId()); + if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) { + gpuStream_t s = reinterpret_cast(stream.id()); + return GetPrivate() + ->GetAllocator(p, s, /* create_if_not_found = */ true) + ->Allocate(size); + } else { + return GetPrivate()->GetAllocator(p, size)->Allocate(size); } -#endif - gpuStream_t s = reinterpret_cast(stream.id()); - return s == GetStream(allocation); #else PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU.")); #endif } +bool AllocatorFacade::InSameStream( + const std::shared_ptr& allocation, + const phi::Stream& stream) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size, - const gpuStream_t& stream) { PADDLE_ENFORCE_EQ( FLAGS_use_stream_safe_cuda_allocator, true, platform::errors::Unimplemented( "StreamSafeCUDAAllocator is disabled, you should not call this " - "multi-stream 'Alloc' function. To enable it, you can enter" + "multi-stream 'InSameStream' function. 
To enable it, you can enter" "'export FLAGS_use_stream_safe_cuda_allocator=true' in the " "terminal.")); - -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { - PADDLE_THROW(platform::errors::Unavailable( - "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); - } + gpuStream_t s = reinterpret_cast(stream.id()); + return s == GetStream(allocation); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU.")); #endif - platform::CUDAPlace p(place.GetDeviceId()); - if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) { - return m_->GetAllocator(p, stream, /* create_if_not_found = */ true) - ->Allocate(size); - } else { - return m_->GetAllocator(p, size)->Allocate(size); - } } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place, const gpuStream_t& stream) { PADDLE_ENFORCE_EQ( @@ -1075,15 +1007,7 @@ uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place, "multi-stream 'Release' function. To enable it, you can enter" "'export FLAGS_use_stream_safe_cuda_allocator=true' in the " "terminal.")); - -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { - PADDLE_THROW(platform::errors::Unavailable( - "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); - } -#endif - - return m_->GetAllocator(place, stream)->Release(place); + return GetPrivate()->GetAllocator(place, stream)->Release(place); } void AllocatorFacade::RecordStream(std::shared_ptr allocation, @@ -1095,15 +1019,7 @@ void AllocatorFacade::RecordStream(std::shared_ptr allocation, "'RecordStream' function. To enable it, you can enter" "'export FLAGS_use_stream_safe_cuda_allocator=true' in the " "terminal.")); - -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { - PADDLE_THROW(platform::errors::Unavailable( - "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); - } -#endif - - m_->RecordStream(allocation, stream); + GetPrivate()->RecordStream(allocation, stream); } const gpuStream_t& AllocatorFacade::GetStream( @@ -1115,24 +1031,34 @@ const gpuStream_t& AllocatorFacade::GetStream( "'GetStream' function. 
To enable it, you can enter" "'export FLAGS_use_stream_safe_cuda_allocator=true' in the " "terminal.")); - -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { - PADDLE_THROW(platform::errors::Unavailable( - "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); - } -#endif - - return m_->GetStream(allocation); + return GetPrivate()->GetStream(allocation); } #ifdef PADDLE_WITH_CUDA void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) { - return m_->PrepareMemoryPoolForCUDAGraph(id); + PADDLE_ENFORCE_EQ(GetAllocatorStrategy(), AllocatorStrategy::kAutoGrowth, + platform::errors::InvalidArgument( + "CUDA Graph is only supported when the " + "FLAGS_allocator_strategy=\"auto_growth\", but got " + "FLAGS_allocator_strategy=\"%s\"", + FLAGS_allocator_strategy)); + auto& allocator = cuda_graph_map_[id]; + PADDLE_ENFORCE_EQ( + allocator.get(), nullptr, + platform::errors::InvalidArgument( + "The memory pool of the CUDA Graph with ID %d have been prepared.", + id)); + allocator.reset(new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false)); + VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id; } void AllocatorFacade::RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) { - return m_->RemoveMemoryPoolOfCUDAGraph(id); + auto iter = cuda_graph_map_.find(id); + PADDLE_ENFORCE_NE(iter, cuda_graph_map_.end(), + platform::errors::InvalidArgument( + "Cannot find CUDA Graph with ID = %d", id)); + cuda_graph_map_.erase(iter); + VLOG(10) << "Remove memory pool of CUDA Graph with ID " << id; } #endif #endif diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index 1722a06b01f..9066bb284e2 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -49,6 +49,8 @@ class AllocatorFacade { static AllocatorFacade& Instance(); + AllocatorFacadePrivate* GetPrivate() const; + const std::shared_ptr& GetAllocator(const platform::Place& place); void* GetBasePtr(const std::shared_ptr& allocation); @@ -73,13 +75,14 @@ class AllocatorFacade { size_t size, const phi::Stream& stream); + AllocationPtr Alloc(const platform::Place& place, size_t size, + const phi::Stream& stream); + bool InSameStream(const std::shared_ptr& allocation, const phi::Stream& stream); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // TODO(zhiqiu): change gpuStream_t to phi::Stream if needed. 
- AllocationPtr Alloc(const platform::Place& place, size_t size, - const gpuStream_t& stream); uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream); void RecordStream(std::shared_ptr allocation, const gpuStream_t& stream); @@ -96,6 +99,10 @@ class AllocatorFacade { private: AllocatorFacade(); AllocatorFacadePrivate* m_; +#ifdef PADDLE_WITH_CUDA + std::unordered_map> + cuda_graph_map_; +#endif }; } // namespace allocation diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc index 8627e3e6f88..072c4dee3bc 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc @@ -15,56 +15,52 @@ #include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h" #include "paddle/fluid/platform/profiler/event_tracing.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h" +#endif + namespace paddle { namespace memory { namespace allocation { StreamSafeCUDAAllocation::StreamSafeCUDAAllocation( - DecoratedAllocationPtr underlying_allocation, gpuStream_t owning_stream) + DecoratedAllocationPtr underlying_allocation, gpuStream_t owning_stream, + StreamSafeCUDAAllocator* allocator) : Allocation(underlying_allocation->ptr(), underlying_allocation->base_ptr(), underlying_allocation->size(), underlying_allocation->place()), underlying_allocation_(std::move(underlying_allocation)), - owning_stream_(std::move(owning_stream)) {} + owning_stream_(std::move(owning_stream)), + allocator_(allocator->shared_from_this()) {} void StreamSafeCUDAAllocation::RecordStream(const gpuStream_t& stream) { VLOG(8) << "Try record stream " << stream << " for address " << ptr(); if (stream == owning_stream_) { - VLOG(9) << "Record the same stream of " << stream; return; } std::lock_guard lock_guard(outstanding_event_map_lock_); - gpuEvent_t record_event; - auto it = outstanding_event_map_.find(stream); - if (it == outstanding_event_map_.end()) { - gpuEvent_t new_event; #ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_GPU_SUCCESS( - cudaEventCreateWithFlags(&new_event, cudaEventDisableTiming)); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - hipEventCreateWithFlags(&new_event, hipEventDisableTiming)); -#endif - outstanding_event_map_[stream] = new_event; - record_event = new_event; - VLOG(9) << "Create a new event " << new_event; - } else { - record_event = it->second; - VLOG(9) << "Reuse event " << record_event; + if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) { + graph_capturing_stream_set_.insert(stream); + return; } - -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(record_event, stream)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(record_event, stream)); #endif - VLOG(8) << "Record event " << record_event << " to stream " << stream; + + RecordStreamWithNoGraphCapturing(stream); + RecordGraphCapturingStreams(); } bool StreamSafeCUDAAllocation::CanBeFreed() { - // NOTE(Ruibiao): This function will not execute concurrently, - // so outstanding_event_lock_ is not required here +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) { + return graph_capturing_stream_set_.empty() && + outstanding_event_map_.empty(); + } +#endif + + RecordGraphCapturingStreams(); + for (auto it = outstanding_event_map_.begin(); it != outstanding_event_map_.end(); ++it) { gpuEvent_t& event = it->second; @@ -98,21 +94,62 @@ const gpuStream_t& 
StreamSafeCUDAAllocation::GetOwningStream() const { return owning_stream_; } +void StreamSafeCUDAAllocation::RecordGraphCapturingStreams() { + for (gpuStream_t stream : graph_capturing_stream_set_) { + RecordStreamWithNoGraphCapturing(stream); + } + graph_capturing_stream_set_.clear(); +} + +void StreamSafeCUDAAllocation::RecordStreamWithNoGraphCapturing( + const gpuStream_t& stream) { + gpuEvent_t record_event; + auto it = outstanding_event_map_.find(stream); + if (it == outstanding_event_map_.end()) { + gpuEvent_t new_event; +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_GPU_SUCCESS( + cudaEventCreateWithFlags(&new_event, cudaEventDisableTiming)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + hipEventCreateWithFlags(&new_event, hipEventDisableTiming)); +#endif + outstanding_event_map_[stream] = new_event; + record_event = new_event; + VLOG(9) << "Create a new event " << new_event; + } else { + record_event = it->second; + VLOG(9) << "Reuse event " << record_event; + } + +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(record_event, stream)); +#else + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(record_event, stream)); +#endif + VLOG(8) << "Record event " << record_event << " to stream " << stream; +} + StreamSafeCUDAAllocator::StreamSafeCUDAAllocator( std::shared_ptr underlying_allocator, platform::CUDAPlace place, - gpuStream_t default_stream) + gpuStream_t default_stream, bool in_cuda_graph_capturing) : underlying_allocator_(std::move(underlying_allocator)), place_(std::move(place)), - default_stream_(std::move(default_stream)) { - std::lock_guard lock_guard(allocator_map_lock_); - allocator_map_[place].emplace_back(this); + default_stream_(std::move(default_stream)), + in_cuda_graph_capturing_(in_cuda_graph_capturing) { + if (LIKELY(!in_cuda_graph_capturing)) { + std::lock_guard lock_guard(allocator_map_lock_); + allocator_map_[place].emplace_back(this); + } } StreamSafeCUDAAllocator::~StreamSafeCUDAAllocator() { - std::lock_guard lock_guard(allocator_map_lock_); - std::vector& allocators = allocator_map_[place_]; - allocators.erase(std::remove(allocators.begin(), allocators.end(), this), - allocators.end()); + if (LIKELY(!in_cuda_graph_capturing_)) { + std::lock_guard lock_guard(allocator_map_lock_); + std::vector& allocators = allocator_map_[place_]; + allocators.erase(std::remove(allocators.begin(), allocators.end(), this), + allocators.end()); + } } bool StreamSafeCUDAAllocator::IsAllocThreadSafe() const { return true; } @@ -140,7 +177,7 @@ phi::Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) { } StreamSafeCUDAAllocation* allocation = new StreamSafeCUDAAllocation( static_unique_ptr_cast(std::move(underlying_allocation)), - default_stream_); + default_stream_, this); VLOG(8) << "Allocate " << allocation->size() << " bytes at address " << allocation->ptr(); return allocation; @@ -157,22 +194,27 @@ void StreamSafeCUDAAllocator::FreeImpl(phi::Allocation* allocation) { "StreamSafeCUDAAllocation*", allocation)); VLOG(8) << "Try free allocation " << stream_safe_cuda_allocation->ptr(); - std::lock_guard lock_guard(unfreed_allocation_lock_); if (stream_safe_cuda_allocation->CanBeFreed()) { VLOG(9) << "Directly delete allocation"; delete stream_safe_cuda_allocation; } else { VLOG(9) << "Put into unfreed_allocation list"; + std::lock_guard lock_guard(unfreed_allocation_lock_); unfreed_allocations_.emplace_back(stream_safe_cuda_allocation); } } uint64_t StreamSafeCUDAAllocator::ReleaseImpl(const platform::Place& place) { + if (UNLIKELY(in_cuda_graph_capturing_)) { + 
VLOG(7) << "Memory release forbidden in CUDA Graph Captruing"; + return 0; + } + std::lock_guard lock_guard(allocator_map_lock_); std::vector& allocators = allocator_map_[place]; uint64_t released_size = 0; for (StreamSafeCUDAAllocator* allocator : allocators) { - released_size += allocator->ProcessUnfreedAllocationsWithRelease(); + released_size += allocator->ProcessUnfreedAllocationsAndRelease(); } VLOG(8) << "Release " << released_size << " bytes memory from all streams"; return released_size; @@ -191,7 +233,7 @@ void StreamSafeCUDAAllocator::ProcessUnfreedAllocations() { } } -uint64_t StreamSafeCUDAAllocator::ProcessUnfreedAllocationsWithRelease() { +uint64_t StreamSafeCUDAAllocator::ProcessUnfreedAllocationsAndRelease() { ProcessUnfreedAllocations(); return underlying_allocator_->Release(place_); } diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h index 7354836308c..ecddff97c20 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h @@ -14,10 +14,9 @@ #pragma once -#include #include #include -#include +#include #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/allocation/spin_lock.h" #include "paddle/fluid/platform/place.h" @@ -32,27 +31,38 @@ namespace paddle { namespace memory { namespace allocation { +class StreamSafeCUDAAllocator; + class StreamSafeCUDAAllocation : public Allocation { public: StreamSafeCUDAAllocation(DecoratedAllocationPtr underlying_allocation, - gpuStream_t owning_stream); + gpuStream_t owning_stream, + StreamSafeCUDAAllocator *allocator); + void RecordStream(const gpuStream_t &stream); bool CanBeFreed(); - const gpuStream_t &GetOwningStream() const; private: + void RecordGraphCapturingStreams(); + void RecordStreamWithNoGraphCapturing(const gpuStream_t &stream); DecoratedAllocationPtr underlying_allocation_; + std::set graph_capturing_stream_set_; std::map outstanding_event_map_; gpuStream_t owning_stream_; SpinLock outstanding_event_map_lock_; + // To compatiable with CUDA Graph, hold the allocator shared_ptr so that + // Allocator will not deconstruct before Allocation + std::shared_ptr allocator_; }; -class StreamSafeCUDAAllocator : public Allocator { +class StreamSafeCUDAAllocator + : public Allocator, + public std::enable_shared_from_this { public: StreamSafeCUDAAllocator(std::shared_ptr underlying_allocator, - platform::CUDAPlace place, - gpuStream_t default_stream); + platform::CUDAPlace place, gpuStream_t default_stream, + bool in_cuda_graph_capturing = false); ~StreamSafeCUDAAllocator(); bool IsAllocThreadSafe() const override; @@ -63,7 +73,7 @@ class StreamSafeCUDAAllocator : public Allocator { private: void ProcessUnfreedAllocations(); - uint64_t ProcessUnfreedAllocationsWithRelease(); + uint64_t ProcessUnfreedAllocationsAndRelease(); static std::map> allocator_map_; @@ -74,6 +84,8 @@ class StreamSafeCUDAAllocator : public Allocator { gpuStream_t default_stream_; std::list unfreed_allocations_; SpinLock unfreed_allocation_lock_; + + bool in_cuda_graph_capturing_; }; } // namespace allocation diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc index b60bb4fc1d1..2bca2c388a0 100644 --- a/paddle/fluid/memory/malloc.cc +++ b/paddle/fluid/memory/malloc.cc @@ -41,6 +41,11 @@ std::shared_ptr AllocShared(const platform::Place& place, stream); } +AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size, + const phi::Stream& 
stream) { + return allocation::AllocatorFacade::Instance().Alloc(place, size, stream); +} + bool InSameStream(const std::shared_ptr& allocation, const phi::Stream& stream) { return allocation::AllocatorFacade::Instance().InSameStream(allocation, @@ -52,11 +57,6 @@ void* GetBasePtr(const std::shared_ptr& allocation) { } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size, - const gpuStream_t& stream) { - return allocation::AllocatorFacade::Instance().Alloc(place, size, stream); -} - uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream) { return allocation::AllocatorFacade::Instance().Release(place, stream); } diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h index 89b4caa5bed..601fe3f2a42 100644 --- a/paddle/fluid/memory/malloc.h +++ b/paddle/fluid/memory/malloc.h @@ -41,15 +41,15 @@ extern std::shared_ptr AllocShared(const platform::Place& place, size_t size, const phi::Stream& stream); +extern AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size, + const phi::Stream& stream); + extern bool InSameStream(const std::shared_ptr& allocation, const phi::Stream& stream); extern void* GetBasePtr(const std::shared_ptr& allocation); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -extern AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size, - const gpuStream_t& stream); - extern uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream); diff --git a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu index 933717f3090..5e4a4234bb4 100644 --- a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu +++ b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu @@ -12,34 +12,35 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifdef PADDLE_WITH_CUDA -#include -#include -#endif - -#ifdef PADDLE_WITH_HIP -#include -#endif - #include // NOLINT #include #include "gtest/gtest.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" -#include "paddle/fluid/memory/malloc.h" -#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" +#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/core/stream.h" +#ifdef PADDLE_WITH_CUDA +#include +#include +#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" +#endif + +#ifdef PADDLE_WITH_HIP +#include +#endif + namespace paddle { namespace memory { -__global__ void add_kernel(int *x, int n) { +// y += (x + 1) +__global__ void add_kernel(int *x, int *y, int n) { int thread_num = gridDim.x * blockDim.x; int thread_id = blockIdx.x * blockDim.x + threadIdx.x; for (int i = thread_id; i < n; i += thread_num) { - atomicAdd(x + i, thread_id); + y[i] += x[i] + 1; } } @@ -51,153 +52,6 @@ void CheckMemLeak(const platform::CUDAPlace &place) { << " there may be a memory leak problem"; } -class StreamSafeCUDAAllocTest : public ::testing::Test { - protected: - void SetUp() override { - place_ = platform::CUDAPlace(); - stream_num_ = 64; - grid_num_ = 1; - block_num_ = 32; - data_num_ = 131072; - workspace_size_ = data_num_ * sizeof(int); - - // alloc workspace for each stream - for (size_t i = 0; i < stream_num_; ++i) { - gpuStream_t stream; -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&stream)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&stream)); -#endif - - std::shared_ptr allocation = - AllocShared(place_, workspace_size_, - phi::Stream(reinterpret_cast(stream))); -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_GPU_SUCCESS( - cudaMemset(allocation->ptr(), 0, allocation->size())); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - hipMemset(allocation->ptr(), 0, allocation->size())); -#endif - - streams_.emplace_back(stream); - workspaces_.emplace_back(allocation); - } - - result_ = Alloc(place_, stream_num_ * workspace_size_); - } - - void SingleStreamRun(size_t idx) { - // for all stream i, - // stream idx lauch a kernel to add (j % thread_num) to workspaces_[i][j] - for (size_t i = 0; i < stream_num_; ++i) { - int *x = reinterpret_cast(workspaces_[i]->ptr()); - add_kernel<<>>(x, data_num_); - RecordStream(workspaces_[i], streams_[idx]); - } - } - - void CopyResultAsync() { - for (size_t i = 0; i < stream_num_; ++i) { -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync( - reinterpret_cast(result_->ptr()) + i * data_num_, - workspaces_[i]->ptr(), workspace_size_, cudaMemcpyDeviceToDevice)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync( - reinterpret_cast(result_->ptr()) + i * data_num_, - workspaces_[i]->ptr(), workspace_size_, hipMemcpyDeviceToDevice)); -#endif - } - } - - void MultiStreamRun() { - for (size_t i = 0; i < stream_num_; ++i) { - SingleStreamRun(i); - } - CopyResultAsync(); - workspaces_.clear(); // fast_gc - cudaDeviceSynchronize(); - } - - void MultiThreadMUltiStreamRun() { - std::vector threads; - for (size_t i = 0; i < stream_num_; ++i) { - threads.push_back( - std::thread(&StreamSafeCUDAAllocTest::SingleStreamRun, this, i)); - } - for (size_t i = 0; i < stream_num_; ++i) { - threads[i].join(); - } - CopyResultAsync(); - workspaces_.clear(); // fast_gc - cudaDeviceSynchronize(); - } - - void CheckResult() { - auto result_host = std::unique_ptr(new int[result_->size()]); -#ifdef PADDLE_WITH_CUDA - 
PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpy(result_host.get(), result_->ptr(), - result_->size(), - cudaMemcpyDeviceToHost)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpy(result_host.get(), result_->ptr(), - result_->size(), - hipMemcpyDeviceToHost)); -#endif - size_t thread_num = grid_num_ * block_num_; - for (size_t i = 0; i < stream_num_; ++i) { - for (size_t j = 0; j < data_num_; ++j) { - EXPECT_TRUE(result_host[i * stream_num_ + j] == - (j % thread_num) * stream_num_); - } - } - result_.reset(); - } - - void TearDown() override { -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); -#else - PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); -#endif - for (gpuStream_t stream : streams_) { - Release(place_, stream); - } - - for (size_t i = 1; i < stream_num_; ++i) { -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(streams_[i])); -#else - PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(streams_[i])); -#endif - } - - CheckMemLeak(place_); - } - - size_t stream_num_; - size_t grid_num_; - size_t block_num_; - size_t data_num_; - size_t workspace_size_; - platform::CUDAPlace place_; - std::vector streams_; - std::vector> workspaces_; - allocation::AllocationPtr result_; -}; - -TEST_F(StreamSafeCUDAAllocTest, CUDAMutilStreamTest) { - MultiStreamRun(); - CheckResult(); -} - -TEST_F(StreamSafeCUDAAllocTest, CUDAMutilThreadMutilStreamTest) { - MultiThreadMUltiStreamRun(); - CheckResult(); -} - TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) { platform::CUDAPlace place = platform::CUDAPlace(); size_t alloc_size = 256; @@ -214,7 +68,8 @@ TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) { paddle::platform::DeviceContextPool::Instance().Get(place)) ->stream(); allocation::AllocationPtr allocation_unique = - Alloc(place, alloc_size, default_stream); + Alloc(place, alloc_size, + phi::Stream(reinterpret_cast(default_stream))); EXPECT_GE(allocation_unique->size(), alloc_size); EXPECT_EQ(allocation_unique->ptr(), address); allocation_unique.reset(); @@ -303,36 +158,6 @@ TEST(StreamSafeCUDAAllocInterfaceTest, GetStreamInterfaceTest) { CheckMemLeak(place); } -#ifdef PADDLE_WITH_CUDA -TEST(StreamSafeCUDAAllocInterfaceTest, CUDAGraphExceptionTest) { - platform::CUDAPlace place = platform::CUDAPlace(); - size_t alloc_size = 1; - std::shared_ptr allocation = AllocShared(place, alloc_size); - - platform::BeginCUDAGraphCapture(place, cudaStreamCaptureModeGlobal); - EXPECT_THROW(AllocShared(place, alloc_size), paddle::platform::EnforceNotMet); - EXPECT_THROW(Alloc(place, alloc_size), paddle::platform::EnforceNotMet); - EXPECT_THROW(Release(place), paddle::platform::EnforceNotMet); - EXPECT_THROW(allocation::AllocatorFacade::Instance().GetAllocator(place), - paddle::platform::EnforceNotMet); - EXPECT_THROW( - AllocShared(place, alloc_size, - phi::Stream(reinterpret_cast(nullptr))), - paddle::platform::EnforceNotMet); - EXPECT_THROW(Alloc(place, alloc_size, nullptr), - paddle::platform::EnforceNotMet); - EXPECT_THROW(Release(place, nullptr), paddle::platform::EnforceNotMet); - EXPECT_THROW(RecordStream(allocation, nullptr), - paddle::platform::EnforceNotMet); - EXPECT_THROW(GetStream(allocation), paddle::platform::EnforceNotMet); - platform::EndCUDAGraphCapture(); - - allocation.reset(); - Release(place); - CheckMemLeak(place); -} -#endif - TEST(StreamSafeCUDAAllocRetryTest, RetryTest) { platform::CUDAPlace place = platform::CUDAPlace(); gpuStream_t stream1, stream2; @@ -348,12 +173,14 @@ TEST(StreamSafeCUDAAllocRetryTest, RetryTest) { // so the second 
alloc will fail and retry size_t alloc_size = available_size / 4 * 3; - allocation::AllocationPtr allocation1 = Alloc(place, alloc_size, stream1); + allocation::AllocationPtr allocation1 = Alloc( + place, alloc_size, phi::Stream(reinterpret_cast(stream1))); allocation::AllocationPtr allocation2; std::thread th([&allocation2, &place, &stream2, alloc_size]() { std::this_thread::sleep_for(std::chrono::seconds(1)); - allocation2 = Alloc(place, alloc_size, stream2); + allocation2 = Alloc(place, alloc_size, + phi::Stream(reinterpret_cast(stream2))); }); allocation1.reset(); // free but not release th.join(); @@ -371,5 +198,201 @@ TEST(StreamSafeCUDAAllocRetryTest, RetryTest) { CheckMemLeak(place); } +class StreamSafeCUDAAllocTest : public ::testing::Test { + protected: + void SetUp() override { + place_ = platform::CUDAPlace(); + stream_num_ = 64; + grid_num_ = 1; + block_num_ = 32; + data_num_ = 131072; + workspace_size_ = data_num_ * sizeof(int); + + for (size_t i = 0; i < stream_num_; ++i) { + gpuStream_t stream; +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&stream)); +#else + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&stream)); +#endif + + std::shared_ptr workspace_allocation = + AllocShared(place_, workspace_size_, + phi::Stream(reinterpret_cast(stream))); + std::shared_ptr result_allocation = + AllocShared(place_, workspace_size_, + phi::Stream(reinterpret_cast(stream))); + std::shared_ptr host_result_allocation = + AllocShared(platform::CPUPlace(), workspace_size_); + +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemset(workspace_allocation->ptr(), 0, + workspace_allocation->size())); + PADDLE_ENFORCE_GPU_SUCCESS( + cudaMemset(result_allocation->ptr(), 0, result_allocation->size())); +#else + PADDLE_ENFORCE_GPU_SUCCESS(hipMemset(workspace_allocation->ptr(), 0, + workspace_allocation->size())); + PADDLE_ENFORCE_GPU_SUCCESS( + hipMemset(result_allocation->ptr(), 0, result_allocation->size())); +#endif + + streams_.emplace_back(stream); + workspaces_.emplace_back(workspace_allocation); + results_.emplace_back(result_allocation); + host_results_.emplace_back(host_result_allocation); + } + } + + void SingleStreamRun(size_t idx) { + int *y = reinterpret_cast(results_[idx]->ptr()); + int neighbouring_idx = idx > 0 ? 
idx - 1 : idx; + + add_kernel<<>>( + reinterpret_cast(workspaces_[idx]->ptr()), y, data_num_); + add_kernel<<>>( + reinterpret_cast(workspaces_[neighbouring_idx]->ptr()), y, + data_num_); + RecordStream(workspaces_[neighbouring_idx], streams_[idx]); + } + + void MultiStreamRun() { + // Must run in reverse order, or the workspace_[i - 1] will be released + // before streams_[i]'s kernel launch + for (int i = stream_num_ - 1; i >= 0; --i) { + SingleStreamRun(i); + workspaces_[i].reset(); // fast GC + } + } + + void MultiThreadMultiStreamRun() { + std::vector threads; + for (size_t i = 0; i < stream_num_; ++i) { + threads.push_back( + std::thread(&StreamSafeCUDAAllocTest::SingleStreamRun, this, i)); + } + for (size_t i = 0; i < stream_num_; ++i) { + threads[i].join(); + } + workspaces_.clear(); + } + + void CUDAGraphRun() { + testing_cuda_graph_ = true; + platform::BeginCUDAGraphCapture(platform::CUDAPlace(), + cudaStreamCaptureModeGlobal); + + std::shared_ptr data_allocation = + AllocShared(platform::CUDAPlace(), workspace_size_); + std::shared_ptr result_allocation = + AllocShared(platform::CUDAPlace(), workspace_size_); + + int *data = static_cast(data_allocation->ptr()); + int *result = static_cast(result_allocation->ptr()); + + gpuStream_t main_stream = GetStream(data_allocation); + gpuStream_t other_stream; + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&other_stream)); + + add_kernel<<>>(data, result, + data_num_); + RecordStream(data_allocation, other_stream); + + std::unique_ptr cuda_graph = + platform::EndCUDAGraphCapture(); + + int replay_times = 10; + for (int i = 0; i < replay_times; ++i) { + cuda_graph->Replay(); + } + + std::shared_ptr host_result_allocation = + AllocShared(platform::CPUPlace(), workspace_size_); + Copy(host_result_allocation->place(), host_result_allocation->ptr(), + result_allocation->place(), result_allocation->ptr(), workspace_size_, + main_stream); + cudaStreamSynchronize(main_stream); + + int *host_result = static_cast(host_result_allocation->ptr()); + for (int i = 0; i < data_num_; ++i) { + EXPECT_EQ(host_result[i], replay_times); + } + + data_allocation.reset(); + result_allocation.reset(); + cuda_graph.release(); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(other_stream)); + } + + void CheckResult() { + for (size_t i = 0; i < stream_num_; ++i) { + Copy(host_results_[i]->place(), host_results_[i]->ptr(), + results_[i]->place(), results_[i]->ptr(), workspace_size_, + streams_[i]); + } + cudaDeviceSynchronize(); + + size_t thread_num = grid_num_ * block_num_; + for (size_t i = 0; i < stream_num_; ++i) { + int *result = static_cast(host_results_[i]->ptr()); + for (size_t j = 0; j < data_num_; ++j) { + EXPECT_EQ(result[j], 2); + } + } + } + + void TearDown() override { + workspaces_.clear(); + results_.clear(); + host_results_.clear(); + for (gpuStream_t stream : streams_) { + Release(place_, stream); + } + + for (size_t i = 0; i < stream_num_; ++i) { +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(streams_[i])); +#else + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(streams_[i])); +#endif + } + + // Memory release for CUDA Graph memory pool is forbidden + if (!testing_cuda_graph_) { + CheckMemLeak(place_); + } + } + + bool testing_cuda_graph_{0}; + size_t stream_num_; + size_t grid_num_; + size_t block_num_; + size_t data_num_; + size_t workspace_size_; + platform::CUDAPlace place_; + std::vector streams_; + std::vector> workspaces_; + std::vector> results_; + std::vector> host_results_; +}; + +TEST_F(StreamSafeCUDAAllocTest, 
CUDAMutilStreamTest) { + MultiStreamRun(); + CheckResult(); +} + +TEST_F(StreamSafeCUDAAllocTest, CUDAMutilThreadMutilStreamTest) { + MultiThreadMultiStreamRun(); + CheckResult(); +} + +#ifdef PADDLE_WITH_CUDA +TEST_F(StreamSafeCUDAAllocTest, CUDAGraphTest) { + MultiStreamRun(); + CUDAGraphRun(); + CheckResult(); +} +#endif + } // namespace memory } // namespace paddle -- GitLab
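
For context beyond the diff itself, below is a minimal usage sketch of the stream-aware
allocation interface this patch converges on (phi::Stream in the public Alloc/AllocShared
signatures instead of a raw gpuStream_t). It follows only the signatures added in
paddle/fluid/memory/malloc.h above; everything else (the demo function name, stream setup,
sizes) is illustrative, assumes a CUDA build, and is not code from the patch.

  #include <cuda_runtime.h>

  #include "paddle/fluid/memory/malloc.h"
  #include "paddle/fluid/platform/place.h"
  #include "paddle/phi/core/stream.h"

  void StreamAwareAllocDemo() {
    paddle::platform::CUDAPlace place(0);

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // Allocate 1 MiB bound to `stream`: the StreamSafeCUDAAllocator will not
    // reuse this memory until the work already queued on `stream` has finished.
    auto buf = paddle::memory::AllocShared(
        place, 1 << 20, phi::Stream(reinterpret_cast<phi::StreamId>(stream)));

    // ... launch kernels on `stream` that read/write buf->ptr() ...

    // If a second stream also touches the buffer, record it so the allocation
    // outlives that stream's pending work as well.
    cudaStream_t other_stream;
    cudaStreamCreate(&other_stream);
    paddle::memory::RecordStream(buf, other_stream);

    buf.reset();                             // free; deferred while recorded events are pending
    paddle::memory::Release(place, stream);  // return idle chunks from this stream's pool

    cudaStreamDestroy(other_stream);
    cudaStreamDestroy(stream);
  }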