From 0ad2e192649d909a348195859aaa2e3135e47ae6 Mon Sep 17 00:00:00 2001 From: From00 Date: Sun, 27 Mar 2022 10:03:19 +0800 Subject: [PATCH] Make StreamSafeCUDAAllocator compatible with NaiveBestFit strategy (#40886) * Make StreamSafeCUDAAllocator compatible with NaiveBestFit strategy * Set FLAGS_use_stream_safe_cuda_allocator to false * Update * Remove unnecessary code * Fix CI errors * Add UT --- .../framework/new_executor/interpretercore.cc | 5 +- .../memory/allocation/allocator_facade.cc | 212 ++++++++---------- .../memory/allocation/allocator_facade.h | 4 +- paddle/fluid/memory/malloc.cc | 2 +- paddle/fluid/memory/malloc.h | 2 +- paddle/fluid/memory/malloc_test.cu | 24 +- .../memory/stream_safe_cuda_alloc_test.cu | 13 ++ 7 files changed, 130 insertions(+), 132 deletions(-) diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 6e73aaef15e..b36ff519ce1 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -32,7 +32,6 @@ PADDLE_DEFINE_EXPORTED_bool(new_executor_use_local_scope, true, DECLARE_bool(check_nan_inf); DECLARE_bool(benchmark); DECLARE_bool(fast_eager_deletion_mode); -DECLARE_bool(use_stream_safe_cuda_allocator); constexpr const char* kExceptionCaught = "ExceptionCaught"; constexpr const char* kTaskCompletion = "TaskCompletion"; @@ -44,7 +43,9 @@ static constexpr size_t kHostNumThreads = 4; static constexpr size_t kDeviceNumThreads = 1; bool IsInterpretercoreFastGCEnabled() { - return FLAGS_fast_eager_deletion_mode && FLAGS_use_stream_safe_cuda_allocator; + return memory::allocation::AllocatorFacade::Instance() + .IsStreamSafeCUDAAllocatorUsed() && + FLAGS_fast_eager_deletion_mode; } InterpreterCore::InterpreterCore(const platform::Place& place, diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index abf72564753..88bbe339f87 100644 --- 
a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -145,6 +145,14 @@ class CUDAGraphAllocator }; #endif +static bool IsCUDAGraphCapturing() { +#ifdef PADDLE_WITH_CUDA + return UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing()); +#else + return false; +#endif +} + class AllocatorFacadePrivate { public: using AllocatorMap = std::map>; @@ -157,6 +165,8 @@ class AllocatorFacadePrivate { explicit AllocatorFacadePrivate(bool allow_free_idle_chunk = true) { strategy_ = GetAllocatorStrategy(); + is_stream_safe_cuda_allocator_used_ = false; + switch (strategy_) { case AllocatorStrategy::kNaiveBestFit: { InitNaiveBestFitCPUAllocator(); @@ -166,12 +176,6 @@ class AllocatorFacadePrivate { } #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - PADDLE_ENFORCE_EQ( - FLAGS_use_stream_safe_cuda_allocator, false, - paddle::platform::errors::Unimplemented( - "StreamSafeCUDAAllocator is only implemented for auto_growth " - "strategy, not support naive_best_fit strategy")); - for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { InitNaiveBestFitCUDAAllocator(platform::CUDAPlace(dev_id)); } @@ -216,21 +220,24 @@ class AllocatorFacadePrivate { allow_free_idle_chunk_); } - // Note(Ruibiao): For GPU multi-stream case, the 'allocators_' map(place - // -> Allocator) hold the StreamSafeCUDAAllocator releate to default - // stream (i.e., the stream directly got from DeviceContex), while the - // 'cuda_allocators_' map(place -> map(stream -> Allocator)) hold the - // StreamSafeCUDAAllocator releate to non-default stream (i.e., the - // stream users pass in). The default stream Allocator is built in the - // structure of AllocatorFacadePrivate, while the non-default stream is - // build in a delayed manner in GetAllocator function with - // 'create_if_not_found = ture'. We make special treatment for the - // default stream for performance reasons. 
Since most Alloc calls are - // for default stream in application, treating it separately can avoid - // lots of overhead of acquiring default stream and applying read-write - // lock. + // Note(Ruibiao): For GPU multi-stream case without CUDA graph + // capturing, the 'allocators_' map(place -> Allocator) holds the + // StreamSafeCUDAAllocator related to default stream (i.e., the stream + // directly got from DeviceContext), while the 'cuda_allocators_' map + // (place -> map(stream -> Allocator)) holds the StreamSafeCUDAAllocator + // related to non-default stream (i.e., the stream users pass in). The + // default stream Allocator is built in the constructor of + // AllocatorFacadePrivate, while the non-default stream is built in a + // delayed manner in GetAllocator with 'create_if_not_found = true'. + // We make special treatment for the default stream for performance + // reasons. Since most Alloc calls are for default stream in + // application, treating it separately can avoid lots of overhead of + // acquiring default stream and applying read-write lock.
if (FLAGS_use_stream_safe_cuda_allocator) { - WrapStreamSafeCUDAAllocatorForDefault(); + if (LIKELY(!IsCUDAGraphCapturing())) { + WrapStreamSafeCUDAAllocatorForDefault(); + } + is_stream_safe_cuda_allocator_used_ = true; } InitNaiveBestFitCUDAPinnedAllocator(); @@ -283,12 +290,6 @@ class AllocatorFacadePrivate { } #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - PADDLE_ENFORCE_EQ( - FLAGS_use_stream_safe_cuda_allocator, false, - paddle::platform::errors::Unimplemented( - "StreamSafeCUDAAllocator is only implemented for auto_growth " - "strategy, not support thread_local strategy")); - for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { InitThreadLocalCUDAAllocator(platform::CUDAPlace(dev_id)); } @@ -317,8 +318,9 @@ class AllocatorFacadePrivate { CheckAllocThreadSafe(); #ifdef PADDLE_WITH_CUDA - if (FLAGS_use_stream_safe_cuda_allocator == false && - UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) { + // No need to wrap CUDAGraphAllocator for StreamSafeCUDAAllocator + if (!is_stream_safe_cuda_allocator_used_ && + UNLIKELY(IsCUDAGraphCapturing())) { WrapCUDAGraphAllocator(); } #endif @@ -343,6 +345,11 @@ class AllocatorFacadePrivate { return static_cast(allocation.get())->base_ptr(); } + bool IsStreamSafeCUDAAllocatorUsed() { + return is_stream_safe_cuda_allocator_used_ && + LIKELY(FLAGS_use_system_allocator == false); + } + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) bool HasCUDAAllocator(const platform::CUDAPlace& place, const gpuStream_t& stream) { @@ -358,9 +365,11 @@ class AllocatorFacadePrivate { const std::shared_ptr& GetAllocator( const platform::CUDAPlace& place, const gpuStream_t& stream, bool create_if_not_found = false) { - if (stream == GetDefaultStream(place)) { - VLOG(7) << "Get Allocator by passing in a default stream"; - return GetAllocator(place, /* A non-zero num to choose allocator_ */ 1); + if (LIKELY(!IsCUDAGraphCapturing())) { + if (stream == GetDefaultStream(place)) { + VLOG(7) << 
"Get Allocator by passing in a default stream"; + return GetAllocator(place, /* A non-zero num to choose allocator_ */ 1); + } } /* shared_lock_guard */ { @@ -411,42 +420,30 @@ class AllocatorFacadePrivate { << place; } - void SetDefaultStreamFromDeviceContext() { - VLOG(8) << "Set default stream from DeviceContex"; - for (auto& pair : default_stream_safe_cuda_allocators_) { - platform::DeviceContextPool& pool = - platform::DeviceContextPool::Instance(); - pair.second->SetDefaultStream( - static_cast(pool.Get(pair.first))->stream()); - } - } - void RecordStream(std::shared_ptr allocation, const gpuStream_t& stream) { - if (allocation->size() == 0) { - return; + std::shared_ptr stream_safe_cuda_allocation = + std::dynamic_pointer_cast(allocation); + if (stream_safe_cuda_allocation != nullptr) { + stream_safe_cuda_allocation->RecordStream(stream); + } else { + VLOG(6) << "RecordStream for a non-StreamSafeCUDAAllocation"; } - - StreamSafeCUDAAllocation* stream_safe_cuda_allocation = - dynamic_cast(allocation.get()); - PADDLE_ENFORCE_NOT_NULL(stream_safe_cuda_allocation, - platform::errors::InvalidArgument( - "Failed to dynamic cast %p from Allocation* to " - "StreamSafeCUDAAllocation*", - allocation.get())); - stream_safe_cuda_allocation->RecordStream(stream); } - const gpuStream_t& GetStream( + const gpuStream_t GetStream( const std::shared_ptr& allocation) const { - const StreamSafeCUDAAllocation* stream_safe_cuda_allocation = - dynamic_cast(allocation.get()); - PADDLE_ENFORCE_NOT_NULL(stream_safe_cuda_allocation, - platform::errors::InvalidArgument( - "Failed to dynamic cast %p from Allocation* to " - "StreamSafeCUDAAllocation*", - allocation.get())); - return stream_safe_cuda_allocation->GetOwningStream(); + const std::shared_ptr + stream_safe_cuda_allocation = + std::dynamic_pointer_cast(allocation); + if (stream_safe_cuda_allocation != nullptr) { + return stream_safe_cuda_allocation->GetOwningStream(); + } + + VLOG(6) << "GetStream for a 
non-StreamSafeCUDAAllocation"; + return static_cast( + platform::DeviceContextPool::Instance().Get(allocation->place())) + ->stream(); } #endif @@ -880,7 +877,7 @@ class AllocatorFacadePrivate { CheckAllocThreadSafe(zero_size_allocators_); CheckAllocThreadSafe(system_allocators_); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (FLAGS_use_stream_safe_cuda_allocator) { + if (is_stream_safe_cuda_allocator_used_) { CheckCUDAAllocThreadSafe(cuda_allocators_); } #endif @@ -910,6 +907,7 @@ class AllocatorFacadePrivate { static AllocatorMap zero_size_allocators_; static AllocatorMap system_allocators_; bool allow_free_idle_chunk_; + bool is_stream_safe_cuda_allocator_used_; }; AllocatorFacadePrivate::AllocatorMap AllocatorFacadePrivate::zero_size_allocators_; @@ -928,7 +926,7 @@ AllocatorFacade& AllocatorFacade::Instance() { AllocatorFacadePrivate* AllocatorFacade::GetPrivate() const { #ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) { + if (UNLIKELY(IsCUDAGraphCapturing())) { auto id = platform::CUDAGraph::CapturingID(); auto iter = cuda_graph_map_.find(id); PADDLE_ENFORCE_NE( @@ -986,35 +984,25 @@ uint64_t AllocatorFacade::Release(const platform::Place& place) { std::shared_ptr AllocatorFacade::AllocShared( const platform::Place& place, size_t size, const phi::Stream& stream) { - PADDLE_ENFORCE_EQ( - FLAGS_use_stream_safe_cuda_allocator, true, - platform::errors::Unimplemented( - "StreamSafeCUDAAllocator is disabled, you should not call this " - "multi-stream 'AllocaShared' function. 
To enable it, you can enter" - "'export FLAGS_use_stream_safe_cuda_allocator=true' in the " - "terminal.")); return std::shared_ptr(Alloc(place, size, stream)); } AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size, const phi::Stream& stream) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - PADDLE_ENFORCE_EQ( - FLAGS_use_stream_safe_cuda_allocator, true, - platform::errors::Unimplemented( - "StreamSafeCUDAAllocator is disabled, you should not call this " - "multi-stream 'Alloc' function. To enable it, you can enter" - "'export FLAGS_use_stream_safe_cuda_allocator=true' in the " - "terminal.")); + AllocatorFacadePrivate* m = GetPrivate(); + if (!m->IsStreamSafeCUDAAllocatorUsed()) { + VLOG(6) << "Warning: StreamSafeCUDAAllocator is not used!"; + return Alloc(place, size); + } platform::CUDAPlace p(place.GetDeviceId()); if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) { gpuStream_t s = reinterpret_cast(stream.id()); - return GetPrivate() - ->GetAllocator(p, s, /* create_if_not_found = */ true) + return m->GetAllocator(p, s, /* create_if_not_found = */ true) ->Allocate(size); } else { - return GetPrivate()->GetAllocator(p, size)->Allocate(size); + return m->GetAllocator(p, size)->Allocate(size); } #else PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU.")); @@ -1025,13 +1013,6 @@ bool AllocatorFacade::InSameStream( const std::shared_ptr& allocation, const phi::Stream& stream) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - PADDLE_ENFORCE_EQ( - FLAGS_use_stream_safe_cuda_allocator, true, - platform::errors::Unimplemented( - "StreamSafeCUDAAllocator is disabled, you should not call this " - "multi-stream 'InSameStream' function. 
To enable it, you can enter" - "'export FLAGS_use_stream_safe_cuda_allocator=true' in the " - "terminal.")); gpuStream_t s = reinterpret_cast(stream.id()); return s == GetStream(allocation); #else @@ -1039,58 +1020,52 @@ bool AllocatorFacade::InSameStream( #endif } +bool AllocatorFacade::IsStreamSafeCUDAAllocatorUsed() { + return GetPrivate()->IsStreamSafeCUDAAllocatorUsed(); +} + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place, const gpuStream_t& stream) { - PADDLE_ENFORCE_EQ( - FLAGS_use_stream_safe_cuda_allocator, true, - platform::errors::Unimplemented( - "StreamSafeCUDAAllocator is disabled, you should not call this " - "multi-stream 'Release' function. To enable it, you can enter" - "'export FLAGS_use_stream_safe_cuda_allocator=true' in the " - "terminal.")); - return GetPrivate()->GetAllocator(place, stream)->Release(place); + AllocatorFacadePrivate* m = GetPrivate(); + if (!m->IsStreamSafeCUDAAllocatorUsed()) { + VLOG(6) << "Warning: StreamSafeCUDAAllocator is not used!"; + return Release(place); + } + + return m->GetAllocator(place, stream)->Release(place); } void AllocatorFacade::RecordStream(std::shared_ptr allocation, const gpuStream_t& stream) { - PADDLE_ENFORCE_EQ( - FLAGS_use_stream_safe_cuda_allocator, true, - platform::errors::Unimplemented( - "StreamSafeCUDAAllocator is disabled, you should not call this " - "'RecordStream' function. 
To enable it, you can enter" - "'export FLAGS_use_stream_safe_cuda_allocator=true' in the " - "terminal.")); GetPrivate()->RecordStream(allocation, stream); } const std::shared_ptr& AllocatorFacade::GetAllocator( const platform::Place& place, const gpuStream_t& stream) { - if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) && - FLAGS_use_system_allocator == false) { - return GetPrivate()->GetAllocator(place, stream, - /*create_if_not_found=*/true); + AllocatorFacadePrivate* m = GetPrivate(); + + if (!m->IsStreamSafeCUDAAllocatorUsed()) { + VLOG(6) << "Warning: StreamSafeCUDAAllocator is not used!"; + return GetAllocator(place); } - return GetPrivate()->GetAllocator( - place, /* A non-zero num to choose allocator_ */ 1); + + if (platform::is_gpu_place(place) && FLAGS_use_system_allocator == false) { + return m->GetAllocator(place, stream, + /*create_if_not_found=*/true); + } + return m->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1); } -const gpuStream_t& AllocatorFacade::GetStream( +const gpuStream_t AllocatorFacade::GetStream( const std::shared_ptr& allocation) const { - PADDLE_ENFORCE_EQ( - FLAGS_use_stream_safe_cuda_allocator, true, - platform::errors::Unimplemented( - "StreamSafeCUDAAllocator is disabled, you should not call this " - "'GetStream' function. 
To enable it, you can enter" - "'export FLAGS_use_stream_safe_cuda_allocator=true' in the " - "terminal.")); return GetPrivate()->GetStream(allocation); } void AllocatorFacade::SetDefaultStream(const platform::CUDAPlace& place, const gpuStream_t& stream) { - if (FLAGS_use_stream_safe_cuda_allocator) { - GetPrivate()->SetDefaultStream(place, stream); + if (m_->IsStreamSafeCUDAAllocatorUsed()) { + m_->SetDefaultStream(place, stream); } } @@ -1109,7 +1084,6 @@ void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) { "The memory pool of the CUDA Graph with ID %d have been prepared.", id)); allocator.reset(new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false)); - allocator->SetDefaultStreamFromDeviceContext(); VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id; } diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index 1ea872f7eca..d5c1e7c908c 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -76,6 +76,8 @@ class AllocatorFacade { bool InSameStream(const std::shared_ptr& allocation, const phi::Stream& stream); + bool IsStreamSafeCUDAAllocatorUsed(); + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // TODO(zhiqiu): change gpuStream_t to phi::Stream if needed. 
uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream); @@ -83,7 +85,7 @@ class AllocatorFacade { const gpuStream_t& stream); const std::shared_ptr& GetAllocator(const platform::Place& place, const gpuStream_t& stream); - const gpuStream_t& GetStream( + const gpuStream_t GetStream( const std::shared_ptr& allocation) const; void SetDefaultStream(const platform::CUDAPlace& place, const gpuStream_t& stream); diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc index 2bca2c388a0..f3de317dd1d 100644 --- a/paddle/fluid/memory/malloc.cc +++ b/paddle/fluid/memory/malloc.cc @@ -67,7 +67,7 @@ void RecordStream(std::shared_ptr allocation, stream); } -const gpuStream_t& GetStream(const std::shared_ptr& allocation) { +const gpuStream_t GetStream(const std::shared_ptr& allocation) { return allocation::AllocatorFacade::Instance().GetStream(allocation); } diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h index 601fe3f2a42..e6d910579ba 100644 --- a/paddle/fluid/memory/malloc.h +++ b/paddle/fluid/memory/malloc.h @@ -56,7 +56,7 @@ extern uint64_t Release(const platform::CUDAPlace& place, void RecordStream(std::shared_ptr allocation, const gpuStream_t& stream); -const gpuStream_t& GetStream(const std::shared_ptr& allocation); +const gpuStream_t GetStream(const std::shared_ptr& allocation); #endif } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/malloc_test.cu b/paddle/fluid/memory/malloc_test.cu index 07577531d64..9837d3e4fab 100644 --- a/paddle/fluid/memory/malloc_test.cu +++ b/paddle/fluid/memory/malloc_test.cu @@ -12,6 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include // NOLINT +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" +#include "paddle/fluid/memory/malloc.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/phi/core/stream.h" + #ifdef PADDLE_WITH_CUDA #include #include @@ -21,14 +30,6 @@ #include #endif -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/memory/allocation/allocator_facade.h" -#include "paddle/fluid/memory/malloc.h" -#include "paddle/fluid/platform/device_context.h" - namespace paddle { namespace memory { @@ -196,5 +197,12 @@ TEST(Malloc, AllocZero) { AllocationPtr allocation_ptr = Alloc(place, 0); EXPECT_GE(allocation_ptr->size(), 0); } + +TEST(Malloc, AllocWithStream) { + size_t size = 1024; + AllocationPtr allocation = Alloc(platform::CUDAPlace(), size, phi::Stream(0)); + EXPECT_EQ(allocation->size(), 1024); +} + } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu index 5e4a4234bb4..3bf873bcfc2 100644 --- a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu +++ b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu @@ -101,6 +101,19 @@ TEST(StreamSafeCUDAAllocInterfaceTest, GetAllocatorInterfaceTest) { CheckMemLeak(place); } +TEST(StreamSafeCUDAAllocInterfaceTest, GetAllocatorWithDefaultStreamTest) { + auto &instance = allocation::AllocatorFacade::Instance(); + platform::CUDAPlace place = platform::CUDAPlace(); + const std::shared_ptr allocator_implicit_stream = + instance.GetAllocator(place); + const std::shared_ptr allocator_default_stream = + instance.GetAllocator( + place, static_cast( + platform::DeviceContextPool::Instance().Get(place)) + ->stream()); + EXPECT_EQ(allocator_implicit_stream.get(), allocator_default_stream.get()); +} + TEST(StreamSafeCUDAAllocInterfaceTest, ZeroSizeRecordStreamTest) { platform::CUDAPlace place = platform::CUDAPlace(); std::shared_ptr 
zero_size_allocation = AllocShared(place, 0); -- GitLab