Unverified commit d8bff988, authored by From00, committed by GitHub

Performance optimization for StreamSafeCudaAllocator (#40718)

* Performance optimization

* Optimize GetAllocator, RWLock and ProcessUnfreedAllocation

* Remove test file

* Fix CI error

* Fix CI errors

* Fix CI errors
Parent c15e3823
......@@ -34,6 +34,7 @@
#include "paddle/fluid/memory/allocation/thread_local_allocator.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h"
......@@ -210,13 +211,28 @@ class AllocatorFacadePrivate {
InitNaiveBestFitCPUAllocator();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
allow_free_idle_chunk_ = allow_free_idle_chunk;
if (!FLAGS_use_stream_safe_cuda_allocator) {
for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount();
++dev_id) {
InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id),
allow_free_idle_chunk_);
}
for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) {
InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id),
allow_free_idle_chunk_);
}
// Note(Ruibiao): For the GPU multi-stream case, the 'allocators_' map(place
// -> Allocator) holds the StreamSafeCUDAAllocator related to the default
// stream (i.e., the stream directly obtained from the DeviceContext), while
// the 'cuda_allocators_' map(place -> map(stream -> Allocator)) holds the
// StreamSafeCUDAAllocators related to non-default streams (i.e., the
// streams users pass in). The default-stream Allocator is built inside
// AllocatorFacadePrivate, while the non-default-stream ones are built
// lazily in the GetAllocator function with 'create_if_not_found = true'.
// We treat the default stream specially for performance reasons: since
// most Alloc calls in an application are for the default stream, handling
// it separately avoids most of the overhead of acquiring the default
// stream and taking the read-write lock.
if (FLAGS_use_stream_safe_cuda_allocator) {
WrapStreamSafeCUDAAllocatorForDefault();
}
InitNaiveBestFitCUDAPinnedAllocator();
#endif
#ifdef PADDLE_WITH_ASCEND_CL
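
As an aside for readers, the note above describes a two-level lookup: a per-place map that always holds the default-stream allocator, plus a lazily filled place → stream → allocator map guarded by a read-write lock. Below is a minimal standalone sketch of that shape; `Place`, `Stream`, and `Allocator` are hypothetical stand-ins, not Paddle's real declarations.

```cpp
#include <map>
#include <memory>
#include <shared_mutex>

// Hypothetical stand-ins for Paddle's types; illustrative only.
struct Place {
  int dev_id;
  bool operator<(const Place& other) const { return dev_id < other.dev_id; }
};
using Stream = void*;  // stands in for gpuStream_t
struct Allocator {};

class TwoLevelAllocatorMap {
 public:
  // Fast path for the default stream: plain per-place lookup, no lock.
  std::shared_ptr<Allocator> GetDefault(const Place& place) const {
    return default_allocators_.at(place);
  }

  // Slow path for user streams: reader lock first, writer lock to create.
  std::shared_ptr<Allocator> Get(const Place& place, Stream stream) {
    {
      std::shared_lock<std::shared_timed_mutex> guard(mutex_);
      auto it = stream_allocators_.find(place);
      if (it != stream_allocators_.end()) {
        auto jt = it->second.find(stream);
        if (jt != it->second.end()) return jt->second;
      }
    }
    std::unique_lock<std::shared_timed_mutex> guard(mutex_);
    auto& allocator = stream_allocators_[place][stream];
    if (!allocator) allocator = std::make_shared<Allocator>();  // delayed build
    return allocator;
  }

 private:
  std::map<Place, std::shared_ptr<Allocator>> default_allocators_;
  std::map<Place, std::map<Stream, std::shared_ptr<Allocator>>>
      stream_allocators_;
  mutable std::shared_timed_mutex mutex_;
};
```

The fast path never touches the mutex, which is the point of the special-casing: the common default-stream Alloc skips both the stream lookup and the shared lock.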
......@@ -301,7 +317,8 @@ class AllocatorFacadePrivate {
CheckAllocThreadSafe();
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
if (FLAGS_use_stream_safe_cuda_allocator == false &&
UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
WrapCUDAGraphAllocator();
}
#endif
......@@ -341,7 +358,12 @@ class AllocatorFacadePrivate {
const std::shared_ptr<Allocator>& GetAllocator(
const platform::CUDAPlace& place, const gpuStream_t& stream,
bool create_if_not_found = false) {
{ // shared_lock_guard
if (stream == GetDefaultStream(place)) {
VLOG(7) << "Get Allocator by passing in a default stream";
return GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
}
/* shared_lock_guard */ {
std::shared_lock<std::shared_timed_mutex> lock_guard(
cuda_allocator_mutex_);
if (LIKELY(HasCUDAAllocator(place, stream))) {
......@@ -355,7 +377,7 @@ class AllocatorFacadePrivate {
}
}
{ // unique_lock_guard
/* unique_lock_guard */ {
std::unique_lock<std::shared_timed_mutex> lock_guard(
cuda_allocator_mutex_);
InitStreamSafeCUDAAllocator(place, stream);
......@@ -363,9 +385,40 @@ class AllocatorFacadePrivate {
}
}
gpuStream_t GetDefaultStream(const platform::CUDAPlace& place) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
return static_cast<platform::CUDADeviceContext*>(pool.Get(place))->stream();
const std::shared_ptr<StreamSafeCUDAAllocator>
GetDefaultStreamSafeCUDAAllocator(const platform::CUDAPlace& place) const {
const auto iter = default_stream_safe_cuda_allocators_.find(place);
PADDLE_ENFORCE_NE(
iter, default_stream_safe_cuda_allocators_.end(),
platform::errors::NotFound(
"No StreamSafeCUDAAllocator found for the place, %s", place));
return iter->second;
}
const gpuStream_t& GetDefaultStream(const platform::CUDAPlace& place) const {
const std::shared_ptr<StreamSafeCUDAAllocator>& allocator =
GetDefaultStreamSafeCUDAAllocator(place);
return allocator->GetDefaultStream();
}
void SetDefaultStream(const platform::CUDAPlace& place,
const gpuStream_t& stream) {
const std::shared_ptr<StreamSafeCUDAAllocator>& allocator =
GetDefaultStreamSafeCUDAAllocator(place);
allocator->SetDefaultStream(stream);
VLOG(8) << "Set default stream to " << stream
<< " for StreamSafeCUDAAllocator(" << allocator.get() << ") in "
<< place;
}
void SetDefaultStreamFromDeviceContext() {
VLOG(8) << "Set default stream from DeviceContex";
for (auto& pair : default_stream_safe_cuda_allocators_) {
platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance();
pair.second->SetDefaultStream(
static_cast<phi::GPUContext*>(pool.Get(pair.first))->stream());
}
}
void RecordStream(std::shared_ptr<phi::Allocation> allocation,
......@@ -635,6 +688,26 @@ class AllocatorFacadePrivate {
/* in_cuda_graph_capturing = */ !allow_free_idle_chunk_);
}
void WrapStreamSafeCUDAAllocatorForDefault() {
for (auto& pair : allocators_) {
auto& place = pair.first;
if (platform::is_gpu_place(place)) {
std::shared_ptr<StreamSafeCUDAAllocator>&& allocator =
std::make_shared<StreamSafeCUDAAllocator>(
pair.second, place, /* default_stream = */ nullptr,
/* in_cuda_graph_capturing = */ !allow_free_idle_chunk_);
pair.second = allocator;
// NOTE(Ruibiao): A tricky implementation that gives
// StreamSafeCUDAAllocator the ability to interact with the outside
// world, i.e., to have its default stream changed from outside
default_stream_safe_cuda_allocators_[place] = allocator;
VLOG(8) << "WrapStreamSafeCUDAAllocator for " << place
<< ", allocator address = " << pair.second.get();
}
}
}
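
The function above uses a wrap-and-keep-handle pattern: each base allocator is decorated in place, and a second, typed map retains the concrete wrapper so its default stream can be changed later (the "tricky" interaction with the outside world the note mentions). A hedged sketch with illustrative types, not Paddle's real classes:

```cpp
#include <map>
#include <memory>

// Illustrative types only.
struct Allocator {
  virtual ~Allocator() = default;
};

struct StreamSafeWrapper : Allocator {
  explicit StreamSafeWrapper(std::shared_ptr<Allocator> base)
      : base_(std::move(base)) {}
  void SetDefaultStream(void* stream) { default_stream_ = stream; }
  std::shared_ptr<Allocator> base_;
  void* default_stream_ = nullptr;
};

void WrapAll(
    std::map<int, std::shared_ptr<Allocator>>& allocators,
    std::map<int, std::shared_ptr<StreamSafeWrapper>>& typed_handles) {
  for (auto& pair : allocators) {
    auto wrapper = std::make_shared<StreamSafeWrapper>(pair.second);
    pair.second = wrapper;  // the generic map now serves the wrapper
    // Keep a typed handle so SetDefaultStream stays reachable even though
    // the generic map only exposes the Allocator interface.
    typed_handles[pair.first] = wrapper;
  }
}
```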
void WrapCUDARetryAllocator(platform::CUDAPlace p, gpuStream_t stream,
size_t retry_time) {
PADDLE_ENFORCE_GT(
......@@ -813,7 +886,6 @@ class AllocatorFacadePrivate {
#endif
}
// NOTE(Ruibiao): Old single-stream version, will be removed later
void WrapCUDARetryAllocator(size_t retry_time) {
PADDLE_ENFORCE_GT(
retry_time, 0,
......@@ -828,6 +900,8 @@ class AllocatorFacadePrivate {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// a standalone CUDA allocator to support multi-stream GC in new executor
std::map<platform::Place, std::shared_ptr<StreamSafeCUDAAllocator>>
default_stream_safe_cuda_allocators_;
CUDAAllocatorMap cuda_allocators_;
std::shared_timed_mutex cuda_allocator_mutex_;
#endif
......@@ -870,15 +944,6 @@ AllocatorFacadePrivate* AllocatorFacade::GetPrivate() const {
const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
const platform::Place& place) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
FLAGS_use_system_allocator == false) {
AllocatorFacadePrivate* m = GetPrivate();
platform::CUDAPlace cuda_place(place.GetDeviceId());
return m->GetAllocator(cuda_place, m->GetDefaultStream(cuda_place));
}
#endif
return GetPrivate()->GetAllocator(
place, /* A non-zero num to choose allocator_ */ 1);
}
......@@ -898,19 +963,6 @@ void* AllocatorFacade::GetBasePtr(
return GetPrivate()->GetBasePtr(allocation);
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
const platform::Place& place, const gpuStream_t& stream) {
if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
FLAGS_use_system_allocator == false) {
return GetPrivate()->GetAllocator(place, stream,
/*create_if_not_found=*/true);
}
return GetPrivate()->GetAllocator(
place, /* A non-zero num to choose allocator_ */ 1);
}
#endif
const std::shared_ptr<Allocator>& AllocatorFacade::GetZeroAllocator(
const platform::Place& place) {
return GetPrivate()->GetAllocator(place, /* zero size */ 0);
......@@ -923,26 +975,10 @@ std::shared_ptr<phi::Allocation> AllocatorFacade::AllocShared(
AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
size_t size) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
size > 0 && FLAGS_use_system_allocator == false) {
platform::CUDAPlace cuda_place(place.GetDeviceId());
phi::Stream default_stream = phi::Stream(reinterpret_cast<phi::StreamId>(
GetPrivate()->GetDefaultStream(cuda_place)));
return Alloc(cuda_place, size, default_stream);
}
#endif
return GetPrivate()->GetAllocator(place, size)->Allocate(size);
}
uint64_t AllocatorFacade::Release(const platform::Place& place) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
FLAGS_use_system_allocator == false) {
platform::CUDAPlace cuda_place(place.GetDeviceId());
return Release(cuda_place, GetPrivate()->GetDefaultStream(cuda_place));
}
#endif
return GetPrivate()
->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1)
->Release(place);
......@@ -1028,6 +1064,17 @@ void AllocatorFacade::RecordStream(std::shared_ptr<phi::Allocation> allocation,
GetPrivate()->RecordStream(allocation, stream);
}
const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
const platform::Place& place, const gpuStream_t& stream) {
if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
FLAGS_use_system_allocator == false) {
return GetPrivate()->GetAllocator(place, stream,
/*create_if_not_found=*/true);
}
return GetPrivate()->GetAllocator(
place, /* A non-zero num to choose allocator_ */ 1);
}
const gpuStream_t& AllocatorFacade::GetStream(
const std::shared_ptr<phi::Allocation>& allocation) const {
PADDLE_ENFORCE_EQ(
......@@ -1040,6 +1087,13 @@ const gpuStream_t& AllocatorFacade::GetStream(
return GetPrivate()->GetStream(allocation);
}
void AllocatorFacade::SetDefaultStream(const platform::CUDAPlace& place,
const gpuStream_t& stream) {
if (FLAGS_use_stream_safe_cuda_allocator) {
GetPrivate()->SetDefaultStream(place, stream);
}
}
#ifdef PADDLE_WITH_CUDA
void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) {
PADDLE_ENFORCE_EQ(GetAllocatorStrategy(), AllocatorStrategy::kAutoGrowth,
......@@ -1055,6 +1109,8 @@ void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) {
"The memory pool of the CUDA Graph with ID %d have been prepared.",
id));
allocator.reset(new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false));
allocator->SetDefaultStreamFromDeviceContext();
VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id;
}
......
......@@ -55,11 +55,6 @@ class AllocatorFacade {
void* GetBasePtr(const std::shared_ptr<Allocation>& allocation);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
const std::shared_ptr<Allocator>& GetAllocator(const platform::Place& place,
const gpuStream_t& stream);
#endif
const std::shared_ptr<Allocator>& GetZeroAllocator(
const platform::Place& place);
......@@ -86,8 +81,12 @@ class AllocatorFacade {
uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream);
void RecordStream(std::shared_ptr<Allocation> allocation,
const gpuStream_t& stream);
const std::shared_ptr<Allocator>& GetAllocator(const platform::Place& place,
const gpuStream_t& stream);
const gpuStream_t& GetStream(
const std::shared_ptr<Allocation>& allocation) const;
void SetDefaultStream(const platform::CUDAPlace& place,
const gpuStream_t& stream);
#endif
#ifdef PADDLE_WITH_CUDA
......
......@@ -154,6 +154,14 @@ StreamSafeCUDAAllocator::~StreamSafeCUDAAllocator() {
bool StreamSafeCUDAAllocator::IsAllocThreadSafe() const { return true; }
const gpuStream_t& StreamSafeCUDAAllocator::GetDefaultStream() const {
return default_stream_;
}
void StreamSafeCUDAAllocator::SetDefaultStream(const gpuStream_t& stream) {
default_stream_ = stream;
}
phi::Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) {
platform::RecordEvent("StreamSafeCUDAAllocator::Allocate",
platform::TracerEventType::UserDefined, 9 /*level*/);
......@@ -187,12 +195,8 @@ void StreamSafeCUDAAllocator::FreeImpl(phi::Allocation* allocation) {
platform::RecordEvent("StreamSafeCUDAAllocator::Free",
platform::TracerEventType::UserDefined, 9 /*level*/);
StreamSafeCUDAAllocation* stream_safe_cuda_allocation =
dynamic_cast<StreamSafeCUDAAllocation*>(allocation);
PADDLE_ENFORCE_NOT_NULL(stream_safe_cuda_allocation,
platform::errors::InvalidArgument(
"Failed to dynamic cast %p from Allocation* to "
"StreamSafeCUDAAllocation*",
allocation));
static_cast<StreamSafeCUDAAllocation*>(allocation);
VLOG(8) << "Try free allocation " << stream_safe_cuda_allocation->ptr();
if (stream_safe_cuda_allocation->CanBeFreed()) {
VLOG(9) << "Directly delete allocation";
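
Note the cast change in this hunk: the checked dynamic_cast on the free path is replaced by an unchecked static_cast, trading an RTTI type check for speed on the strength of an invariant (this allocator only ever frees allocations it created). A minimal illustration of that trade-off, with hypothetical types:

```cpp
// Illustrative types only, not Paddle's real Allocation classes.
struct Allocation {
  virtual ~Allocation() = default;
};
struct StreamSafeAllocation : Allocation {};

void FreeImpl(Allocation* allocation) {
  // dynamic_cast would verify the dynamic type at runtime and return
  // nullptr on mismatch; static_cast assumes the type. The assumption is
  // safe only because FreeImpl is handed back exactly the objects this
  // allocator produced, so the runtime check is pure hot-path overhead.
  auto* typed = static_cast<StreamSafeAllocation*>(allocation);
  (void)typed;  // free bookkeeping would use the typed pointer here
}
```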
......@@ -221,6 +225,12 @@ uint64_t StreamSafeCUDAAllocator::ReleaseImpl(const platform::Place& place) {
}
void StreamSafeCUDAAllocator::ProcessUnfreedAllocations() {
// NOTE(Ruibiao): This check is to reduce lock contention. It does not need
// to be thread-safe, since occasional misjudgments here are permissible.
if (unfreed_allocations_.empty()) {
return;
}
std::lock_guard<SpinLock> lock_guard(unfreed_allocation_lock_);
for (auto it = unfreed_allocations_.begin();
it != unfreed_allocations_.end();) {
......
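
The unlocked empty() test added to ProcessUnfreedAllocations is a common contention-reduction pattern: read a cheap predicate without the lock, accept that the race may occasionally misjudge, and re-examine under the lock before doing real work. A standalone sketch under those assumptions (std::mutex stands in for Paddle's SpinLock, int for an allocation record):

```cpp
#include <iterator>
#include <list>
#include <mutex>

class UnfreedList {
 public:
  void Process() {
    // Racy fast path: no lock taken. A stale read may skip one round or
    // enter needlessly; both are harmless because the list is re-examined
    // under the lock below.
    if (unfreed_.empty()) {
      return;
    }
    std::lock_guard<std::mutex> guard(lock_);
    for (auto it = unfreed_.begin(); it != unfreed_.end();) {
      it = CanBeFreed(*it) ? unfreed_.erase(it) : std::next(it);
    }
  }

 private:
  static bool CanBeFreed(int /*record*/) { return true; }  // placeholder
  std::list<int> unfreed_;
  std::mutex lock_;
};
```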
......@@ -64,7 +64,10 @@ class StreamSafeCUDAAllocator
platform::CUDAPlace place, gpuStream_t default_stream,
bool in_cuda_graph_capturing = false);
~StreamSafeCUDAAllocator();
bool IsAllocThreadSafe() const override;
const gpuStream_t &GetDefaultStream() const;
void SetDefaultStream(const gpuStream_t &stream);
protected:
phi::Allocation *AllocateImpl(size_t size) override;
......
......@@ -159,10 +159,8 @@ inline void EmplaceDeviceContext(
cuda_ctx,
platform::errors::InvalidArgument(
"Failed to dynamic_cast dev_ctx into CUDADeviceContext."));
// Note: A trick method to init context, why GetAllocator interface
// needs a stream parameter?
dev_ctx->SetAllocator(memory::allocation::AllocatorFacade::Instance()
.GetAllocator(p, cuda_ctx->stream())
.GetAllocator(p)
.get());
cuda_ctx->PartialInitWithAllocator();
dev_ctx->SetGenerator(
......@@ -517,10 +515,10 @@ CUDAContext::~CUDAContext() {
CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : phi::GPUContext(place) {
phi::GPUContext::PartialInitWithoutAllocator();
cuda_stream_.reset(new stream::CUDAStream(phi::GPUContext::stream(), place));
workspace_.reset(new phi::DnnWorkspaceHandle(
memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, phi::GPUContext::stream())
.get()));
auto& instance = memory::allocation::AllocatorFacade::Instance();
instance.SetDefaultStream(place, phi::GPUContext::stream());
workspace_.reset(
new phi::DnnWorkspaceHandle(instance.GetAllocator(place).get()));
}
CUDADeviceContext::~CUDADeviceContext() = default;
......@@ -618,7 +616,7 @@ phi::DnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const {
// return workspace_.get();
return phi::DnnWorkspaceHandle(
memory::allocation::AllocatorFacade::Instance()
.GetAllocator(GetPlace(), phi::GPUContext::stream())
.GetAllocator(GetPlace())
.get());
}
return phi::GPUContext::cudnn_workspace_handle();
......
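
To tie the last two hunks together, here is a hedged sketch of the new initialization order: the context-owned stream is published to the facade once via SetDefaultStream, after which plain place-keyed GetAllocator calls serve that stream. The call names come from this commit; the wrapper function, include set, and surrounding setup are illustrative assumptions.

```cpp
// Illustrative wiring only; assumes this sits inside namespace paddle,
// like the files in this diff. SetDefaultStream/GetAllocator are the
// interfaces added by this commit; InitContextAllocator is hypothetical.
#include "paddle/fluid/memory/allocation/allocator_facade.h"

namespace paddle {

void InitContextAllocator(const platform::CUDAPlace& place,
                          const gpuStream_t& ctx_stream) {
  auto& instance = memory::allocation::AllocatorFacade::Instance();
  // Publish the context's stream: the place-keyed fast path now serves it.
  instance.SetDefaultStream(place, ctx_stream);
  // No stream argument needed anymore; this hits the default-stream path.
  const auto& allocator = instance.GetAllocator(place);
  (void)allocator;  // e.g., hand it to DnnWorkspaceHandle / DeviceContext
}

}  // namespace paddle
```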