Unverified commit d8bff988, authored by From00, committed by GitHub

Performance optimization for StreamSafeCudaAllocator (#40718)

* Performance optimization

* Optimize GetAllocator, RWLock and ProcessUnfreedAllocation

* Remove test file

* Fix CI error

* Fix CI errors

* Fix CI errors
Parent c15e3823
@@ -34,6 +34,7 @@
 #include "paddle/fluid/memory/allocation/thread_local_allocator.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h"
@@ -210,13 +211,28 @@ class AllocatorFacadePrivate {
     InitNaiveBestFitCPUAllocator();
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     allow_free_idle_chunk_ = allow_free_idle_chunk;
-    if (!FLAGS_use_stream_safe_cuda_allocator) {
-      for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount();
-           ++dev_id) {
-        InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id),
-                                    allow_free_idle_chunk_);
-      }
+    for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) {
+      InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id),
+                                  allow_free_idle_chunk_);
+    }
+
+    // Note(Ruibiao): For the GPU multi-stream case, the 'allocators_' map
+    // (place -> Allocator) holds the StreamSafeCUDAAllocator related to the
+    // default stream (i.e., the stream obtained directly from DeviceContext),
+    // while the 'cuda_allocators_' map (place -> map(stream -> Allocator))
+    // holds the StreamSafeCUDAAllocators related to non-default streams
+    // (i.e., the streams users pass in). The default-stream Allocator is
+    // built in the constructor of AllocatorFacadePrivate, while
+    // non-default-stream Allocators are built lazily in the GetAllocator
+    // function with 'create_if_not_found = true'. We treat the default
+    // stream specially for performance reasons: since most Alloc calls in an
+    // application target the default stream, handling it separately avoids
+    // much of the overhead of acquiring the default stream and taking the
+    // read-write lock.
+    if (FLAGS_use_stream_safe_cuda_allocator) {
+      WrapStreamSafeCUDAAllocatorForDefault();
     }
     InitNaiveBestFitCUDAPinnedAllocator();
 #endif
 #ifdef PADDLE_WITH_ASCEND_CL
@@ -301,7 +317,8 @@ class AllocatorFacadePrivate {
     CheckAllocThreadSafe();
 #ifdef PADDLE_WITH_CUDA
-    if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
+    if (FLAGS_use_stream_safe_cuda_allocator == false &&
+        UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
       WrapCUDAGraphAllocator();
     }
 #endif
@@ -341,7 +358,12 @@ class AllocatorFacadePrivate {
   const std::shared_ptr<Allocator>& GetAllocator(
       const platform::CUDAPlace& place, const gpuStream_t& stream,
       bool create_if_not_found = false) {
-    {  // shared_lock_guard
+    if (stream == GetDefaultStream(place)) {
+      VLOG(7) << "Get Allocator by passing in a default stream";
+      return GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
+    }
+
+    /* shared_lock_guard */ {
       std::shared_lock<std::shared_timed_mutex> lock_guard(
           cuda_allocator_mutex_);
       if (LIKELY(HasCUDAAllocator(place, stream))) {
@@ -355,7 +377,7 @@ class AllocatorFacadePrivate {
       }
     }
-    {  // unique_lock_guard
+    /* unique_lock_guard */ {
       std::unique_lock<std::shared_timed_mutex> lock_guard(
           cuda_allocator_mutex_);
       InitStreamSafeCUDAAllocator(place, stream);
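The shared/unique lock pair above is a read-mostly double-checked locking pattern: look the allocator up under a shared lock first, and take the exclusive lock only to create a missing entry. A self-contained sketch of the same pattern, assuming illustrative types rather than Paddle's:

#include <map>
#include <memory>
#include <mutex>
#include <shared_mutex>

struct Allocator {};
using Key = int;  // stands in for the (place, stream) key

std::map<Key, std::shared_ptr<Allocator>> allocators;
std::shared_timed_mutex allocator_mutex;

std::shared_ptr<Allocator> GetOrCreate(Key key) {
  {  // Fast path: many readers may hold the shared lock concurrently.
    std::shared_lock<std::shared_timed_mutex> lock(allocator_mutex);
    auto it = allocators.find(key);
    if (it != allocators.end()) return it->second;
  }
  // Slow path: take the exclusive lock and re-check before creating,
  // since another thread may have inserted the entry in between.
  std::unique_lock<std::shared_timed_mutex> lock(allocator_mutex);
  auto it = allocators.find(key);
  if (it == allocators.end()) {
    it = allocators.emplace(key, std::make_shared<Allocator>()).first;
  }
  return it->second;
}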
@@ -363,9 +385,40 @@ class AllocatorFacadePrivate {
     }
   }
-  gpuStream_t GetDefaultStream(const platform::CUDAPlace& place) {
-    platform::DeviceContextPool& pool =
-        platform::DeviceContextPool::Instance();
-    return static_cast<platform::CUDADeviceContext*>(pool.Get(place))
-        ->stream();
+  const std::shared_ptr<StreamSafeCUDAAllocator>
+  GetDefaultStreamSafeCUDAAllocator(const platform::CUDAPlace& place) const {
+    const auto iter = default_stream_safe_cuda_allocators_.find(place);
+    PADDLE_ENFORCE_NE(
+        iter, default_stream_safe_cuda_allocators_.end(),
+        platform::errors::NotFound(
+            "No StreamSafeCUDAAllocator found for the place, %s", place));
+    return iter->second;
+  }
+
+  const gpuStream_t& GetDefaultStream(const platform::CUDAPlace& place) const {
+    const std::shared_ptr<StreamSafeCUDAAllocator>& allocator =
+        GetDefaultStreamSafeCUDAAllocator(place);
+    return allocator->GetDefaultStream();
+  }
+
+  void SetDefaultStream(const platform::CUDAPlace& place,
+                        const gpuStream_t& stream) {
+    const std::shared_ptr<StreamSafeCUDAAllocator>& allocator =
+        GetDefaultStreamSafeCUDAAllocator(place);
+    allocator->SetDefaultStream(stream);
+    VLOG(8) << "Set default stream to " << stream
+            << " for StreamSafeCUDAAllocator(" << allocator.get() << ") in "
+            << place;
+  }
+
+  void SetDefaultStreamFromDeviceContext() {
+    VLOG(8) << "Set default stream from DeviceContext";
+    for (auto& pair : default_stream_safe_cuda_allocators_) {
+      platform::DeviceContextPool& pool =
+          platform::DeviceContextPool::Instance();
+      pair.second->SetDefaultStream(
+          static_cast<phi::GPUContext*>(pool.Get(pair.first))->stream());
+    }
   }
   void RecordStream(std::shared_ptr<phi::Allocation> allocation,
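The replaced GetDefaultStream paid for a DeviceContextPool lookup plus a cast on every call; after this change it reads a stream cached in the allocator itself. A minimal model of that caching, with illustrative names:

using Stream = void*;  // stands in for gpuStream_t

// After this diff, each default-stream allocator carries its own stream:
// GetDefaultStream is a plain member read, and SetDefaultStream lets the
// device context refresh the cached stream from outside (as
// SetDefaultStreamFromDeviceContext does above).
class DefaultStreamHolder {
 public:
  const Stream& GetDefaultStream() const { return default_stream_; }
  void SetDefaultStream(const Stream& stream) { default_stream_ = stream; }

 private:
  Stream default_stream_ = nullptr;
};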
@@ -635,6 +688,26 @@ class AllocatorFacadePrivate {
         /* in_cuda_graph_capturing = */ !allow_free_idle_chunk_);
   }
+  void WrapStreamSafeCUDAAllocatorForDefault() {
+    for (auto& pair : allocators_) {
+      auto& place = pair.first;
+      if (platform::is_gpu_place(place)) {
+        std::shared_ptr<StreamSafeCUDAAllocator>&& allocator =
+            std::make_shared<StreamSafeCUDAAllocator>(
+                pair.second, place, /* default_stream = */ nullptr,
+                /* in_cuda_graph_capturing = */ !allow_free_idle_chunk_);
+        pair.second = allocator;
+
+        // NOTE(Ruibiao): A tricky implementation that gives
+        // StreamSafeCUDAAllocator the ability to interact with the outside
+        // world, i.e., to have its default stream changed from outside.
+        default_stream_safe_cuda_allocators_[place] = allocator;
+        VLOG(8) << "WrapStreamSafeCUDAAllocator for " << place
+                << ", allocator address = " << pair.second.get();
+      }
+    }
+  }
+
   void WrapCUDARetryAllocator(platform::CUDAPlace p, gpuStream_t stream,
                               size_t retry_time) {
     PADDLE_ENFORCE_GT(
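A sketch of the "wrap in place, remember the concrete handle" pattern that WrapStreamSafeCUDAAllocatorForDefault uses: the side map keeps the derived type so the default stream can be changed later without downcasting through the base-class map. Types here are illustrative stand-ins:

#include <map>
#include <memory>

struct Allocator {
  virtual ~Allocator() = default;
};

struct StreamSafeAllocator : Allocator {
  explicit StreamSafeAllocator(std::shared_ptr<Allocator> wrapped)
      : wrapped_(std::move(wrapped)) {}
  void SetDefaultStream(void* stream) { default_stream_ = stream; }

  std::shared_ptr<Allocator> wrapped_;
  void* default_stream_ = nullptr;
};

std::map<int, std::shared_ptr<Allocator>> allocators;          // keyed by place
std::map<int, std::shared_ptr<StreamSafeAllocator>> defaults;  // concrete type

void WrapForDefault() {
  for (auto& pair : allocators) {
    auto wrapper = std::make_shared<StreamSafeAllocator>(pair.second);
    pair.second = wrapper;           // callers keep using the base-class map
    defaults[pair.first] = wrapper;  // side map keeps the concrete handle so
                                     // the default stream can be set later
  }
}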
@@ -813,7 +886,6 @@ class AllocatorFacadePrivate {
 #endif
   }
-  // NOTE(Ruibiao): Old single-stream version, will be removed later
   void WrapCUDARetryAllocator(size_t retry_time) {
     PADDLE_ENFORCE_GT(
         retry_time, 0,
@@ -828,6 +900,8 @@ class AllocatorFacadePrivate {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   // a standalone CUDA allocator to support multi-stream GC in new executor
+  std::map<platform::Place, std::shared_ptr<StreamSafeCUDAAllocator>>
+      default_stream_safe_cuda_allocators_;
   CUDAAllocatorMap cuda_allocators_;
   std::shared_timed_mutex cuda_allocator_mutex_;
 #endif
@@ -870,15 +944,6 @@ AllocatorFacadePrivate* AllocatorFacade::GetPrivate() const {
 const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
     const platform::Place& place) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
-      FLAGS_use_system_allocator == false) {
-    AllocatorFacadePrivate* m = GetPrivate();
-    platform::CUDAPlace cuda_place(place.GetDeviceId());
-    return m->GetAllocator(cuda_place, m->GetDefaultStream(cuda_place));
-  }
-#endif
   return GetPrivate()->GetAllocator(
       place, /* A non-zero num to choose allocator_ */ 1);
 }
@@ -898,19 +963,6 @@ void* AllocatorFacade::GetBasePtr(
   return GetPrivate()->GetBasePtr(allocation);
 }
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
-    const platform::Place& place, const gpuStream_t& stream) {
-  if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
-      FLAGS_use_system_allocator == false) {
-    return GetPrivate()->GetAllocator(place, stream,
-                                      /*create_if_not_found=*/true);
-  }
-  return GetPrivate()->GetAllocator(
-      place, /* A non-zero num to choose allocator_ */ 1);
-}
-#endif
-
 const std::shared_ptr<Allocator>& AllocatorFacade::GetZeroAllocator(
     const platform::Place& place) {
   return GetPrivate()->GetAllocator(place, /* zero size */ 0);
@@ -923,26 +975,10 @@ std::shared_ptr<phi::Allocation> AllocatorFacade::AllocShared(
 AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
                                      size_t size) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
-      size > 0 && FLAGS_use_system_allocator == false) {
-    platform::CUDAPlace cuda_place(place.GetDeviceId());
-    phi::Stream default_stream = phi::Stream(reinterpret_cast<phi::StreamId>(
-        GetPrivate()->GetDefaultStream(cuda_place)));
-    return Alloc(cuda_place, size, default_stream);
-  }
-#endif
   return GetPrivate()->GetAllocator(place, size)->Allocate(size);
 }
 uint64_t AllocatorFacade::Release(const platform::Place& place) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
-      FLAGS_use_system_allocator == false) {
-    platform::CUDAPlace cuda_place(place.GetDeviceId());
-    return Release(cuda_place, GetPrivate()->GetDefaultStream(cuda_place));
-  }
-#endif
   return GetPrivate()
       ->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1)
       ->Release(place);
@@ -1028,6 +1064,17 @@ void AllocatorFacade::RecordStream(std::shared_ptr<phi::Allocation> allocation,
   GetPrivate()->RecordStream(allocation, stream);
 }
+const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
+    const platform::Place& place, const gpuStream_t& stream) {
+  if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
+      FLAGS_use_system_allocator == false) {
+    return GetPrivate()->GetAllocator(place, stream,
+                                      /*create_if_not_found=*/true);
+  }
+  return GetPrivate()->GetAllocator(
+      place, /* A non-zero num to choose allocator_ */ 1);
+}
+
 const gpuStream_t& AllocatorFacade::GetStream(
     const std::shared_ptr<phi::Allocation>& allocation) const {
   PADDLE_ENFORCE_EQ(
@@ -1040,6 +1087,13 @@ const gpuStream_t& AllocatorFacade::GetStream(
   return GetPrivate()->GetStream(allocation);
 }
+void AllocatorFacade::SetDefaultStream(const platform::CUDAPlace& place,
+                                       const gpuStream_t& stream) {
+  if (FLAGS_use_stream_safe_cuda_allocator) {
+    GetPrivate()->SetDefaultStream(place, stream);
+  }
+}
+
 #ifdef PADDLE_WITH_CUDA
 void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) {
   PADDLE_ENFORCE_EQ(GetAllocatorStrategy(), AllocatorStrategy::kAutoGrowth,
@@ -1055,6 +1109,8 @@ void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) {
           "The memory pool of the CUDA Graph with ID %d have been prepared.",
           id));
   allocator.reset(new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false));
+  allocator->SetDefaultStreamFromDeviceContext();
+
   VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id;
 }
......
@@ -55,11 +55,6 @@ class AllocatorFacade {
   void* GetBasePtr(const std::shared_ptr<Allocation>& allocation);
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  const std::shared_ptr<Allocator>& GetAllocator(const platform::Place& place,
-                                                 const gpuStream_t& stream);
-#endif
-
   const std::shared_ptr<Allocator>& GetZeroAllocator(
       const platform::Place& place);
@@ -86,8 +81,12 @@ class AllocatorFacade {
   uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream);
   void RecordStream(std::shared_ptr<Allocation> allocation,
                     const gpuStream_t& stream);
+  const std::shared_ptr<Allocator>& GetAllocator(const platform::Place& place,
+                                                 const gpuStream_t& stream);
   const gpuStream_t& GetStream(
       const std::shared_ptr<Allocation>& allocation) const;
+  void SetDefaultStream(const platform::CUDAPlace& place,
+                        const gpuStream_t& stream);
 #endif
 #ifdef PADDLE_WITH_CUDA
......
@@ -154,6 +154,14 @@ StreamSafeCUDAAllocator::~StreamSafeCUDAAllocator() {
 bool StreamSafeCUDAAllocator::IsAllocThreadSafe() const { return true; }
+const gpuStream_t& StreamSafeCUDAAllocator::GetDefaultStream() const {
+  return default_stream_;
+}
+
+void StreamSafeCUDAAllocator::SetDefaultStream(const gpuStream_t& stream) {
+  default_stream_ = stream;
+}
+
 phi::Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) {
   platform::RecordEvent("StreamSafeCUDAAllocator::Allocate",
                         platform::TracerEventType::UserDefined, 9 /*level*/);
@@ -187,12 +195,8 @@ void StreamSafeCUDAAllocator::FreeImpl(phi::Allocation* allocation) {
   platform::RecordEvent("StreamSafeCUDAAllocator::Free",
                         platform::TracerEventType::UserDefined, 9 /*level*/);
   StreamSafeCUDAAllocation* stream_safe_cuda_allocation =
-      dynamic_cast<StreamSafeCUDAAllocation*>(allocation);
-  PADDLE_ENFORCE_NOT_NULL(stream_safe_cuda_allocation,
-                          platform::errors::InvalidArgument(
-                              "Failed to dynamic cast %p from Allocation* to "
-                              "StreamSafeCUDAAllocation*",
-                              allocation));
+      static_cast<StreamSafeCUDAAllocation*>(allocation);
   VLOG(8) << "Try free allocation " << stream_safe_cuda_allocation->ptr();
   if (stream_safe_cuda_allocation->CanBeFreed()) {
     VLOG(9) << "Directly delete allocation";
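A minimal model of why the dynamic_cast can be dropped above: FreeImpl only ever receives allocations produced by this allocator's own AllocateImpl, so the downcast cannot fail, and static_cast removes the RTTI check and the null-check from the hot free path. Illustrative types only:

// Sketch of the cast change, with stand-in types rather than Paddle's.
struct Allocation {
  virtual ~Allocation() = default;
};
struct StreamSafeAllocation : Allocation {
  void* ptr = nullptr;
};

void FreeImpl(Allocation* allocation) {
  // Safe by construction: every Allocation routed here was created as a
  // StreamSafeAllocation by the matching AllocateImpl.
  auto* stream_safe = static_cast<StreamSafeAllocation*>(allocation);
  delete stream_safe;
}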
@@ -221,6 +225,12 @@ uint64_t StreamSafeCUDAAllocator::ReleaseImpl(const platform::Place& place) {
 }
 void StreamSafeCUDAAllocator::ProcessUnfreedAllocations() {
+  // NOTE(Ruibiao): This condition reduces lock contention. It does not need
+  // to be thread-safe, since occasional misjudgments here are permissible.
+  if (unfreed_allocations_.empty()) {
+    return;
+  }
+
   std::lock_guard<SpinLock> lock_guard(unfreed_allocation_lock_);
   for (auto it = unfreed_allocations_.begin();
        it != unfreed_allocations_.end();) {
......
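The unlocked empty() check above trades strict accuracy for less lock traffic: a stale read only postpones processing to a later call, and all mutation still happens under the lock. A self-contained sketch of the same early-exit pattern, using std::mutex in place of Paddle's SpinLock:

#include <list>
#include <mutex>

std::list<int> unfreed_allocations;  // stands in for unfreed_allocations_
std::mutex spin_lock;                // stands in for the SpinLock

void ProcessUnfreedAllocations() {
  // Unsynchronized early exit, mirroring the note above: a stale empty()
  // result merely delays processing; it never corrupts state, because all
  // mutation happens under the lock below.
  if (unfreed_allocations.empty()) {
    return;
  }
  std::lock_guard<std::mutex> guard(spin_lock);
  for (auto it = unfreed_allocations.begin();
       it != unfreed_allocations.end();) {
    it = unfreed_allocations.erase(it);  // release entries whose work is done
  }
}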
@@ -64,7 +64,10 @@ class StreamSafeCUDAAllocator
                           platform::CUDAPlace place, gpuStream_t default_stream,
                           bool in_cuda_graph_capturing = false);
   ~StreamSafeCUDAAllocator();
+
   bool IsAllocThreadSafe() const override;
+  const gpuStream_t &GetDefaultStream() const;
+  void SetDefaultStream(const gpuStream_t &stream);
  protected:
   phi::Allocation *AllocateImpl(size_t size) override;
......
@@ -159,10 +159,8 @@ inline void EmplaceDeviceContext(
                       cuda_ctx,
                       platform::errors::InvalidArgument(
                           "Failed to dynamic_cast dev_ctx into CUDADeviceContext."));
-          // Note: A trick method to init context, why GetAllocator interface
-          // needs a stream parameter?
           dev_ctx->SetAllocator(memory::allocation::AllocatorFacade::Instance()
-                                    .GetAllocator(p, cuda_ctx->stream())
+                                    .GetAllocator(p)
                                     .get());
           cuda_ctx->PartialInitWithAllocator();
           dev_ctx->SetGenerator(
@@ -517,10 +515,10 @@ CUDAContext::~CUDAContext() {
 CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : phi::GPUContext(place) {
   phi::GPUContext::PartialInitWithoutAllocator();
   cuda_stream_.reset(new stream::CUDAStream(phi::GPUContext::stream(), place));
-  workspace_.reset(new phi::DnnWorkspaceHandle(
-      memory::allocation::AllocatorFacade::Instance()
-          .GetAllocator(place, phi::GPUContext::stream())
-          .get()));
+  auto& instance = memory::allocation::AllocatorFacade::Instance();
+  instance.SetDefaultStream(place, phi::GPUContext::stream());
+  workspace_.reset(
+      new phi::DnnWorkspaceHandle(instance.GetAllocator(place).get()));
 }
 CUDADeviceContext::~CUDADeviceContext() = default;
@@ -618,7 +616,7 @@ phi::DnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const {
     // return workspace_.get();
     return phi::DnnWorkspaceHandle(
         memory::allocation::AllocatorFacade::Instance()
-            .GetAllocator(GetPlace(), phi::GPUContext::stream())
+            .GetAllocator(GetPlace())
             .get());
   }
   return phi::GPUContext::cudnn_workspace_handle();
......