Unverified commit 4c0511fa, authored by From00, committed by GitHub

Support cuda graph in StreamSafeCudaAllocator (#39594)

* Support cuda graph in StreamSafeCudaAllocator

* Fix CI error

* Arrange AllocatorFacade

* Fix CI error

* Fix CI error

* Fix ROCM Compile error

* Fix ROCM Compile error
Parent commit: 97ccaa79
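This change routes every allocation made while a thread is capturing a CUDA Graph to a per-graph memory pool (a dedicated AllocatorFacadePrivate selected through AllocatorFacade::GetPrivate()), and it teaches StreamSafeCUDAAllocation to defer event recording until capture has finished. Below is a minimal usage sketch of what the change enables; it mirrors the new StreamSafeCUDAAllocTest::CUDAGraphRun test at the bottom of this diff and is illustrative only: the buffer size, the kernel placeholder, and the omitted error checks are assumptions, not part of the commit.

// Hedged sketch: allocate through the stream-safe allocator while a CUDA Graph
// is being captured, then replay the captured graph. Every API used here
// appears in this PR (memory.h, cuda_graph_with_memory_pool.h).
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"

void CaptureAndReplay() {
  paddle::platform::CUDAPlace place;
  // Begin capture; the facade prepares a dedicated memory pool for this graph.
  paddle::platform::BeginCUDAGraphCapture(place, cudaStreamCaptureModeGlobal);

  // Allocations made on the capturing thread come from the per-graph pool and
  // are wrapped by a StreamSafeCUDAAllocator created with
  // in_cuda_graph_capturing = true.
  std::shared_ptr<phi::Allocation> buf =
      paddle::memory::AllocShared(place, 1024 /* example size */);

  gpuStream_t other_stream;
  cudaStreamCreate(&other_stream);
  // ... launch kernels on other_stream that read or write buf->ptr() ...

  // Safe during capture: the stream is only remembered in
  // graph_capturing_stream_set_; the event is recorded after capture ends.
  paddle::memory::RecordStream(buf, other_stream);

  std::unique_ptr<paddle::platform::CUDAGraph> graph =
      paddle::platform::EndCUDAGraphCapture();
  graph->Replay();  // replay the captured kernels and memory operations
  cudaStreamDestroy(other_stream);
}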
@@ -17,7 +17,7 @@ if (WITH_GPU)
     nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
     nv_library(cuda_managed_allocator SRCS cuda_managed_allocator.cc DEPS allocator cuda_device_guard gpu_info)
     nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
-    nv_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator)
+    nv_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator cuda_graph)
     nv_library(thread_local_allocator SRCS thread_local_allocator.cc DEPS allocator)
     cc_test(thread_local_allocator_test SRCS thread_local_allocator_test.cc DEPS thread_local_allocator)
......
@@ -210,12 +210,7 @@ class AllocatorFacadePrivate {
     InitNaiveBestFitCPUAllocator();
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     allow_free_idle_chunk_ = allow_free_idle_chunk;
-    if (FLAGS_use_stream_safe_cuda_allocator) {
-      for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount();
-           ++dev_id) {
-        InitStreamSafeCUDAAllocator(platform::CUDAPlace(dev_id), nullptr);
-      }
-    } else {
+    if (!FLAGS_use_stream_safe_cuda_allocator) {
       for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount();
            ++dev_id) {
         InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id),
@@ -298,6 +293,12 @@ class AllocatorFacadePrivate {
     }
     CheckAllocThreadSafe();
+#ifdef PADDLE_WITH_CUDA
+    if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
+      WrapCUDAGraphAllocator();
+    }
+#endif
   }
   inline const std::shared_ptr<Allocator>& GetAllocator(
@@ -388,39 +389,6 @@ class AllocatorFacadePrivate {
         allocation.get()));
     return stream_safe_cuda_allocation->GetOwningStream();
   }
#ifdef PADDLE_WITH_CUDA
void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) {
PADDLE_ENFORCE_EQ(strategy_, AllocatorStrategy::kAutoGrowth,
platform::errors::InvalidArgument(
"CUDA Graph is only supported when the "
"FLAGS_allocator_strategy=\"auto_growth\", but got "
"FLAGS_allocator_strategy=\"%s\"",
FLAGS_allocator_strategy));
auto& allocator = cuda_graph_allocator_map_[id];
PADDLE_ENFORCE_EQ(
allocator.get(), nullptr,
platform::errors::InvalidArgument(
"The memory pool of the CUDA Graph with ID %d have been prepared.",
id));
allocator.reset(
new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false));
for (auto& item : allocator->allocators_) {
auto& old_allocator = item.second;
old_allocator = CUDAGraphAllocator::Create(old_allocator);
}
VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id;
}
void RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) {
auto iter = cuda_graph_allocator_map_.find(id);
PADDLE_ENFORCE_NE(iter, cuda_graph_allocator_map_.end(),
platform::errors::InvalidArgument(
"Cannot find CUDA Graph with ID = %d", id));
cuda_graph_allocator_map_.erase(iter);
VLOG(10) << "Remove memory pool of CUDA Graph with ID " << id;
}
#endif
#endif #endif
private: private:
...@@ -439,24 +407,7 @@ class AllocatorFacadePrivate { ...@@ -439,24 +407,7 @@ class AllocatorFacadePrivate {
platform::Place place_; platform::Place place_;
}; };
-  const AllocatorMap& GetAllocatorMap() {
-#ifdef PADDLE_WITH_CUDA
-    if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
-      auto id = platform::CUDAGraph::CapturingID();
-      auto iter = cuda_graph_allocator_map_.find(id);
-      PADDLE_ENFORCE_NE(
-          iter, cuda_graph_allocator_map_.end(),
-          platform::errors::PermissionDenied(
-              "No memory pool is prepared for CUDA Graph capturing."));
-      VLOG(10) << "Choose CUDA Graph memory pool to allocate memory";
-      return iter->second->allocators_;
-    } else {
-      return allocators_;
-    }
-#else
-    return allocators_;
-#endif
-  }
+  const AllocatorMap& GetAllocatorMap() { return allocators_; }
void InitNaiveBestFitCPUAllocator() { void InitNaiveBestFitCPUAllocator() {
allocators_[platform::CPUPlace()] = allocators_[platform::CPUPlace()] =
...@@ -672,10 +623,10 @@ class AllocatorFacadePrivate { ...@@ -672,10 +623,10 @@ class AllocatorFacadePrivate {
} }
   void WrapStreamSafeCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) {
-    const std::shared_ptr<Allocator>& underlying_allocator =
-        cuda_allocators_[p][stream];
-    cuda_allocators_[p][stream] = std::make_shared<StreamSafeCUDAAllocator>(
-        underlying_allocator, p, stream);
+    std::shared_ptr<Allocator>& allocator = cuda_allocators_[p][stream];
+    allocator = std::make_shared<StreamSafeCUDAAllocator>(
+        allocator, p, stream,
+        /* in_cuda_graph_capturing = */ !allow_free_idle_chunk_);
   }
   void WrapCUDARetryAllocator(platform::CUDAPlace p, gpuStream_t stream,
@@ -684,10 +635,19 @@ class AllocatorFacadePrivate {
         retry_time, 0,
         platform::errors::InvalidArgument(
             "Retry time should be larger than 0, but got %d", retry_time));
-    std::shared_ptr<Allocator> allocator = cuda_allocators_[p][stream];
+    std::shared_ptr<Allocator>& allocator = cuda_allocators_[p][stream];
     allocator = std::make_shared<RetryAllocator>(allocator, retry_time);
   }
#ifdef PADDLE_WITH_CUDA
void WrapCUDAGraphAllocator() {
for (auto& item : allocators_) {
auto& allocator = item.second;
allocator = CUDAGraphAllocator::Create(allocator);
}
}
#endif
static void CheckCUDAAllocThreadSafe(const CUDAAllocatorMap& allocators) { static void CheckCUDAAllocThreadSafe(const CUDAAllocatorMap& allocators) {
for (auto& place_pair : allocators) { for (auto& place_pair : allocators) {
for (auto& stream_pair : place_pair.second) { for (auto& stream_pair : place_pair.second) {
...@@ -864,10 +824,6 @@ class AllocatorFacadePrivate { ...@@ -864,10 +824,6 @@ class AllocatorFacadePrivate {
// a standalone CUDA allocator to support multi-stream GC in new executor // a standalone CUDA allocator to support multi-stream GC in new executor
CUDAAllocatorMap cuda_allocators_; CUDAAllocatorMap cuda_allocators_;
std::shared_timed_mutex cuda_allocator_mutex_; std::shared_timed_mutex cuda_allocator_mutex_;
#ifdef PADDLE_WITH_CUDA
std::unordered_map<CUDAGraphID, std::unique_ptr<AllocatorFacadePrivate>>
cuda_graph_allocator_map_;
#endif
#endif #endif
AllocatorStrategy strategy_; AllocatorStrategy strategy_;
AllocatorMap allocators_; AllocatorMap allocators_;
@@ -886,8 +842,24 @@ AllocatorFacade::AllocatorFacade() : m_(new AllocatorFacadePrivate()) {}
 AllocatorFacade::~AllocatorFacade() {}

 AllocatorFacade& AllocatorFacade::Instance() {
-  static AllocatorFacade instance;
-  return instance;
+  static AllocatorFacade* instance = new AllocatorFacade;
+  return *instance;
+}
+
+AllocatorFacadePrivate* AllocatorFacade::GetPrivate() const {
+#ifdef PADDLE_WITH_CUDA
+  if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
+    auto id = platform::CUDAGraph::CapturingID();
+    auto iter = cuda_graph_map_.find(id);
+    PADDLE_ENFORCE_NE(
+        iter, cuda_graph_map_.end(),
+        platform::errors::PermissionDenied(
+            "No memory pool is prepared for CUDA Graph capturing."));
+    VLOG(10) << "Choose CUDA Graph memory pool";
+    return iter->second.get();
+  }
+#endif
+  return m_;
 }
 const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
@@ -895,19 +867,14 @@ const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
       FLAGS_use_system_allocator == false) {
-#ifdef PADDLE_WITH_CUDA
-    if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
-      return m_->GetAllocator(place,
-                              /* A non-zero num to choose allocator_ */ 1);
-    }
-#endif
+    AllocatorFacadePrivate* m = GetPrivate();
     platform::CUDAPlace cuda_place(place.GetDeviceId());
-    return m_->GetAllocator(cuda_place, m_->GetDefaultStream(cuda_place));
+    return m->GetAllocator(cuda_place, m->GetDefaultStream(cuda_place));
   }
 #endif
-  return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
+  return GetPrivate()->GetAllocator(
+      place, /* A non-zero num to choose allocator_ */ 1);
 }
 void* AllocatorFacade::GetBasePtr(
@@ -922,7 +889,7 @@ void* AllocatorFacade::GetBasePtr(
                         "GetBasePtr() is only implemented for CUDAPlace(), not "
                         "suppot place: %s",
                         allocation->place()));
-  return m_->GetBasePtr(allocation);
+  return GetPrivate()->GetBasePtr(allocation);
 }
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
@@ -930,21 +897,17 @@ const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
     const platform::Place& place, const gpuStream_t& stream) {
   if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
       FLAGS_use_system_allocator == false) {
-#ifdef PADDLE_WITH_CUDA
-    if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
-      return m_->GetAllocator(place,
-                              /* A non-zero num to choose allocator_ */ 1);
-    }
-#endif
-    return m_->GetAllocator(place, stream, /*create_if_not_found=*/true);
+    return GetPrivate()->GetAllocator(place, stream,
+                                      /*create_if_not_found=*/true);
   }
-  return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
+  return GetPrivate()->GetAllocator(
+      place, /* A non-zero num to choose allocator_ */ 1);
 }
 #endif

 const std::shared_ptr<Allocator>& AllocatorFacade::GetZeroAllocator(
     const platform::Place& place) {
-  return m_->GetAllocator(place, /* zero size */ 0);
+  return GetPrivate()->GetAllocator(place, /* zero size */ 0);
 }
 std::shared_ptr<phi::Allocation> AllocatorFacade::AllocShared(
@@ -957,43 +920,30 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
       size > 0 && FLAGS_use_system_allocator == false) {
-#ifdef PADDLE_WITH_CUDA
-    if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
-      return m_->GetAllocator(place, size)->Allocate(size);
-    }
-#endif
     platform::CUDAPlace cuda_place(place.GetDeviceId());
-    return Alloc(cuda_place, size, m_->GetDefaultStream(cuda_place));
+    phi::Stream default_stream = phi::Stream(reinterpret_cast<phi::StreamId>(
+        GetPrivate()->GetDefaultStream(cuda_place)));
+    return Alloc(cuda_place, size, default_stream);
   }
 #endif
-  return m_->GetAllocator(place, size)->Allocate(size);
+  return GetPrivate()->GetAllocator(place, size)->Allocate(size);
 }
 uint64_t AllocatorFacade::Release(const platform::Place& place) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
       FLAGS_use_system_allocator == false) {
-#ifdef PADDLE_WITH_CUDA
-    if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
-      return m_
-          ->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1)
-          ->Release(place);
-    }
-#endif
     platform::CUDAPlace cuda_place(place.GetDeviceId());
-    return Release(cuda_place, m_->GetDefaultStream(cuda_place));
+    return Release(cuda_place, GetPrivate()->GetDefaultStream(cuda_place));
   }
 #endif
-  return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1)
+  return GetPrivate()
+      ->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1)
       ->Release(place);
 }
 std::shared_ptr<phi::Allocation> AllocatorFacade::AllocShared(
     const platform::Place& place, size_t size, const phi::Stream& stream) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   PADDLE_ENFORCE_EQ(
       FLAGS_use_stream_safe_cuda_allocator, true,
       platform::errors::Unimplemented(
@@ -1001,71 +951,53 @@ std::shared_ptr<phi::Allocation> AllocatorFacade::AllocShared(
           "multi-stream 'AllocaShared' function. To enable it, you can enter"
          "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
           "terminal."));
-#ifdef PADDLE_WITH_CUDA
-  if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
-    PADDLE_THROW(platform::errors::Unavailable(
-        "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
-  }
-#endif
-  gpuStream_t s = reinterpret_cast<gpuStream_t>(stream.id());
-  return std::shared_ptr<phi::Allocation>(Alloc(place, size, s));
-#else
-  PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU."));
-#endif
+  return std::shared_ptr<phi::Allocation>(Alloc(place, size, stream));
 }
-bool AllocatorFacade::InSameStream(
-    const std::shared_ptr<phi::Allocation>& allocation,
-    const phi::Stream& stream) {
+AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size,
+                                     const phi::Stream& stream) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   PADDLE_ENFORCE_EQ(
       FLAGS_use_stream_safe_cuda_allocator, true,
       platform::errors::Unimplemented(
           "StreamSafeCUDAAllocator is disabled, you should not call this "
-          "multi-stream 'InSameStream' function. To enable it, you can enter"
+          "multi-stream 'Alloc' function. To enable it, you can enter"
          "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
           "terminal."));
-#ifdef PADDLE_WITH_CUDA
-  if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
-    PADDLE_THROW(platform::errors::Unavailable(
-        "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
-  }
-#endif
-  gpuStream_t s = reinterpret_cast<gpuStream_t>(stream.id());
-  return s == GetStream(allocation);
+  platform::CUDAPlace p(place.GetDeviceId());
+  if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) {
+    gpuStream_t s = reinterpret_cast<gpuStream_t>(stream.id());
+    return GetPrivate()
+        ->GetAllocator(p, s, /* create_if_not_found = */ true)
+        ->Allocate(size);
+  } else {
+    return GetPrivate()->GetAllocator(p, size)->Allocate(size);
+  }
 #else
   PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU."));
 #endif
 }

-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size,
-                                     const gpuStream_t& stream) {
+bool AllocatorFacade::InSameStream(
+    const std::shared_ptr<phi::Allocation>& allocation,
+    const phi::Stream& stream) {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   PADDLE_ENFORCE_EQ(
       FLAGS_use_stream_safe_cuda_allocator, true,
       platform::errors::Unimplemented(
           "StreamSafeCUDAAllocator is disabled, you should not call this "
-          "multi-stream 'Alloc' function. To enable it, you can enter"
+          "multi-stream 'InSameStream' function. To enable it, you can enter"
          "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
           "terminal."));
-#ifdef PADDLE_WITH_CUDA
-  if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
-    PADDLE_THROW(platform::errors::Unavailable(
-        "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
-  }
-#endif
-  platform::CUDAPlace p(place.GetDeviceId());
-  if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) {
-    return m_->GetAllocator(p, stream, /* create_if_not_found = */ true)
-        ->Allocate(size);
-  } else {
-    return m_->GetAllocator(p, size)->Allocate(size);
-  }
+  gpuStream_t s = reinterpret_cast<gpuStream_t>(stream.id());
+  return s == GetStream(allocation);
+#else
+  PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU."));
+#endif
 }
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place,
                                   const gpuStream_t& stream) {
   PADDLE_ENFORCE_EQ(
@@ -1075,15 +1007,7 @@ uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place,
          "multi-stream 'Release' function. To enable it, you can enter"
          "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
           "terminal."));
-#ifdef PADDLE_WITH_CUDA
-  if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
-    PADDLE_THROW(platform::errors::Unavailable(
-        "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
-  }
-#endif
-  return m_->GetAllocator(place, stream)->Release(place);
+  return GetPrivate()->GetAllocator(place, stream)->Release(place);
 }
 void AllocatorFacade::RecordStream(std::shared_ptr<phi::Allocation> allocation,
@@ -1095,15 +1019,7 @@ void AllocatorFacade::RecordStream(std::shared_ptr<phi::Allocation> allocation,
                         "'RecordStream' function. To enable it, you can enter"
                         "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
                         "terminal."));
-#ifdef PADDLE_WITH_CUDA
-  if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
-    PADDLE_THROW(platform::errors::Unavailable(
-        "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
-  }
-#endif
-  m_->RecordStream(allocation, stream);
+  GetPrivate()->RecordStream(allocation, stream);
 }
 const gpuStream_t& AllocatorFacade::GetStream(
@@ -1115,24 +1031,34 @@ const gpuStream_t& AllocatorFacade::GetStream(
                         "'GetStream' function. To enable it, you can enter"
                         "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
                         "terminal."));
-#ifdef PADDLE_WITH_CUDA
-  if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
-    PADDLE_THROW(platform::errors::Unavailable(
-        "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
-  }
-#endif
-  return m_->GetStream(allocation);
+  return GetPrivate()->GetStream(allocation);
 }
 #ifdef PADDLE_WITH_CUDA
 void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) {
-  return m_->PrepareMemoryPoolForCUDAGraph(id);
+  PADDLE_ENFORCE_EQ(GetAllocatorStrategy(), AllocatorStrategy::kAutoGrowth,
+                    platform::errors::InvalidArgument(
+                        "CUDA Graph is only supported when the "
+                        "FLAGS_allocator_strategy=\"auto_growth\", but got "
+                        "FLAGS_allocator_strategy=\"%s\"",
+                        FLAGS_allocator_strategy));
+  auto& allocator = cuda_graph_map_[id];
+  PADDLE_ENFORCE_EQ(
+      allocator.get(), nullptr,
+      platform::errors::InvalidArgument(
+          "The memory pool of the CUDA Graph with ID %d have been prepared.",
+          id));
+  allocator.reset(new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false));
+  VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id;
 }

 void AllocatorFacade::RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) {
-  return m_->RemoveMemoryPoolOfCUDAGraph(id);
+  auto iter = cuda_graph_map_.find(id);
+  PADDLE_ENFORCE_NE(iter, cuda_graph_map_.end(),
+                    platform::errors::InvalidArgument(
+                        "Cannot find CUDA Graph with ID = %d", id));
+  cuda_graph_map_.erase(iter);
+  VLOG(10) << "Remove memory pool of CUDA Graph with ID " << id;
 }
 #endif
 #endif
......
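The facade-level plumbing in this file can be read as a small lifecycle: PrepareMemoryPoolForCUDAGraph builds a dedicated AllocatorFacadePrivate for a graph, GetPrivate() routes allocations to it while the current thread is capturing, and RemoveMemoryPoolOfCUDAGraph drops it when the graph goes away. The sketch below spells that lifecycle out; it is an assumption-laden illustration rather than code from this commit. In Paddle these calls are normally issued by the capture helpers in cuda_graph_with_memory_pool, not by user code, and CUDAGraphID is assumed to be the platform-level ID type assigned when capture starts.

#include "paddle/fluid/memory/allocation/allocator_facade.h"

// Hedged sketch of the per-graph memory pool lifecycle managed by AllocatorFacade.
void ManageGraphPool(paddle::platform::CUDAGraphID graph_id) {
  auto& facade = paddle::memory::allocation::AllocatorFacade::Instance();

  // Build a dedicated AllocatorFacadePrivate (allow_free_idle_chunk = false)
  // keyed by graph_id; while this thread is capturing, GetPrivate() resolves
  // to this pool instead of the default m_.
  facade.PrepareMemoryPoolForCUDAGraph(graph_id);

  // ... capture and later replay the graph; allocations made during capture
  // are served from, and kept alive by, the per-graph pool ...

  // Drop the pool once the captured graph is destroyed.
  facade.RemoveMemoryPoolOfCUDAGraph(graph_id);
}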
...@@ -49,6 +49,8 @@ class AllocatorFacade { ...@@ -49,6 +49,8 @@ class AllocatorFacade {
static AllocatorFacade& Instance(); static AllocatorFacade& Instance();
AllocatorFacadePrivate* GetPrivate() const;
const std::shared_ptr<Allocator>& GetAllocator(const platform::Place& place); const std::shared_ptr<Allocator>& GetAllocator(const platform::Place& place);
void* GetBasePtr(const std::shared_ptr<Allocation>& allocation); void* GetBasePtr(const std::shared_ptr<Allocation>& allocation);
...@@ -73,13 +75,14 @@ class AllocatorFacade { ...@@ -73,13 +75,14 @@ class AllocatorFacade {
size_t size, size_t size,
const phi::Stream& stream); const phi::Stream& stream);
AllocationPtr Alloc(const platform::Place& place, size_t size,
const phi::Stream& stream);
bool InSameStream(const std::shared_ptr<Allocation>& allocation, bool InSameStream(const std::shared_ptr<Allocation>& allocation,
const phi::Stream& stream); const phi::Stream& stream);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// TODO(zhiqiu): change gpuStream_t to phi::Stream if needed. // TODO(zhiqiu): change gpuStream_t to phi::Stream if needed.
AllocationPtr Alloc(const platform::Place& place, size_t size,
const gpuStream_t& stream);
uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream); uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream);
void RecordStream(std::shared_ptr<Allocation> allocation, void RecordStream(std::shared_ptr<Allocation> allocation,
const gpuStream_t& stream); const gpuStream_t& stream);
...@@ -96,6 +99,10 @@ class AllocatorFacade { ...@@ -96,6 +99,10 @@ class AllocatorFacade {
private: private:
AllocatorFacade(); AllocatorFacade();
AllocatorFacadePrivate* m_; AllocatorFacadePrivate* m_;
#ifdef PADDLE_WITH_CUDA
std::unordered_map<CUDAGraphID, std::unique_ptr<AllocatorFacadePrivate>>
cuda_graph_map_;
#endif
}; };
} // namespace allocation } // namespace allocation
......
@@ -15,56 +15,52 @@
 #include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h"
 #include "paddle/fluid/platform/profiler/event_tracing.h"

+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h"
+#endif
+
 namespace paddle {
 namespace memory {
 namespace allocation {

 StreamSafeCUDAAllocation::StreamSafeCUDAAllocation(
-    DecoratedAllocationPtr underlying_allocation, gpuStream_t owning_stream)
+    DecoratedAllocationPtr underlying_allocation, gpuStream_t owning_stream,
+    StreamSafeCUDAAllocator* allocator)
     : Allocation(underlying_allocation->ptr(),
                  underlying_allocation->base_ptr(),
                  underlying_allocation->size(), underlying_allocation->place()),
       underlying_allocation_(std::move(underlying_allocation)),
-      owning_stream_(std::move(owning_stream)) {}
+      owning_stream_(std::move(owning_stream)),
+      allocator_(allocator->shared_from_this()) {}
 void StreamSafeCUDAAllocation::RecordStream(const gpuStream_t& stream) {
   VLOG(8) << "Try record stream " << stream << " for address " << ptr();
   if (stream == owning_stream_) {
-    VLOG(9) << "Record the same stream of " << stream;
     return;
   }

   std::lock_guard<SpinLock> lock_guard(outstanding_event_map_lock_);
-  gpuEvent_t record_event;
-  auto it = outstanding_event_map_.find(stream);
-  if (it == outstanding_event_map_.end()) {
-    gpuEvent_t new_event;
 #ifdef PADDLE_WITH_CUDA
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        cudaEventCreateWithFlags(&new_event, cudaEventDisableTiming));
-#else
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        hipEventCreateWithFlags(&new_event, hipEventDisableTiming));
-#endif
-    outstanding_event_map_[stream] = new_event;
-    record_event = new_event;
-    VLOG(9) << "Create a new event " << new_event;
-  } else {
-    record_event = it->second;
-    VLOG(9) << "Reuse event " << record_event;
+  if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
+    graph_capturing_stream_set_.insert(stream);
+    return;
   }
-#ifdef PADDLE_WITH_CUDA
-  PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(record_event, stream));
-#else
-  PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(record_event, stream));
 #endif
-  VLOG(8) << "Record event " << record_event << " to stream " << stream;
+
+  RecordStreamWithNoGraphCapturing(stream);
+  RecordGraphCapturingStreams();
 }
 bool StreamSafeCUDAAllocation::CanBeFreed() {
-  // NOTE(Ruibiao): This function will not execute concurrently,
-  // so outstanding_event_lock_ is not required here
+#ifdef PADDLE_WITH_CUDA
+  if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
+    return graph_capturing_stream_set_.empty() &&
+           outstanding_event_map_.empty();
+  }
+#endif
+  RecordGraphCapturingStreams();
   for (auto it = outstanding_event_map_.begin();
        it != outstanding_event_map_.end(); ++it) {
     gpuEvent_t& event = it->second;
...@@ -98,21 +94,62 @@ const gpuStream_t& StreamSafeCUDAAllocation::GetOwningStream() const { ...@@ -98,21 +94,62 @@ const gpuStream_t& StreamSafeCUDAAllocation::GetOwningStream() const {
return owning_stream_; return owning_stream_;
} }
void StreamSafeCUDAAllocation::RecordGraphCapturingStreams() {
for (gpuStream_t stream : graph_capturing_stream_set_) {
RecordStreamWithNoGraphCapturing(stream);
}
graph_capturing_stream_set_.clear();
}
void StreamSafeCUDAAllocation::RecordStreamWithNoGraphCapturing(
const gpuStream_t& stream) {
gpuEvent_t record_event;
auto it = outstanding_event_map_.find(stream);
if (it == outstanding_event_map_.end()) {
gpuEvent_t new_event;
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_GPU_SUCCESS(
cudaEventCreateWithFlags(&new_event, cudaEventDisableTiming));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
hipEventCreateWithFlags(&new_event, hipEventDisableTiming));
#endif
outstanding_event_map_[stream] = new_event;
record_event = new_event;
VLOG(9) << "Create a new event " << new_event;
} else {
record_event = it->second;
VLOG(9) << "Reuse event " << record_event;
}
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(record_event, stream));
#else
PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(record_event, stream));
#endif
VLOG(8) << "Record event " << record_event << " to stream " << stream;
}
 StreamSafeCUDAAllocator::StreamSafeCUDAAllocator(
     std::shared_ptr<Allocator> underlying_allocator, platform::CUDAPlace place,
-    gpuStream_t default_stream)
+    gpuStream_t default_stream, bool in_cuda_graph_capturing)
     : underlying_allocator_(std::move(underlying_allocator)),
       place_(std::move(place)),
-      default_stream_(std::move(default_stream)) {
-  std::lock_guard<SpinLock> lock_guard(allocator_map_lock_);
-  allocator_map_[place].emplace_back(this);
+      default_stream_(std::move(default_stream)),
+      in_cuda_graph_capturing_(in_cuda_graph_capturing) {
+  if (LIKELY(!in_cuda_graph_capturing)) {
+    std::lock_guard<SpinLock> lock_guard(allocator_map_lock_);
+    allocator_map_[place].emplace_back(this);
+  }
 }
 StreamSafeCUDAAllocator::~StreamSafeCUDAAllocator() {
-  std::lock_guard<SpinLock> lock_guard(allocator_map_lock_);
-  std::vector<StreamSafeCUDAAllocator*>& allocators = allocator_map_[place_];
-  allocators.erase(std::remove(allocators.begin(), allocators.end(), this),
-                   allocators.end());
+  if (LIKELY(!in_cuda_graph_capturing_)) {
+    std::lock_guard<SpinLock> lock_guard(allocator_map_lock_);
+    std::vector<StreamSafeCUDAAllocator*>& allocators = allocator_map_[place_];
+    allocators.erase(std::remove(allocators.begin(), allocators.end(), this),
+                     allocators.end());
+  }
 }
bool StreamSafeCUDAAllocator::IsAllocThreadSafe() const { return true; } bool StreamSafeCUDAAllocator::IsAllocThreadSafe() const { return true; }
@@ -140,7 +177,7 @@ phi::Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) {
   }
   StreamSafeCUDAAllocation* allocation = new StreamSafeCUDAAllocation(
       static_unique_ptr_cast<Allocation>(std::move(underlying_allocation)),
-      default_stream_);
+      default_stream_, this);
   VLOG(8) << "Allocate " << allocation->size() << " bytes at address "
           << allocation->ptr();
   return allocation;
@@ -157,22 +194,27 @@ void StreamSafeCUDAAllocator::FreeImpl(phi::Allocation* allocation) {
                         "StreamSafeCUDAAllocation*",
                         allocation));
   VLOG(8) << "Try free allocation " << stream_safe_cuda_allocation->ptr();
+  std::lock_guard<SpinLock> lock_guard(unfreed_allocation_lock_);
   if (stream_safe_cuda_allocation->CanBeFreed()) {
     VLOG(9) << "Directly delete allocation";
     delete stream_safe_cuda_allocation;
   } else {
     VLOG(9) << "Put into unfreed_allocation list";
-    std::lock_guard<SpinLock> lock_guard(unfreed_allocation_lock_);
     unfreed_allocations_.emplace_back(stream_safe_cuda_allocation);
   }
 }
 uint64_t StreamSafeCUDAAllocator::ReleaseImpl(const platform::Place& place) {
+  if (UNLIKELY(in_cuda_graph_capturing_)) {
+    VLOG(7) << "Memory release forbidden in CUDA Graph Capturing";
+    return 0;
+  }
+
   std::lock_guard<SpinLock> lock_guard(allocator_map_lock_);
   std::vector<StreamSafeCUDAAllocator*>& allocators = allocator_map_[place];
   uint64_t released_size = 0;
   for (StreamSafeCUDAAllocator* allocator : allocators) {
-    released_size += allocator->ProcessUnfreedAllocationsWithRelease();
+    released_size += allocator->ProcessUnfreedAllocationsAndRelease();
   }
   VLOG(8) << "Release " << released_size << " bytes memory from all streams";
   return released_size;
@@ -191,7 +233,7 @@ void StreamSafeCUDAAllocator::ProcessUnfreedAllocations() {
   }
 }

-uint64_t StreamSafeCUDAAllocator::ProcessUnfreedAllocationsWithRelease() {
+uint64_t StreamSafeCUDAAllocator::ProcessUnfreedAllocationsAndRelease() {
   ProcessUnfreedAllocations();
   return underlying_allocator_->Release(place_);
 }
......
......
@@ -14,10 +14,9 @@
 #pragma once

-#include <deque>
 #include <list>
 #include <map>
-#include <mutex>
+#include <set>

 #include "paddle/fluid/memory/allocation/allocator.h"
 #include "paddle/fluid/memory/allocation/spin_lock.h"
 #include "paddle/fluid/platform/place.h"
@@ -32,27 +31,38 @@ namespace paddle {
 namespace memory {
 namespace allocation {

+class StreamSafeCUDAAllocator;
+
 class StreamSafeCUDAAllocation : public Allocation {
  public:
   StreamSafeCUDAAllocation(DecoratedAllocationPtr underlying_allocation,
-                           gpuStream_t owning_stream);
+                           gpuStream_t owning_stream,
+                           StreamSafeCUDAAllocator *allocator);
+
   void RecordStream(const gpuStream_t &stream);
   bool CanBeFreed();
   const gpuStream_t &GetOwningStream() const;

  private:
+  void RecordGraphCapturingStreams();
+  void RecordStreamWithNoGraphCapturing(const gpuStream_t &stream);
+
   DecoratedAllocationPtr underlying_allocation_;
+  std::set<gpuStream_t> graph_capturing_stream_set_;
   std::map<gpuStream_t, gpuEvent_t> outstanding_event_map_;
   gpuStream_t owning_stream_;
   SpinLock outstanding_event_map_lock_;
+  // To be compatible with CUDA Graph, hold the allocator shared_ptr so that
+  // the Allocator will not be destructed before the Allocation
+  std::shared_ptr<Allocator> allocator_;
 };

-class StreamSafeCUDAAllocator : public Allocator {
+class StreamSafeCUDAAllocator
+    : public Allocator,
+      public std::enable_shared_from_this<StreamSafeCUDAAllocator> {
  public:
   StreamSafeCUDAAllocator(std::shared_ptr<Allocator> underlying_allocator,
-                          platform::CUDAPlace place,
-                          gpuStream_t default_stream);
+                          platform::CUDAPlace place, gpuStream_t default_stream,
+                          bool in_cuda_graph_capturing = false);
   ~StreamSafeCUDAAllocator();
   bool IsAllocThreadSafe() const override;
@@ -63,7 +73,7 @@ class StreamSafeCUDAAllocator : public Allocator {
  private:
   void ProcessUnfreedAllocations();
-  uint64_t ProcessUnfreedAllocationsWithRelease();
+  uint64_t ProcessUnfreedAllocationsAndRelease();
static std::map<platform::Place, std::vector<StreamSafeCUDAAllocator *>> static std::map<platform::Place, std::vector<StreamSafeCUDAAllocator *>>
allocator_map_; allocator_map_;
...@@ -74,6 +84,8 @@ class StreamSafeCUDAAllocator : public Allocator { ...@@ -74,6 +84,8 @@ class StreamSafeCUDAAllocator : public Allocator {
gpuStream_t default_stream_; gpuStream_t default_stream_;
std::list<StreamSafeCUDAAllocation *> unfreed_allocations_; std::list<StreamSafeCUDAAllocation *> unfreed_allocations_;
SpinLock unfreed_allocation_lock_; SpinLock unfreed_allocation_lock_;
bool in_cuda_graph_capturing_;
}; };
} // namespace allocation } // namespace allocation
......
...@@ -41,6 +41,11 @@ std::shared_ptr<Allocation> AllocShared(const platform::Place& place, ...@@ -41,6 +41,11 @@ std::shared_ptr<Allocation> AllocShared(const platform::Place& place,
stream); stream);
} }
AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size,
const phi::Stream& stream) {
return allocation::AllocatorFacade::Instance().Alloc(place, size, stream);
}
bool InSameStream(const std::shared_ptr<Allocation>& allocation, bool InSameStream(const std::shared_ptr<Allocation>& allocation,
const phi::Stream& stream) { const phi::Stream& stream) {
return allocation::AllocatorFacade::Instance().InSameStream(allocation, return allocation::AllocatorFacade::Instance().InSameStream(allocation,
...@@ -52,11 +57,6 @@ void* GetBasePtr(const std::shared_ptr<Allocation>& allocation) { ...@@ -52,11 +57,6 @@ void* GetBasePtr(const std::shared_ptr<Allocation>& allocation) {
} }
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size,
const gpuStream_t& stream) {
return allocation::AllocatorFacade::Instance().Alloc(place, size, stream);
}
uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream) { uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream) {
return allocation::AllocatorFacade::Instance().Release(place, stream); return allocation::AllocatorFacade::Instance().Release(place, stream);
} }
......
...@@ -41,15 +41,15 @@ extern std::shared_ptr<Allocation> AllocShared(const platform::Place& place, ...@@ -41,15 +41,15 @@ extern std::shared_ptr<Allocation> AllocShared(const platform::Place& place,
size_t size, size_t size,
const phi::Stream& stream); const phi::Stream& stream);
extern AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size,
const phi::Stream& stream);
extern bool InSameStream(const std::shared_ptr<Allocation>& allocation, extern bool InSameStream(const std::shared_ptr<Allocation>& allocation,
const phi::Stream& stream); const phi::Stream& stream);
extern void* GetBasePtr(const std::shared_ptr<Allocation>& allocation); extern void* GetBasePtr(const std::shared_ptr<Allocation>& allocation);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
extern AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size,
const gpuStream_t& stream);
extern uint64_t Release(const platform::CUDAPlace& place, extern uint64_t Release(const platform::CUDAPlace& place,
const gpuStream_t& stream); const gpuStream_t& stream);
......
...@@ -12,34 +12,35 @@ ...@@ -12,34 +12,35 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
#include <cuda_runtime.h>
#endif
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#endif
 #include <thread>  // NOLINT
 #include <vector>

 #include "gtest/gtest.h"
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
-#include "paddle/fluid/memory/malloc.h"
-#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
+#include "paddle/fluid/memory/memory.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/phi/core/stream.h"

+#ifdef PADDLE_WITH_CUDA
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
+#endif
+
+#ifdef PADDLE_WITH_HIP
+#include <hip/hip_runtime.h>
+#endif
+
 namespace paddle {
 namespace memory {

-__global__ void add_kernel(int *x, int n) {
+// y += (x + 1)
+__global__ void add_kernel(int *x, int *y, int n) {
   int thread_num = gridDim.x * blockDim.x;
   int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
   for (int i = thread_id; i < n; i += thread_num) {
-    atomicAdd(x + i, thread_id);
+    y[i] += x[i] + 1;
   }
 }
...@@ -51,153 +52,6 @@ void CheckMemLeak(const platform::CUDAPlace &place) { ...@@ -51,153 +52,6 @@ void CheckMemLeak(const platform::CUDAPlace &place) {
<< " there may be a memory leak problem"; << " there may be a memory leak problem";
} }
class StreamSafeCUDAAllocTest : public ::testing::Test {
protected:
void SetUp() override {
place_ = platform::CUDAPlace();
stream_num_ = 64;
grid_num_ = 1;
block_num_ = 32;
data_num_ = 131072;
workspace_size_ = data_num_ * sizeof(int);
// alloc workspace for each stream
for (size_t i = 0; i < stream_num_; ++i) {
gpuStream_t stream;
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&stream));
#else
PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&stream));
#endif
std::shared_ptr<Allocation> allocation =
AllocShared(place_, workspace_size_,
phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_GPU_SUCCESS(
cudaMemset(allocation->ptr(), 0, allocation->size()));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
hipMemset(allocation->ptr(), 0, allocation->size()));
#endif
streams_.emplace_back(stream);
workspaces_.emplace_back(allocation);
}
result_ = Alloc(place_, stream_num_ * workspace_size_);
}
void SingleStreamRun(size_t idx) {
// for all stream i,
// stream idx lauch a kernel to add (j % thread_num) to workspaces_[i][j]
for (size_t i = 0; i < stream_num_; ++i) {
int *x = reinterpret_cast<int *>(workspaces_[i]->ptr());
add_kernel<<<grid_num_, block_num_, 0, streams_[idx]>>>(x, data_num_);
RecordStream(workspaces_[i], streams_[idx]);
}
}
void CopyResultAsync() {
for (size_t i = 0; i < stream_num_; ++i) {
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(
reinterpret_cast<int *>(result_->ptr()) + i * data_num_,
workspaces_[i]->ptr(), workspace_size_, cudaMemcpyDeviceToDevice));
#else
PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(
reinterpret_cast<int *>(result_->ptr()) + i * data_num_,
workspaces_[i]->ptr(), workspace_size_, hipMemcpyDeviceToDevice));
#endif
}
}
void MultiStreamRun() {
for (size_t i = 0; i < stream_num_; ++i) {
SingleStreamRun(i);
}
CopyResultAsync();
workspaces_.clear(); // fast_gc
cudaDeviceSynchronize();
}
void MultiThreadMUltiStreamRun() {
std::vector<std::thread> threads;
for (size_t i = 0; i < stream_num_; ++i) {
threads.push_back(
std::thread(&StreamSafeCUDAAllocTest::SingleStreamRun, this, i));
}
for (size_t i = 0; i < stream_num_; ++i) {
threads[i].join();
}
CopyResultAsync();
workspaces_.clear(); // fast_gc
cudaDeviceSynchronize();
}
void CheckResult() {
auto result_host = std::unique_ptr<int[]>(new int[result_->size()]);
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpy(result_host.get(), result_->ptr(),
result_->size(),
cudaMemcpyDeviceToHost));
#else
PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpy(result_host.get(), result_->ptr(),
result_->size(),
hipMemcpyDeviceToHost));
#endif
size_t thread_num = grid_num_ * block_num_;
for (size_t i = 0; i < stream_num_; ++i) {
for (size_t j = 0; j < data_num_; ++j) {
EXPECT_TRUE(result_host[i * stream_num_ + j] ==
(j % thread_num) * stream_num_);
}
}
result_.reset();
}
void TearDown() override {
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize());
#else
PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize());
#endif
for (gpuStream_t stream : streams_) {
Release(place_, stream);
}
for (size_t i = 1; i < stream_num_; ++i) {
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(streams_[i]));
#else
PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(streams_[i]));
#endif
}
CheckMemLeak(place_);
}
size_t stream_num_;
size_t grid_num_;
size_t block_num_;
size_t data_num_;
size_t workspace_size_;
platform::CUDAPlace place_;
std::vector<gpuStream_t> streams_;
std::vector<std::shared_ptr<Allocation>> workspaces_;
allocation::AllocationPtr result_;
};
TEST_F(StreamSafeCUDAAllocTest, CUDAMutilStreamTest) {
MultiStreamRun();
CheckResult();
}
TEST_F(StreamSafeCUDAAllocTest, CUDAMutilThreadMutilStreamTest) {
MultiThreadMUltiStreamRun();
CheckResult();
}
TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) { TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) {
platform::CUDAPlace place = platform::CUDAPlace(); platform::CUDAPlace place = platform::CUDAPlace();
size_t alloc_size = 256; size_t alloc_size = 256;
...@@ -214,7 +68,8 @@ TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) { ...@@ -214,7 +68,8 @@ TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) {
paddle::platform::DeviceContextPool::Instance().Get(place)) paddle::platform::DeviceContextPool::Instance().Get(place))
->stream(); ->stream();
   allocation::AllocationPtr allocation_unique =
-      Alloc(place, alloc_size, default_stream);
+      Alloc(place, alloc_size,
+            phi::Stream(reinterpret_cast<phi::StreamId>(default_stream)));
   EXPECT_GE(allocation_unique->size(), alloc_size);
   EXPECT_EQ(allocation_unique->ptr(), address);
   allocation_unique.reset();
...@@ -303,36 +158,6 @@ TEST(StreamSafeCUDAAllocInterfaceTest, GetStreamInterfaceTest) { ...@@ -303,36 +158,6 @@ TEST(StreamSafeCUDAAllocInterfaceTest, GetStreamInterfaceTest) {
CheckMemLeak(place); CheckMemLeak(place);
} }
#ifdef PADDLE_WITH_CUDA
TEST(StreamSafeCUDAAllocInterfaceTest, CUDAGraphExceptionTest) {
platform::CUDAPlace place = platform::CUDAPlace();
size_t alloc_size = 1;
std::shared_ptr<Allocation> allocation = AllocShared(place, alloc_size);
platform::BeginCUDAGraphCapture(place, cudaStreamCaptureModeGlobal);
EXPECT_THROW(AllocShared(place, alloc_size), paddle::platform::EnforceNotMet);
EXPECT_THROW(Alloc(place, alloc_size), paddle::platform::EnforceNotMet);
EXPECT_THROW(Release(place), paddle::platform::EnforceNotMet);
EXPECT_THROW(allocation::AllocatorFacade::Instance().GetAllocator(place),
paddle::platform::EnforceNotMet);
EXPECT_THROW(
AllocShared(place, alloc_size,
phi::Stream(reinterpret_cast<phi::StreamId>(nullptr))),
paddle::platform::EnforceNotMet);
EXPECT_THROW(Alloc(place, alloc_size, nullptr),
paddle::platform::EnforceNotMet);
EXPECT_THROW(Release(place, nullptr), paddle::platform::EnforceNotMet);
EXPECT_THROW(RecordStream(allocation, nullptr),
paddle::platform::EnforceNotMet);
EXPECT_THROW(GetStream(allocation), paddle::platform::EnforceNotMet);
platform::EndCUDAGraphCapture();
allocation.reset();
Release(place);
CheckMemLeak(place);
}
#endif
TEST(StreamSafeCUDAAllocRetryTest, RetryTest) { TEST(StreamSafeCUDAAllocRetryTest, RetryTest) {
platform::CUDAPlace place = platform::CUDAPlace(); platform::CUDAPlace place = platform::CUDAPlace();
gpuStream_t stream1, stream2; gpuStream_t stream1, stream2;
@@ -348,12 +173,14 @@ TEST(StreamSafeCUDAAllocRetryTest, RetryTest) {
   // so the second alloc will fail and retry
   size_t alloc_size = available_size / 4 * 3;

-  allocation::AllocationPtr allocation1 = Alloc(place, alloc_size, stream1);
+  allocation::AllocationPtr allocation1 = Alloc(
+      place, alloc_size, phi::Stream(reinterpret_cast<phi::StreamId>(stream1)));
   allocation::AllocationPtr allocation2;
   std::thread th([&allocation2, &place, &stream2, alloc_size]() {
     std::this_thread::sleep_for(std::chrono::seconds(1));
-    allocation2 = Alloc(place, alloc_size, stream2);
+    allocation2 = Alloc(place, alloc_size,
+                        phi::Stream(reinterpret_cast<phi::StreamId>(stream2)));
   });
   allocation1.reset();  // free but not release
   th.join();
...@@ -371,5 +198,201 @@ TEST(StreamSafeCUDAAllocRetryTest, RetryTest) { ...@@ -371,5 +198,201 @@ TEST(StreamSafeCUDAAllocRetryTest, RetryTest) {
CheckMemLeak(place); CheckMemLeak(place);
} }
class StreamSafeCUDAAllocTest : public ::testing::Test {
protected:
void SetUp() override {
place_ = platform::CUDAPlace();
stream_num_ = 64;
grid_num_ = 1;
block_num_ = 32;
data_num_ = 131072;
workspace_size_ = data_num_ * sizeof(int);
for (size_t i = 0; i < stream_num_; ++i) {
gpuStream_t stream;
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&stream));
#else
PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&stream));
#endif
std::shared_ptr<phi::Allocation> workspace_allocation =
AllocShared(place_, workspace_size_,
phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
std::shared_ptr<phi::Allocation> result_allocation =
AllocShared(place_, workspace_size_,
phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
std::shared_ptr<phi::Allocation> host_result_allocation =
AllocShared(platform::CPUPlace(), workspace_size_);
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_GPU_SUCCESS(cudaMemset(workspace_allocation->ptr(), 0,
workspace_allocation->size()));
PADDLE_ENFORCE_GPU_SUCCESS(
cudaMemset(result_allocation->ptr(), 0, result_allocation->size()));
#else
PADDLE_ENFORCE_GPU_SUCCESS(hipMemset(workspace_allocation->ptr(), 0,
workspace_allocation->size()));
PADDLE_ENFORCE_GPU_SUCCESS(
hipMemset(result_allocation->ptr(), 0, result_allocation->size()));
#endif
streams_.emplace_back(stream);
workspaces_.emplace_back(workspace_allocation);
results_.emplace_back(result_allocation);
host_results_.emplace_back(host_result_allocation);
}
}
void SingleStreamRun(size_t idx) {
int *y = reinterpret_cast<int *>(results_[idx]->ptr());
int neighbouring_idx = idx > 0 ? idx - 1 : idx;
add_kernel<<<grid_num_, block_num_, 0, streams_[idx]>>>(
reinterpret_cast<int *>(workspaces_[idx]->ptr()), y, data_num_);
add_kernel<<<grid_num_, block_num_, 0, streams_[idx]>>>(
reinterpret_cast<int *>(workspaces_[neighbouring_idx]->ptr()), y,
data_num_);
RecordStream(workspaces_[neighbouring_idx], streams_[idx]);
}
void MultiStreamRun() {
// Must run in reverse order, or the workspace_[i - 1] will be released
// before streams_[i]'s kernel launch
for (int i = stream_num_ - 1; i >= 0; --i) {
SingleStreamRun(i);
workspaces_[i].reset(); // fast GC
}
}
void MultiThreadMultiStreamRun() {
std::vector<std::thread> threads;
for (size_t i = 0; i < stream_num_; ++i) {
threads.push_back(
std::thread(&StreamSafeCUDAAllocTest::SingleStreamRun, this, i));
}
for (size_t i = 0; i < stream_num_; ++i) {
threads[i].join();
}
workspaces_.clear();
}
void CUDAGraphRun() {
testing_cuda_graph_ = true;
platform::BeginCUDAGraphCapture(platform::CUDAPlace(),
cudaStreamCaptureModeGlobal);
std::shared_ptr<Allocation> data_allocation =
AllocShared(platform::CUDAPlace(), workspace_size_);
std::shared_ptr<Allocation> result_allocation =
AllocShared(platform::CUDAPlace(), workspace_size_);
int *data = static_cast<int *>(data_allocation->ptr());
int *result = static_cast<int *>(result_allocation->ptr());
gpuStream_t main_stream = GetStream(data_allocation);
gpuStream_t other_stream;
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&other_stream));
add_kernel<<<grid_num_, block_num_, 0, main_stream>>>(data, result,
data_num_);
RecordStream(data_allocation, other_stream);
std::unique_ptr<platform::CUDAGraph> cuda_graph =
platform::EndCUDAGraphCapture();
int replay_times = 10;
for (int i = 0; i < replay_times; ++i) {
cuda_graph->Replay();
}
std::shared_ptr<Allocation> host_result_allocation =
AllocShared(platform::CPUPlace(), workspace_size_);
Copy(host_result_allocation->place(), host_result_allocation->ptr(),
result_allocation->place(), result_allocation->ptr(), workspace_size_,
main_stream);
cudaStreamSynchronize(main_stream);
int *host_result = static_cast<int *>(host_result_allocation->ptr());
for (int i = 0; i < data_num_; ++i) {
EXPECT_EQ(host_result[i], replay_times);
}
data_allocation.reset();
result_allocation.reset();
cuda_graph.release();
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(other_stream));
}
void CheckResult() {
for (size_t i = 0; i < stream_num_; ++i) {
Copy(host_results_[i]->place(), host_results_[i]->ptr(),
results_[i]->place(), results_[i]->ptr(), workspace_size_,
streams_[i]);
}
cudaDeviceSynchronize();
size_t thread_num = grid_num_ * block_num_;
for (size_t i = 0; i < stream_num_; ++i) {
int *result = static_cast<int *>(host_results_[i]->ptr());
for (size_t j = 0; j < data_num_; ++j) {
EXPECT_EQ(result[j], 2);
}
}
}
void TearDown() override {
workspaces_.clear();
results_.clear();
host_results_.clear();
for (gpuStream_t stream : streams_) {
Release(place_, stream);
}
for (size_t i = 0; i < stream_num_; ++i) {
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(streams_[i]));
#else
PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(streams_[i]));
#endif
}
// Memory release for CUDA Graph memory pool is forbidden
if (!testing_cuda_graph_) {
CheckMemLeak(place_);
}
}
bool testing_cuda_graph_{0};
size_t stream_num_;
size_t grid_num_;
size_t block_num_;
size_t data_num_;
size_t workspace_size_;
platform::CUDAPlace place_;
std::vector<gpuStream_t> streams_;
std::vector<std::shared_ptr<phi::Allocation>> workspaces_;
std::vector<std::shared_ptr<phi::Allocation>> results_;
std::vector<std::shared_ptr<phi::Allocation>> host_results_;
};
TEST_F(StreamSafeCUDAAllocTest, CUDAMutilStreamTest) {
MultiStreamRun();
CheckResult();
}
TEST_F(StreamSafeCUDAAllocTest, CUDAMutilThreadMutilStreamTest) {
MultiThreadMultiStreamRun();
CheckResult();
}
#ifdef PADDLE_WITH_CUDA
TEST_F(StreamSafeCUDAAllocTest, CUDAGraphTest) {
MultiStreamRun();
CUDAGraphRun();
CheckResult();
}
#endif
}  // namespace memory
}  // namespace paddle