Unverified commit 4c0511fa authored by From00, committed by GitHub

Support cuda graph in StreamSafeCudaAllocator (#39594)

* Support cuda graph in StreamSafeCudaAllocator

* Fix CI error

* Arrange AllocatorFacade

* Fix CI error

* Fix CI error

* Fix ROCM Compile error

* Fix ROCM Compile error
Parent 97ccaa79
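Before the diff, a minimal usage sketch (not part of the commit) of how the CUDA-Graph-aware StreamSafeCudaAllocator is driven, distilled from the CUDAGraphRun test added at the bottom of this change. The helper function name is hypothetical; the APIs (AllocShared, RecordStream, BeginCUDAGraphCapture, EndCUDAGraphCapture, Replay) and headers are the ones that appear in this diff.

```cpp
#include <cuda_runtime.h>

#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"

// Hypothetical driver, CUDA-only for brevity.
void CaptureAndReplaySketch(const paddle::platform::CUDAPlace &place,
                            size_t size) {
  namespace memory = paddle::memory;
  namespace platform = paddle::platform;

  // Begin capture: AllocatorFacade routes subsequent allocations to a
  // dedicated memory pool for this graph (see GetPrivate in the diff).
  platform::BeginCUDAGraphCapture(place, cudaStreamCaptureModeGlobal);

  std::shared_ptr<phi::Allocation> buf = memory::AllocShared(place, size);

  // Cross-stream use during capture is only remembered; the events are
  // recorded once capture has ended (RecordGraphCapturingStreams).
  gpuStream_t other_stream;
  cudaStreamCreate(&other_stream);
  memory::RecordStream(buf, other_stream);

  std::unique_ptr<platform::CUDAGraph> graph = platform::EndCUDAGraphCapture();
  graph->Replay();  // the captured allocations are reused on every replay

  cudaStreamDestroy(other_stream);
}
```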
......@@ -17,7 +17,7 @@ if (WITH_GPU)
nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
nv_library(cuda_managed_allocator SRCS cuda_managed_allocator.cc DEPS allocator cuda_device_guard gpu_info)
nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
nv_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator)
nv_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator cuda_graph)
nv_library(thread_local_allocator SRCS thread_local_allocator.cc DEPS allocator)
cc_test(thread_local_allocator_test SRCS thread_local_allocator_test.cc DEPS thread_local_allocator)
......
......@@ -210,12 +210,7 @@ class AllocatorFacadePrivate {
InitNaiveBestFitCPUAllocator();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
allow_free_idle_chunk_ = allow_free_idle_chunk;
if (FLAGS_use_stream_safe_cuda_allocator) {
for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount();
++dev_id) {
InitStreamSafeCUDAAllocator(platform::CUDAPlace(dev_id), nullptr);
}
} else {
if (!FLAGS_use_stream_safe_cuda_allocator) {
for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount();
++dev_id) {
InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id),
......@@ -298,6 +293,12 @@ class AllocatorFacadePrivate {
}
CheckAllocThreadSafe();
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
WrapCUDAGraphAllocator();
}
#endif
}
inline const std::shared_ptr<Allocator>& GetAllocator(
......@@ -388,39 +389,6 @@ class AllocatorFacadePrivate {
allocation.get()));
return stream_safe_cuda_allocation->GetOwningStream();
}
#ifdef PADDLE_WITH_CUDA
void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) {
PADDLE_ENFORCE_EQ(strategy_, AllocatorStrategy::kAutoGrowth,
platform::errors::InvalidArgument(
"CUDA Graph is only supported when the "
"FLAGS_allocator_strategy=\"auto_growth\", but got "
"FLAGS_allocator_strategy=\"%s\"",
FLAGS_allocator_strategy));
auto& allocator = cuda_graph_allocator_map_[id];
PADDLE_ENFORCE_EQ(
allocator.get(), nullptr,
platform::errors::InvalidArgument(
"The memory pool of the CUDA Graph with ID %d have been prepared.",
id));
allocator.reset(
new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false));
for (auto& item : allocator->allocators_) {
auto& old_allocator = item.second;
old_allocator = CUDAGraphAllocator::Create(old_allocator);
}
VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id;
}
void RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) {
auto iter = cuda_graph_allocator_map_.find(id);
PADDLE_ENFORCE_NE(iter, cuda_graph_allocator_map_.end(),
platform::errors::InvalidArgument(
"Cannot find CUDA Graph with ID = %d", id));
cuda_graph_allocator_map_.erase(iter);
VLOG(10) << "Remove memory pool of CUDA Graph with ID " << id;
}
#endif
#endif
private:
......@@ -439,24 +407,7 @@ class AllocatorFacadePrivate {
platform::Place place_;
};
const AllocatorMap& GetAllocatorMap() {
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
auto id = platform::CUDAGraph::CapturingID();
auto iter = cuda_graph_allocator_map_.find(id);
PADDLE_ENFORCE_NE(
iter, cuda_graph_allocator_map_.end(),
platform::errors::PermissionDenied(
"No memory pool is prepared for CUDA Graph capturing."));
VLOG(10) << "Choose CUDA Graph memory pool to allocate memory";
return iter->second->allocators_;
} else {
return allocators_;
}
#else
return allocators_;
#endif
}
const AllocatorMap& GetAllocatorMap() { return allocators_; }
void InitNaiveBestFitCPUAllocator() {
allocators_[platform::CPUPlace()] =
......@@ -672,10 +623,10 @@ class AllocatorFacadePrivate {
}
void WrapStreamSafeCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) {
const std::shared_ptr<Allocator>& underlying_allocator =
cuda_allocators_[p][stream];
cuda_allocators_[p][stream] = std::make_shared<StreamSafeCUDAAllocator>(
underlying_allocator, p, stream);
std::shared_ptr<Allocator>& allocator = cuda_allocators_[p][stream];
allocator = std::make_shared<StreamSafeCUDAAllocator>(
allocator, p, stream,
/* in_cuda_graph_capturing = */ !allow_free_idle_chunk_);
}
void WrapCUDARetryAllocator(platform::CUDAPlace p, gpuStream_t stream,
......@@ -684,10 +635,19 @@ class AllocatorFacadePrivate {
retry_time, 0,
platform::errors::InvalidArgument(
"Retry time should be larger than 0, but got %d", retry_time));
std::shared_ptr<Allocator> allocator = cuda_allocators_[p][stream];
std::shared_ptr<Allocator>& allocator = cuda_allocators_[p][stream];
allocator = std::make_shared<RetryAllocator>(allocator, retry_time);
}
#ifdef PADDLE_WITH_CUDA
void WrapCUDAGraphAllocator() {
for (auto& item : allocators_) {
auto& allocator = item.second;
allocator = CUDAGraphAllocator::Create(allocator);
}
}
#endif
static void CheckCUDAAllocThreadSafe(const CUDAAllocatorMap& allocators) {
for (auto& place_pair : allocators) {
for (auto& stream_pair : place_pair.second) {
......@@ -864,10 +824,6 @@ class AllocatorFacadePrivate {
// a standalone CUDA allocator to support multi-stream GC in new executor
CUDAAllocatorMap cuda_allocators_;
std::shared_timed_mutex cuda_allocator_mutex_;
#ifdef PADDLE_WITH_CUDA
std::unordered_map<CUDAGraphID, std::unique_ptr<AllocatorFacadePrivate>>
cuda_graph_allocator_map_;
#endif
#endif
AllocatorStrategy strategy_;
AllocatorMap allocators_;
......@@ -886,8 +842,24 @@ AllocatorFacade::AllocatorFacade() : m_(new AllocatorFacadePrivate()) {}
AllocatorFacade::~AllocatorFacade() {}
AllocatorFacade& AllocatorFacade::Instance() {
static AllocatorFacade instance;
return instance;
static AllocatorFacade* instance = new AllocatorFacade;
return *instance;
}
AllocatorFacadePrivate* AllocatorFacade::GetPrivate() const {
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
auto id = platform::CUDAGraph::CapturingID();
auto iter = cuda_graph_map_.find(id);
PADDLE_ENFORCE_NE(
iter, cuda_graph_map_.end(),
platform::errors::PermissionDenied(
"No memory pool is prepared for CUDA Graph capturing."));
VLOG(10) << "Choose CUDA Graph memory pool";
return iter->second.get();
}
#endif
return m_;
}
const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
......@@ -895,19 +867,14 @@ const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
FLAGS_use_system_allocator == false) {
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
return m_->GetAllocator(place,
/* A non-zero num to choose allocator_ */ 1);
}
#endif
AllocatorFacadePrivate* m = GetPrivate();
platform::CUDAPlace cuda_place(place.GetDeviceId());
return m_->GetAllocator(cuda_place, m_->GetDefaultStream(cuda_place));
return m->GetAllocator(cuda_place, m->GetDefaultStream(cuda_place));
}
#endif
return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
return GetPrivate()->GetAllocator(
place, /* A non-zero num to choose allocator_ */ 1);
}
void* AllocatorFacade::GetBasePtr(
......@@ -922,7 +889,7 @@ void* AllocatorFacade::GetBasePtr(
"GetBasePtr() is only implemented for CUDAPlace(), not "
"suppot place: %s",
allocation->place()));
return m_->GetBasePtr(allocation);
return GetPrivate()->GetBasePtr(allocation);
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
......@@ -930,21 +897,17 @@ const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
const platform::Place& place, const gpuStream_t& stream) {
if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
FLAGS_use_system_allocator == false) {
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
return m_->GetAllocator(place,
/* A non-zero num to choose allocator_ */ 1);
}
#endif
return m_->GetAllocator(place, stream, /*create_if_not_found=*/true);
return GetPrivate()->GetAllocator(place, stream,
/*create_if_not_found=*/true);
}
return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
return GetPrivate()->GetAllocator(
place, /* A non-zero num to choose allocator_ */ 1);
}
#endif
const std::shared_ptr<Allocator>& AllocatorFacade::GetZeroAllocator(
const platform::Place& place) {
return m_->GetAllocator(place, /* zero size */ 0);
return GetPrivate()->GetAllocator(place, /* zero size */ 0);
}
std::shared_ptr<phi::Allocation> AllocatorFacade::AllocShared(
......@@ -957,43 +920,30 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
size > 0 && FLAGS_use_system_allocator == false) {
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
return m_->GetAllocator(place, size)->Allocate(size);
}
#endif
platform::CUDAPlace cuda_place(place.GetDeviceId());
return Alloc(cuda_place, size, m_->GetDefaultStream(cuda_place));
phi::Stream default_stream = phi::Stream(reinterpret_cast<phi::StreamId>(
GetPrivate()->GetDefaultStream(cuda_place)));
return Alloc(cuda_place, size, default_stream);
}
#endif
return m_->GetAllocator(place, size)->Allocate(size);
return GetPrivate()->GetAllocator(place, size)->Allocate(size);
}
uint64_t AllocatorFacade::Release(const platform::Place& place) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
FLAGS_use_system_allocator == false) {
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
return m_
->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1)
->Release(place);
}
#endif
platform::CUDAPlace cuda_place(place.GetDeviceId());
return Release(cuda_place, m_->GetDefaultStream(cuda_place));
return Release(cuda_place, GetPrivate()->GetDefaultStream(cuda_place));
}
#endif
return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1)
return GetPrivate()
->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1)
->Release(place);
}
std::shared_ptr<phi::Allocation> AllocatorFacade::AllocShared(
const platform::Place& place, size_t size, const phi::Stream& stream) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PADDLE_ENFORCE_EQ(
FLAGS_use_stream_safe_cuda_allocator, true,
platform::errors::Unimplemented(
......@@ -1001,71 +951,53 @@ std::shared_ptr<phi::Allocation> AllocatorFacade::AllocShared(
"multi-stream 'AllocaShared' function. To enable it, you can enter"
"'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
"terminal."));
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
PADDLE_THROW(platform::errors::Unavailable(
"Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
}
#endif
gpuStream_t s = reinterpret_cast<gpuStream_t>(stream.id());
return std::shared_ptr<phi::Allocation>(Alloc(place, size, s));
#else
PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU."));
#endif
return std::shared_ptr<phi::Allocation>(Alloc(place, size, stream));
}
bool AllocatorFacade::InSameStream(
const std::shared_ptr<phi::Allocation>& allocation,
const phi::Stream& stream) {
AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size,
const phi::Stream& stream) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PADDLE_ENFORCE_EQ(
FLAGS_use_stream_safe_cuda_allocator, true,
platform::errors::Unimplemented(
"StreamSafeCUDAAllocator is disabled, you should not call this "
"multi-stream 'InSameStream' function. To enable it, you can enter"
"multi-stream 'Alloc' function. To enable it, you can enter"
"'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
"terminal."));
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
PADDLE_THROW(platform::errors::Unavailable(
"Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
platform::CUDAPlace p(place.GetDeviceId());
if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) {
gpuStream_t s = reinterpret_cast<gpuStream_t>(stream.id());
return GetPrivate()
->GetAllocator(p, s, /* create_if_not_found = */ true)
->Allocate(size);
} else {
return GetPrivate()->GetAllocator(p, size)->Allocate(size);
}
#endif
gpuStream_t s = reinterpret_cast<gpuStream_t>(stream.id());
return s == GetStream(allocation);
#else
PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU."));
#endif
}
bool AllocatorFacade::InSameStream(
const std::shared_ptr<phi::Allocation>& allocation,
const phi::Stream& stream) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size,
const gpuStream_t& stream) {
PADDLE_ENFORCE_EQ(
FLAGS_use_stream_safe_cuda_allocator, true,
platform::errors::Unimplemented(
"StreamSafeCUDAAllocator is disabled, you should not call this "
"multi-stream 'Alloc' function. To enable it, you can enter"
"multi-stream 'InSameStream' function. To enable it, you can enter"
"'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
"terminal."));
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
PADDLE_THROW(platform::errors::Unavailable(
"Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
}
gpuStream_t s = reinterpret_cast<gpuStream_t>(stream.id());
return s == GetStream(allocation);
#else
PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU."));
#endif
platform::CUDAPlace p(place.GetDeviceId());
if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) {
return m_->GetAllocator(p, stream, /* create_if_not_found = */ true)
->Allocate(size);
} else {
return m_->GetAllocator(p, size)->Allocate(size);
}
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place,
const gpuStream_t& stream) {
PADDLE_ENFORCE_EQ(
......@@ -1075,15 +1007,7 @@ uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place,
"multi-stream 'Release' function. To enable it, you can enter"
"'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
"terminal."));
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
PADDLE_THROW(platform::errors::Unavailable(
"Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
}
#endif
return m_->GetAllocator(place, stream)->Release(place);
return GetPrivate()->GetAllocator(place, stream)->Release(place);
}
void AllocatorFacade::RecordStream(std::shared_ptr<phi::Allocation> allocation,
......@@ -1095,15 +1019,7 @@ void AllocatorFacade::RecordStream(std::shared_ptr<phi::Allocation> allocation,
"'RecordStream' function. To enable it, you can enter"
"'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
"terminal."));
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
PADDLE_THROW(platform::errors::Unavailable(
"Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
}
#endif
m_->RecordStream(allocation, stream);
GetPrivate()->RecordStream(allocation, stream);
}
const gpuStream_t& AllocatorFacade::GetStream(
......@@ -1115,24 +1031,34 @@ const gpuStream_t& AllocatorFacade::GetStream(
"'GetStream' function. To enable it, you can enter"
"'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
"terminal."));
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
PADDLE_THROW(platform::errors::Unavailable(
"Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
}
#endif
return m_->GetStream(allocation);
return GetPrivate()->GetStream(allocation);
}
#ifdef PADDLE_WITH_CUDA
void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) {
return m_->PrepareMemoryPoolForCUDAGraph(id);
PADDLE_ENFORCE_EQ(GetAllocatorStrategy(), AllocatorStrategy::kAutoGrowth,
platform::errors::InvalidArgument(
"CUDA Graph is only supported when the "
"FLAGS_allocator_strategy=\"auto_growth\", but got "
"FLAGS_allocator_strategy=\"%s\"",
FLAGS_allocator_strategy));
auto& allocator = cuda_graph_map_[id];
PADDLE_ENFORCE_EQ(
allocator.get(), nullptr,
platform::errors::InvalidArgument(
"The memory pool of the CUDA Graph with ID %d have been prepared.",
id));
allocator.reset(new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false));
VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id;
}
void AllocatorFacade::RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) {
return m_->RemoveMemoryPoolOfCUDAGraph(id);
auto iter = cuda_graph_map_.find(id);
PADDLE_ENFORCE_NE(iter, cuda_graph_map_.end(),
platform::errors::InvalidArgument(
"Cannot find CUDA Graph with ID = %d", id));
cuda_graph_map_.erase(iter);
VLOG(10) << "Remove memory pool of CUDA Graph with ID " << id;
}
#endif
#endif
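The facade above keeps one private memory pool per captured CUDA Graph (cuda_graph_map_) and lets GetPrivate() pick between the default pool and the pool of the graph currently capturing. A standalone sketch of that pattern, with hypothetical names, illustration only and not Paddle code:

```cpp
#include <cstdint>
#include <memory>
#include <stdexcept>
#include <unordered_map>

using GraphID = uint64_t;

struct MemoryPool { /* stands in for AllocatorFacadePrivate */ };

class FacadeSketch {
 public:
  void PrepareMemoryPool(GraphID id) {
    auto& pool = graph_pools_[id];
    if (pool != nullptr) throw std::invalid_argument("pool already prepared");
    pool.reset(new MemoryPool());
  }

  void RemoveMemoryPool(GraphID id) { graph_pools_.erase(id); }

  // Mirrors GetPrivate(): route to the capturing graph's pool if any.
  MemoryPool* GetPool(bool capturing, GraphID id) {
    if (capturing) {
      auto iter = graph_pools_.find(id);
      if (iter == graph_pools_.end())
        throw std::runtime_error("no pool prepared for this capture");
      return iter->second.get();
    }
    return &default_pool_;
  }

 private:
  MemoryPool default_pool_;
  std::unordered_map<GraphID, std::unique_ptr<MemoryPool>> graph_pools_;
};
```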
......
......@@ -49,6 +49,8 @@ class AllocatorFacade {
static AllocatorFacade& Instance();
AllocatorFacadePrivate* GetPrivate() const;
const std::shared_ptr<Allocator>& GetAllocator(const platform::Place& place);
void* GetBasePtr(const std::shared_ptr<Allocation>& allocation);
......@@ -73,13 +75,14 @@ class AllocatorFacade {
size_t size,
const phi::Stream& stream);
AllocationPtr Alloc(const platform::Place& place, size_t size,
const phi::Stream& stream);
bool InSameStream(const std::shared_ptr<Allocation>& allocation,
const phi::Stream& stream);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// TODO(zhiqiu): change gpuStream_t to phi::Stream if needed.
AllocationPtr Alloc(const platform::Place& place, size_t size,
const gpuStream_t& stream);
uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream);
void RecordStream(std::shared_ptr<Allocation> allocation,
const gpuStream_t& stream);
......@@ -96,6 +99,10 @@ class AllocatorFacade {
private:
AllocatorFacade();
AllocatorFacadePrivate* m_;
#ifdef PADDLE_WITH_CUDA
std::unordered_map<CUDAGraphID, std::unique_ptr<AllocatorFacadePrivate>>
cuda_graph_map_;
#endif
};
} // namespace allocation
......
......@@ -15,56 +15,52 @@
#include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h"
#endif
namespace paddle {
namespace memory {
namespace allocation {
StreamSafeCUDAAllocation::StreamSafeCUDAAllocation(
DecoratedAllocationPtr underlying_allocation, gpuStream_t owning_stream)
DecoratedAllocationPtr underlying_allocation, gpuStream_t owning_stream,
StreamSafeCUDAAllocator* allocator)
: Allocation(underlying_allocation->ptr(),
underlying_allocation->base_ptr(),
underlying_allocation->size(), underlying_allocation->place()),
underlying_allocation_(std::move(underlying_allocation)),
owning_stream_(std::move(owning_stream)) {}
owning_stream_(std::move(owning_stream)),
allocator_(allocator->shared_from_this()) {}
void StreamSafeCUDAAllocation::RecordStream(const gpuStream_t& stream) {
VLOG(8) << "Try record stream " << stream << " for address " << ptr();
if (stream == owning_stream_) {
VLOG(9) << "Record the same stream of " << stream;
return;
}
std::lock_guard<SpinLock> lock_guard(outstanding_event_map_lock_);
gpuEvent_t record_event;
auto it = outstanding_event_map_.find(stream);
if (it == outstanding_event_map_.end()) {
gpuEvent_t new_event;
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_GPU_SUCCESS(
cudaEventCreateWithFlags(&new_event, cudaEventDisableTiming));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
hipEventCreateWithFlags(&new_event, hipEventDisableTiming));
#endif
outstanding_event_map_[stream] = new_event;
record_event = new_event;
VLOG(9) << "Create a new event " << new_event;
} else {
record_event = it->second;
VLOG(9) << "Reuse event " << record_event;
if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
graph_capturing_stream_set_.insert(stream);
return;
}
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(record_event, stream));
#else
PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(record_event, stream));
#endif
VLOG(8) << "Record event " << record_event << " to stream " << stream;
RecordStreamWithNoGraphCapturing(stream);
RecordGraphCapturingStreams();
}
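While a thread is capturing, RecordStream only remembers the stream in graph_capturing_stream_set_; the real event recording happens later, from RecordGraphCapturingStreams, once the thread is no longer capturing (or from CanBeFreed below). A toy sketch of that deferral pattern, illustration only with made-up names:

```cpp
#include <map>
#include <set>

class DeferredRecorder {
 public:
  void RecordStream(int stream, bool capturing) {
    if (capturing) {
      pending_streams_.insert(stream);  // remember only, no event yet
      return;
    }
    RecordNow(stream);
    FlushPending();  // also record events for streams seen during capture
  }

  void FlushPending() {
    for (int s : pending_streams_) RecordNow(s);
    pending_streams_.clear();
  }

 private:
  void RecordNow(int stream) { ++outstanding_events_[stream]; }

  std::set<int> pending_streams_;          // graph_capturing_stream_set_
  std::map<int, int> outstanding_events_;  // outstanding_event_map_
};
```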
bool StreamSafeCUDAAllocation::CanBeFreed() {
// NOTE(Ruibiao): This function will not execute concurrently,
// so outstanding_event_map_lock_ is not required here
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
return graph_capturing_stream_set_.empty() &&
outstanding_event_map_.empty();
}
#endif
RecordGraphCapturingStreams();
for (auto it = outstanding_event_map_.begin();
it != outstanding_event_map_.end(); ++it) {
gpuEvent_t& event = it->second;
......@@ -98,21 +94,62 @@ const gpuStream_t& StreamSafeCUDAAllocation::GetOwningStream() const {
return owning_stream_;
}
void StreamSafeCUDAAllocation::RecordGraphCapturingStreams() {
for (gpuStream_t stream : graph_capturing_stream_set_) {
RecordStreamWithNoGraphCapturing(stream);
}
graph_capturing_stream_set_.clear();
}
void StreamSafeCUDAAllocation::RecordStreamWithNoGraphCapturing(
const gpuStream_t& stream) {
gpuEvent_t record_event;
auto it = outstanding_event_map_.find(stream);
if (it == outstanding_event_map_.end()) {
gpuEvent_t new_event;
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_GPU_SUCCESS(
cudaEventCreateWithFlags(&new_event, cudaEventDisableTiming));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
hipEventCreateWithFlags(&new_event, hipEventDisableTiming));
#endif
outstanding_event_map_[stream] = new_event;
record_event = new_event;
VLOG(9) << "Create a new event " << new_event;
} else {
record_event = it->second;
VLOG(9) << "Reuse event " << record_event;
}
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(record_event, stream));
#else
PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(record_event, stream));
#endif
VLOG(8) << "Record event " << record_event << " to stream " << stream;
}
StreamSafeCUDAAllocator::StreamSafeCUDAAllocator(
std::shared_ptr<Allocator> underlying_allocator, platform::CUDAPlace place,
gpuStream_t default_stream)
gpuStream_t default_stream, bool in_cuda_graph_capturing)
: underlying_allocator_(std::move(underlying_allocator)),
place_(std::move(place)),
default_stream_(std::move(default_stream)) {
std::lock_guard<SpinLock> lock_guard(allocator_map_lock_);
allocator_map_[place].emplace_back(this);
default_stream_(std::move(default_stream)),
in_cuda_graph_capturing_(in_cuda_graph_capturing) {
if (LIKELY(!in_cuda_graph_capturing)) {
std::lock_guard<SpinLock> lock_guard(allocator_map_lock_);
allocator_map_[place].emplace_back(this);
}
}
StreamSafeCUDAAllocator::~StreamSafeCUDAAllocator() {
std::lock_guard<SpinLock> lock_guard(allocator_map_lock_);
std::vector<StreamSafeCUDAAllocator*>& allocators = allocator_map_[place_];
allocators.erase(std::remove(allocators.begin(), allocators.end(), this),
allocators.end());
if (LIKELY(!in_cuda_graph_capturing_)) {
std::lock_guard<SpinLock> lock_guard(allocator_map_lock_);
std::vector<StreamSafeCUDAAllocator*>& allocators = allocator_map_[place_];
allocators.erase(std::remove(allocators.begin(), allocators.end(), this),
allocators.end());
}
}
bool StreamSafeCUDAAllocator::IsAllocThreadSafe() const { return true; }
......@@ -140,7 +177,7 @@ phi::Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) {
}
StreamSafeCUDAAllocation* allocation = new StreamSafeCUDAAllocation(
static_unique_ptr_cast<Allocation>(std::move(underlying_allocation)),
default_stream_);
default_stream_, this);
VLOG(8) << "Allocate " << allocation->size() << " bytes at address "
<< allocation->ptr();
return allocation;
......@@ -157,22 +194,27 @@ void StreamSafeCUDAAllocator::FreeImpl(phi::Allocation* allocation) {
"StreamSafeCUDAAllocation*",
allocation));
VLOG(8) << "Try free allocation " << stream_safe_cuda_allocation->ptr();
std::lock_guard<SpinLock> lock_guard(unfreed_allocation_lock_);
if (stream_safe_cuda_allocation->CanBeFreed()) {
VLOG(9) << "Directly delete allocation";
delete stream_safe_cuda_allocation;
} else {
VLOG(9) << "Put into unfreed_allocation list";
std::lock_guard<SpinLock> lock_guard(unfreed_allocation_lock_);
unfreed_allocations_.emplace_back(stream_safe_cuda_allocation);
}
}
uint64_t StreamSafeCUDAAllocator::ReleaseImpl(const platform::Place& place) {
if (UNLIKELY(in_cuda_graph_capturing_)) {
VLOG(7) << "Memory release forbidden in CUDA Graph Captruing";
return 0;
}
std::lock_guard<SpinLock> lock_guard(allocator_map_lock_);
std::vector<StreamSafeCUDAAllocator*>& allocators = allocator_map_[place];
uint64_t released_size = 0;
for (StreamSafeCUDAAllocator* allocator : allocators) {
released_size += allocator->ProcessUnfreedAllocationsWithRelease();
released_size += allocator->ProcessUnfreedAllocationsAndRelease();
}
VLOG(8) << "Release " << released_size << " bytes memory from all streams";
return released_size;
......@@ -191,7 +233,7 @@ void StreamSafeCUDAAllocator::ProcessUnfreedAllocations() {
}
}
uint64_t StreamSafeCUDAAllocator::ProcessUnfreedAllocationsWithRelease() {
uint64_t StreamSafeCUDAAllocator::ProcessUnfreedAllocationsAndRelease() {
ProcessUnfreedAllocations();
return underlying_allocator_->Release(place_);
}
......
......@@ -14,10 +14,9 @@
#pragma once
#include <deque>
#include <list>
#include <map>
#include <mutex>
#include <set>
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/memory/allocation/spin_lock.h"
#include "paddle/fluid/platform/place.h"
......@@ -32,27 +31,38 @@ namespace paddle {
namespace memory {
namespace allocation {
class StreamSafeCUDAAllocator;
class StreamSafeCUDAAllocation : public Allocation {
public:
StreamSafeCUDAAllocation(DecoratedAllocationPtr underlying_allocation,
gpuStream_t owning_stream);
gpuStream_t owning_stream,
StreamSafeCUDAAllocator *allocator);
void RecordStream(const gpuStream_t &stream);
bool CanBeFreed();
const gpuStream_t &GetOwningStream() const;
private:
void RecordGraphCapturingStreams();
void RecordStreamWithNoGraphCapturing(const gpuStream_t &stream);
DecoratedAllocationPtr underlying_allocation_;
std::set<gpuStream_t> graph_capturing_stream_set_;
std::map<gpuStream_t, gpuEvent_t> outstanding_event_map_;
gpuStream_t owning_stream_;
SpinLock outstanding_event_map_lock_;
// To be compatible with CUDA Graph, hold a shared_ptr to the allocator so
// that the Allocator will not be destructed before its Allocations
std::shared_ptr<Allocator> allocator_;
};
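The comment above describes a lifetime pattern: the allocator derives from std::enable_shared_from_this and every allocation pins it with shared_from_this(), so the allocator cannot be destroyed while any of its allocations is still alive. A minimal standalone sketch of that pattern (toy names, illustration only):

```cpp
#include <memory>

class ToyAllocator;

class ToyAllocation {
 public:
  explicit ToyAllocation(std::shared_ptr<ToyAllocator> allocator)
      : allocator_(std::move(allocator)) {}

 private:
  std::shared_ptr<ToyAllocator> allocator_;  // keeps the allocator alive
};

class ToyAllocator : public std::enable_shared_from_this<ToyAllocator> {
 public:
  std::unique_ptr<ToyAllocation> Allocate() {
    // shared_from_this() requires that this allocator is itself owned by a
    // std::shared_ptr, as StreamSafeCUDAAllocator is in AllocatorFacade.
    return std::make_unique<ToyAllocation>(shared_from_this());
  }
};
```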
class StreamSafeCUDAAllocator : public Allocator {
class StreamSafeCUDAAllocator
: public Allocator,
public std::enable_shared_from_this<StreamSafeCUDAAllocator> {
public:
StreamSafeCUDAAllocator(std::shared_ptr<Allocator> underlying_allocator,
platform::CUDAPlace place,
gpuStream_t default_stream);
platform::CUDAPlace place, gpuStream_t default_stream,
bool in_cuda_graph_capturing = false);
~StreamSafeCUDAAllocator();
bool IsAllocThreadSafe() const override;
......@@ -63,7 +73,7 @@ class StreamSafeCUDAAllocator : public Allocator {
private:
void ProcessUnfreedAllocations();
uint64_t ProcessUnfreedAllocationsWithRelease();
uint64_t ProcessUnfreedAllocationsAndRelease();
static std::map<platform::Place, std::vector<StreamSafeCUDAAllocator *>>
allocator_map_;
......@@ -74,6 +84,8 @@ class StreamSafeCUDAAllocator : public Allocator {
gpuStream_t default_stream_;
std::list<StreamSafeCUDAAllocation *> unfreed_allocations_;
SpinLock unfreed_allocation_lock_;
bool in_cuda_graph_capturing_;
};
} // namespace allocation
......
......@@ -41,6 +41,11 @@ std::shared_ptr<Allocation> AllocShared(const platform::Place& place,
stream);
}
AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size,
const phi::Stream& stream) {
return allocation::AllocatorFacade::Instance().Alloc(place, size, stream);
}
bool InSameStream(const std::shared_ptr<Allocation>& allocation,
const phi::Stream& stream) {
return allocation::AllocatorFacade::Instance().InSameStream(allocation,
......@@ -52,11 +57,6 @@ void* GetBasePtr(const std::shared_ptr<Allocation>& allocation) {
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size,
const gpuStream_t& stream) {
return allocation::AllocatorFacade::Instance().Alloc(place, size, stream);
}
uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream) {
return allocation::AllocatorFacade::Instance().Release(place, stream);
}
......
......@@ -41,15 +41,15 @@ extern std::shared_ptr<Allocation> AllocShared(const platform::Place& place,
size_t size,
const phi::Stream& stream);
extern AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size,
const phi::Stream& stream);
extern bool InSameStream(const std::shared_ptr<Allocation>& allocation,
const phi::Stream& stream);
extern void* GetBasePtr(const std::shared_ptr<Allocation>& allocation);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
extern AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size,
const gpuStream_t& stream);
extern uint64_t Release(const platform::CUDAPlace& place,
const gpuStream_t& stream);
......
......@@ -12,34 +12,35 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
#include <cuda_runtime.h>
#endif
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#endif
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/core/stream.h"
#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
#include <cuda_runtime.h>
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#endif
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#endif
namespace paddle {
namespace memory {
__global__ void add_kernel(int *x, int n) {
// y += (x + 1)
__global__ void add_kernel(int *x, int *y, int n) {
int thread_num = gridDim.x * blockDim.x;
int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
for (int i = thread_id; i < n; i += thread_num) {
atomicAdd(x + i, thread_id);
y[i] += x[i] + 1;
}
}
......@@ -51,153 +52,6 @@ void CheckMemLeak(const platform::CUDAPlace &place) {
<< " there may be a memory leak problem";
}
class StreamSafeCUDAAllocTest : public ::testing::Test {
protected:
void SetUp() override {
place_ = platform::CUDAPlace();
stream_num_ = 64;
grid_num_ = 1;
block_num_ = 32;
data_num_ = 131072;
workspace_size_ = data_num_ * sizeof(int);
// alloc workspace for each stream
for (size_t i = 0; i < stream_num_; ++i) {
gpuStream_t stream;
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&stream));
#else
PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&stream));
#endif
std::shared_ptr<Allocation> allocation =
AllocShared(place_, workspace_size_,
phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_GPU_SUCCESS(
cudaMemset(allocation->ptr(), 0, allocation->size()));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
hipMemset(allocation->ptr(), 0, allocation->size()));
#endif
streams_.emplace_back(stream);
workspaces_.emplace_back(allocation);
}
result_ = Alloc(place_, stream_num_ * workspace_size_);
}
void SingleStreamRun(size_t idx) {
// for all stream i,
// stream idx lauch a kernel to add (j % thread_num) to workspaces_[i][j]
for (size_t i = 0; i < stream_num_; ++i) {
int *x = reinterpret_cast<int *>(workspaces_[i]->ptr());
add_kernel<<<grid_num_, block_num_, 0, streams_[idx]>>>(x, data_num_);
RecordStream(workspaces_[i], streams_[idx]);
}
}
void CopyResultAsync() {
for (size_t i = 0; i < stream_num_; ++i) {
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(
reinterpret_cast<int *>(result_->ptr()) + i * data_num_,
workspaces_[i]->ptr(), workspace_size_, cudaMemcpyDeviceToDevice));
#else
PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(
reinterpret_cast<int *>(result_->ptr()) + i * data_num_,
workspaces_[i]->ptr(), workspace_size_, hipMemcpyDeviceToDevice));
#endif
}
}
void MultiStreamRun() {
for (size_t i = 0; i < stream_num_; ++i) {
SingleStreamRun(i);
}
CopyResultAsync();
workspaces_.clear(); // fast_gc
cudaDeviceSynchronize();
}
void MultiThreadMUltiStreamRun() {
std::vector<std::thread> threads;
for (size_t i = 0; i < stream_num_; ++i) {
threads.push_back(
std::thread(&StreamSafeCUDAAllocTest::SingleStreamRun, this, i));
}
for (size_t i = 0; i < stream_num_; ++i) {
threads[i].join();
}
CopyResultAsync();
workspaces_.clear(); // fast_gc
cudaDeviceSynchronize();
}
void CheckResult() {
auto result_host = std::unique_ptr<int[]>(new int[result_->size()]);
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpy(result_host.get(), result_->ptr(),
result_->size(),
cudaMemcpyDeviceToHost));
#else
PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpy(result_host.get(), result_->ptr(),
result_->size(),
hipMemcpyDeviceToHost));
#endif
size_t thread_num = grid_num_ * block_num_;
for (size_t i = 0; i < stream_num_; ++i) {
for (size_t j = 0; j < data_num_; ++j) {
EXPECT_TRUE(result_host[i * stream_num_ + j] ==
(j % thread_num) * stream_num_);
}
}
result_.reset();
}
void TearDown() override {
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize());
#else
PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize());
#endif
for (gpuStream_t stream : streams_) {
Release(place_, stream);
}
for (size_t i = 1; i < stream_num_; ++i) {
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(streams_[i]));
#else
PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(streams_[i]));
#endif
}
CheckMemLeak(place_);
}
size_t stream_num_;
size_t grid_num_;
size_t block_num_;
size_t data_num_;
size_t workspace_size_;
platform::CUDAPlace place_;
std::vector<gpuStream_t> streams_;
std::vector<std::shared_ptr<Allocation>> workspaces_;
allocation::AllocationPtr result_;
};
TEST_F(StreamSafeCUDAAllocTest, CUDAMutilStreamTest) {
MultiStreamRun();
CheckResult();
}
TEST_F(StreamSafeCUDAAllocTest, CUDAMutilThreadMutilStreamTest) {
MultiThreadMUltiStreamRun();
CheckResult();
}
TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) {
platform::CUDAPlace place = platform::CUDAPlace();
size_t alloc_size = 256;
......@@ -214,7 +68,8 @@ TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) {
paddle::platform::DeviceContextPool::Instance().Get(place))
->stream();
allocation::AllocationPtr allocation_unique =
Alloc(place, alloc_size, default_stream);
Alloc(place, alloc_size,
phi::Stream(reinterpret_cast<phi::StreamId>(default_stream)));
EXPECT_GE(allocation_unique->size(), alloc_size);
EXPECT_EQ(allocation_unique->ptr(), address);
allocation_unique.reset();
......@@ -303,36 +158,6 @@ TEST(StreamSafeCUDAAllocInterfaceTest, GetStreamInterfaceTest) {
CheckMemLeak(place);
}
#ifdef PADDLE_WITH_CUDA
TEST(StreamSafeCUDAAllocInterfaceTest, CUDAGraphExceptionTest) {
platform::CUDAPlace place = platform::CUDAPlace();
size_t alloc_size = 1;
std::shared_ptr<Allocation> allocation = AllocShared(place, alloc_size);
platform::BeginCUDAGraphCapture(place, cudaStreamCaptureModeGlobal);
EXPECT_THROW(AllocShared(place, alloc_size), paddle::platform::EnforceNotMet);
EXPECT_THROW(Alloc(place, alloc_size), paddle::platform::EnforceNotMet);
EXPECT_THROW(Release(place), paddle::platform::EnforceNotMet);
EXPECT_THROW(allocation::AllocatorFacade::Instance().GetAllocator(place),
paddle::platform::EnforceNotMet);
EXPECT_THROW(
AllocShared(place, alloc_size,
phi::Stream(reinterpret_cast<phi::StreamId>(nullptr))),
paddle::platform::EnforceNotMet);
EXPECT_THROW(Alloc(place, alloc_size, nullptr),
paddle::platform::EnforceNotMet);
EXPECT_THROW(Release(place, nullptr), paddle::platform::EnforceNotMet);
EXPECT_THROW(RecordStream(allocation, nullptr),
paddle::platform::EnforceNotMet);
EXPECT_THROW(GetStream(allocation), paddle::platform::EnforceNotMet);
platform::EndCUDAGraphCapture();
allocation.reset();
Release(place);
CheckMemLeak(place);
}
#endif
TEST(StreamSafeCUDAAllocRetryTest, RetryTest) {
platform::CUDAPlace place = platform::CUDAPlace();
gpuStream_t stream1, stream2;
......@@ -348,12 +173,14 @@ TEST(StreamSafeCUDAAllocRetryTest, RetryTest) {
// so the second alloc will fail and retry
size_t alloc_size = available_size / 4 * 3;
allocation::AllocationPtr allocation1 = Alloc(place, alloc_size, stream1);
allocation::AllocationPtr allocation1 = Alloc(
place, alloc_size, phi::Stream(reinterpret_cast<phi::StreamId>(stream1)));
allocation::AllocationPtr allocation2;
std::thread th([&allocation2, &place, &stream2, alloc_size]() {
std::this_thread::sleep_for(std::chrono::seconds(1));
allocation2 = Alloc(place, alloc_size, stream2);
allocation2 = Alloc(place, alloc_size,
phi::Stream(reinterpret_cast<phi::StreamId>(stream2)));
});
allocation1.reset(); // free but not release
th.join();
......@@ -371,5 +198,201 @@ TEST(StreamSafeCUDAAllocRetryTest, RetryTest) {
CheckMemLeak(place);
}
class StreamSafeCUDAAllocTest : public ::testing::Test {
protected:
void SetUp() override {
place_ = platform::CUDAPlace();
stream_num_ = 64;
grid_num_ = 1;
block_num_ = 32;
data_num_ = 131072;
workspace_size_ = data_num_ * sizeof(int);
for (size_t i = 0; i < stream_num_; ++i) {
gpuStream_t stream;
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&stream));
#else
PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&stream));
#endif
std::shared_ptr<phi::Allocation> workspace_allocation =
AllocShared(place_, workspace_size_,
phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
std::shared_ptr<phi::Allocation> result_allocation =
AllocShared(place_, workspace_size_,
phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
std::shared_ptr<phi::Allocation> host_result_allocation =
AllocShared(platform::CPUPlace(), workspace_size_);
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_GPU_SUCCESS(cudaMemset(workspace_allocation->ptr(), 0,
workspace_allocation->size()));
PADDLE_ENFORCE_GPU_SUCCESS(
cudaMemset(result_allocation->ptr(), 0, result_allocation->size()));
#else
PADDLE_ENFORCE_GPU_SUCCESS(hipMemset(workspace_allocation->ptr(), 0,
workspace_allocation->size()));
PADDLE_ENFORCE_GPU_SUCCESS(
hipMemset(result_allocation->ptr(), 0, result_allocation->size()));
#endif
streams_.emplace_back(stream);
workspaces_.emplace_back(workspace_allocation);
results_.emplace_back(result_allocation);
host_results_.emplace_back(host_result_allocation);
}
}
void SingleStreamRun(size_t idx) {
int *y = reinterpret_cast<int *>(results_[idx]->ptr());
int neighbouring_idx = idx > 0 ? idx - 1 : idx;
add_kernel<<<grid_num_, block_num_, 0, streams_[idx]>>>(
reinterpret_cast<int *>(workspaces_[idx]->ptr()), y, data_num_);
add_kernel<<<grid_num_, block_num_, 0, streams_[idx]>>>(
reinterpret_cast<int *>(workspaces_[neighbouring_idx]->ptr()), y,
data_num_);
RecordStream(workspaces_[neighbouring_idx], streams_[idx]);
}
void MultiStreamRun() {
// Must run in reverse order, or workspaces_[i - 1] will be released
// before streams_[i]'s kernel launch (see the sketch after this fixture)
for (int i = stream_num_ - 1; i >= 0; --i) {
SingleStreamRun(i);
workspaces_[i].reset(); // fast GC
}
}
void MultiThreadMultiStreamRun() {
std::vector<std::thread> threads;
for (size_t i = 0; i < stream_num_; ++i) {
threads.push_back(
std::thread(&StreamSafeCUDAAllocTest::SingleStreamRun, this, i));
}
for (size_t i = 0; i < stream_num_; ++i) {
threads[i].join();
}
workspaces_.clear();
}
void CUDAGraphRun() {
testing_cuda_graph_ = true;
platform::BeginCUDAGraphCapture(platform::CUDAPlace(),
cudaStreamCaptureModeGlobal);
std::shared_ptr<Allocation> data_allocation =
AllocShared(platform::CUDAPlace(), workspace_size_);
std::shared_ptr<Allocation> result_allocation =
AllocShared(platform::CUDAPlace(), workspace_size_);
int *data = static_cast<int *>(data_allocation->ptr());
int *result = static_cast<int *>(result_allocation->ptr());
gpuStream_t main_stream = GetStream(data_allocation);
gpuStream_t other_stream;
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&other_stream));
add_kernel<<<grid_num_, block_num_, 0, main_stream>>>(data, result,
data_num_);
RecordStream(data_allocation, other_stream);
std::unique_ptr<platform::CUDAGraph> cuda_graph =
platform::EndCUDAGraphCapture();
int replay_times = 10;
for (int i = 0; i < replay_times; ++i) {
cuda_graph->Replay();
}
std::shared_ptr<Allocation> host_result_allocation =
AllocShared(platform::CPUPlace(), workspace_size_);
Copy(host_result_allocation->place(), host_result_allocation->ptr(),
result_allocation->place(), result_allocation->ptr(), workspace_size_,
main_stream);
cudaStreamSynchronize(main_stream);
int *host_result = static_cast<int *>(host_result_allocation->ptr());
for (int i = 0; i < data_num_; ++i) {
EXPECT_EQ(host_result[i], replay_times);
}
data_allocation.reset();
result_allocation.reset();
cuda_graph.release();
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(other_stream));
}
void CheckResult() {
for (size_t i = 0; i < stream_num_; ++i) {
Copy(host_results_[i]->place(), host_results_[i]->ptr(),
results_[i]->place(), results_[i]->ptr(), workspace_size_,
streams_[i]);
}
cudaDeviceSynchronize();
size_t thread_num = grid_num_ * block_num_;
for (size_t i = 0; i < stream_num_; ++i) {
int *result = static_cast<int *>(host_results_[i]->ptr());
for (size_t j = 0; j < data_num_; ++j) {
EXPECT_EQ(result[j], 2);
}
}
}
void TearDown() override {
workspaces_.clear();
results_.clear();
host_results_.clear();
for (gpuStream_t stream : streams_) {
Release(place_, stream);
}
for (size_t i = 0; i < stream_num_; ++i) {
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(streams_[i]));
#else
PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(streams_[i]));
#endif
}
// Memory release for CUDA Graph memory pool is forbidden
if (!testing_cuda_graph_) {
CheckMemLeak(place_);
}
}
bool testing_cuda_graph_{0};
size_t stream_num_;
size_t grid_num_;
size_t block_num_;
size_t data_num_;
size_t workspace_size_;
platform::CUDAPlace place_;
std::vector<gpuStream_t> streams_;
std::vector<std::shared_ptr<phi::Allocation>> workspaces_;
std::vector<std::shared_ptr<phi::Allocation>> results_;
std::vector<std::shared_ptr<phi::Allocation>> host_results_;
};
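As a companion to the reverse-order note in MultiStreamRun above, a minimal CUDA-only sketch (not part of the test; the helper name is hypothetical) of the RecordStream + fast-GC contract this fixture relies on: dropping the last reference to an allocation does not reuse the memory until the event recorded on the other stream completes.

```cpp
void RecordStreamFastGcSketch(const platform::CUDAPlace &place, size_t size) {
  gpuStream_t other_stream;
  PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&other_stream));

  // Allocate on the default stream of `place`.
  std::shared_ptr<phi::Allocation> buf = AllocShared(place, size);

  // Use the buffer on another stream and tell the allocator about it.
  add_kernel<<<1, 32, 0, other_stream>>>(
      static_cast<int *>(buf->ptr()), static_cast<int *>(buf->ptr()),
      static_cast<int>(size / sizeof(int)));
  RecordStream(buf, other_stream);

  // Fast GC: the StreamSafeCUDAAllocator defers the actual free until the
  // event recorded on other_stream has completed.
  buf.reset();

  PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(other_stream));
  Release(place, other_stream);
  PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(other_stream));
}
```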
TEST_F(StreamSafeCUDAAllocTest, CUDAMutilStreamTest) {
MultiStreamRun();
CheckResult();
}
TEST_F(StreamSafeCUDAAllocTest, CUDAMutilThreadMutilStreamTest) {
MultiThreadMultiStreamRun();
CheckResult();
}
#ifdef PADDLE_WITH_CUDA
TEST_F(StreamSafeCUDAAllocTest, CUDAGraphTest) {
MultiStreamRun();
CUDAGraphRun();
CheckResult();
}
#endif
} // namespace memory
} // namespace paddle