Unverified commit 0ad2e192, authored by From00, committed by GitHub

Make StreamSafeCUDAAllocator compatible with NaiveBestFit strategy (#40886)

* Make StreamSafeCUDAAllocator compatible with NaiveBestFit strategy

* Set FLAGS_use_stream_safe_cuda_allocator to false

* Update

* Remove unnecessary code

* Fix CI errors

* Add UT
Parent f6b6b057
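In essence, the commit replaces hard failures on FLAGS_use_stream_safe_cuda_allocator with a runtime query, AllocatorFacade::Instance().IsStreamSafeCUDAAllocatorUsed(), plus a graceful fallback to the single-stream path. A minimal caller-side sketch of the resulting pattern; the AllocOnStream wrapper below is hypothetical and only illustrates how the facade APIs touched by this commit compose:

// Hypothetical caller-side sketch (not part of the commit): how the new
// runtime query composes with the facade's Alloc overloads.
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/phi/core/stream.h"

namespace paddle {
namespace memory {

AllocationPtr AllocOnStream(const platform::CUDAPlace& place, size_t size,
                            const phi::Stream& stream) {
  auto& facade = allocation::AllocatorFacade::Instance();
  if (!facade.IsStreamSafeCUDAAllocatorUsed()) {
    // e.g. NaiveBestFit strategy: ignore the stream and fall back to the
    // ordinary single-stream allocation instead of raising an error.
    return facade.Alloc(place, size);
  }
  // Stream-safe path: the allocation is bound to the given stream.
  return facade.Alloc(place, size, stream);
}

}  // namespace memory
}  // namespace paddle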
@@ -32,7 +32,6 @@ PADDLE_DEFINE_EXPORTED_bool(new_executor_use_local_scope, true,
 DECLARE_bool(check_nan_inf);
 DECLARE_bool(benchmark);
 DECLARE_bool(fast_eager_deletion_mode);
-DECLARE_bool(use_stream_safe_cuda_allocator);
 
 constexpr const char* kExceptionCaught = "ExceptionCaught";
 constexpr const char* kTaskCompletion = "TaskCompletion";
@@ -44,7 +43,9 @@ static constexpr size_t kHostNumThreads = 4;
 static constexpr size_t kDeviceNumThreads = 1;
 
 bool IsInterpretercoreFastGCEnabled() {
-  return FLAGS_fast_eager_deletion_mode && FLAGS_use_stream_safe_cuda_allocator;
+  return memory::allocation::AllocatorFacade::Instance()
+             .IsStreamSafeCUDAAllocatorUsed() &&
+         FLAGS_fast_eager_deletion_mode;
 }
 
 InterpreterCore::InterpreterCore(const platform::Place& place,
......
@@ -145,6 +145,14 @@ class CUDAGraphAllocator
 };
 #endif
 
+static bool IsCUDAGraphCapturing() {
+#ifdef PADDLE_WITH_CUDA
+  return UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing());
+#else
+  return false;
+#endif
+}
+
 class AllocatorFacadePrivate {
  public:
   using AllocatorMap = std::map<platform::Place, std::shared_ptr<Allocator>>;
@@ -157,6 +165,8 @@ class AllocatorFacadePrivate {
   explicit AllocatorFacadePrivate(bool allow_free_idle_chunk = true) {
     strategy_ = GetAllocatorStrategy();
+    is_stream_safe_cuda_allocator_used_ = false;
+
     switch (strategy_) {
       case AllocatorStrategy::kNaiveBestFit: {
         InitNaiveBestFitCPUAllocator();
@@ -166,12 +176,6 @@ class AllocatorFacadePrivate {
         }
 #endif
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-        PADDLE_ENFORCE_EQ(
-            FLAGS_use_stream_safe_cuda_allocator, false,
-            paddle::platform::errors::Unimplemented(
-                "StreamSafeCUDAAllocator is only implemented for auto_growth "
-                "strategy, not support naive_best_fit strategy"));
         for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) {
           InitNaiveBestFitCUDAAllocator(platform::CUDAPlace(dev_id));
         }
@@ -216,22 +220,25 @@ class AllocatorFacadePrivate {
                                    allow_free_idle_chunk_);
     }
 
-    // Note(Ruibiao): For GPU multi-stream case, the 'allocators_' map(place
-    // -> Allocator) hold the StreamSafeCUDAAllocator releate to default
-    // stream (i.e., the stream directly got from DeviceContex), while the
-    // 'cuda_allocators_' map(place -> map(stream -> Allocator)) hold the
-    // StreamSafeCUDAAllocator releate to non-default stream (i.e., the
-    // stream users pass in). The default stream Allocator is built in the
-    // structure of AllocatorFacadePrivate, while the non-default stream is
-    // build in a delayed manner in GetAllocator function with
-    // 'create_if_not_found = ture'. We make special treatment for the
-    // default stream for performance reasons. Since most Alloc calls are
-    // for default stream in application, treating it separately can avoid
-    // lots of overhead of acquiring default stream and applying read-write
-    // lock.
+    // Note(Ruibiao): For the GPU multi-stream case without CUDA graph
+    // capturing, the 'allocators_' map (place -> Allocator) holds the
+    // StreamSafeCUDAAllocator related to the default stream (i.e., the
+    // stream obtained directly from the DeviceContext), while the
+    // 'cuda_allocators_' map (place -> map(stream -> Allocator)) holds the
+    // StreamSafeCUDAAllocators related to non-default streams (i.e., the
+    // streams users pass in). The default-stream Allocator is built into
+    // the structure of AllocatorFacadePrivate, while the non-default-stream
+    // ones are built in a delayed manner in the GetAllocator function with
+    // 'create_if_not_found = true'. We treat the default stream specially
+    // for performance reasons: since most Alloc calls in an application
+    // are for the default stream, handling it separately avoids the
+    // overhead of acquiring the default stream and taking the read-write
+    // lock.
     if (FLAGS_use_stream_safe_cuda_allocator) {
+      if (LIKELY(!IsCUDAGraphCapturing())) {
         WrapStreamSafeCUDAAllocatorForDefault();
       }
+      is_stream_safe_cuda_allocator_used_ = true;
+    }
 
     InitNaiveBestFitCUDAPinnedAllocator();
 #endif
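As a reading aid for the Note(Ruibiao) comment above, here is a condensed sketch of the two-level lookup it describes. The member names allocators_ and cuda_allocators_ mirror the real AllocatorFacadePrivate; the mutex name, the Allocator/Place/Stream stand-ins, and the CreateStreamSafeAllocator factory are assumptions for illustration only:

#include <map>
#include <memory>
#include <shared_mutex>

// Stand-ins for the real Paddle types.
struct Allocator {};
using Place = int;     // stands in for platform::CUDAPlace
using Stream = void*;  // stands in for gpuStream_t

class TwoLevelLookupSketch {
 public:
  std::shared_ptr<Allocator> Get(Place place, Stream stream) {
    if (stream == GetDefaultStream(place)) {
      return allocators_[place];  // fast path: no lock, no per-stream map
    }
    {
      std::shared_lock<std::shared_timed_mutex> lock(mutex_);
      auto place_it = cuda_allocators_.find(place);
      if (place_it != cuda_allocators_.end()) {
        auto stream_it = place_it->second.find(stream);
        if (stream_it != place_it->second.end()) return stream_it->second;
      }
    }
    // 'create_if_not_found': build the per-stream allocator lazily, under
    // an exclusive lock, the first time this stream is seen.
    std::unique_lock<std::shared_timed_mutex> lock(mutex_);
    auto& allocator = cuda_allocators_[place][stream];
    if (allocator == nullptr) {
      allocator = CreateStreamSafeAllocator(place, stream);
    }
    return allocator;
  }

 private:
  Stream GetDefaultStream(Place place) const { return nullptr; /* stub */ }
  std::shared_ptr<Allocator> CreateStreamSafeAllocator(Place, Stream) {
    return std::make_shared<Allocator>();
  }

  std::map<Place, std::shared_ptr<Allocator>> allocators_;  // default stream
  std::map<Place, std::map<Stream, std::shared_ptr<Allocator>>>
      cuda_allocators_;  // non-default streams
  std::shared_timed_mutex mutex_;
};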
@@ -283,12 +290,6 @@ class AllocatorFacadePrivate {
         }
 #endif
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-        PADDLE_ENFORCE_EQ(
-            FLAGS_use_stream_safe_cuda_allocator, false,
-            paddle::platform::errors::Unimplemented(
-                "StreamSafeCUDAAllocator is only implemented for auto_growth "
-                "strategy, not support thread_local strategy"));
         for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) {
           InitThreadLocalCUDAAllocator(platform::CUDAPlace(dev_id));
         }
@@ -317,8 +318,9 @@ class AllocatorFacadePrivate {
     CheckAllocThreadSafe();
 #ifdef PADDLE_WITH_CUDA
-    if (FLAGS_use_stream_safe_cuda_allocator == false &&
-        UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
+    // No need to wrap CUDAGraphAllocator for StreamSafeCUDAAllocator
+    if (!is_stream_safe_cuda_allocator_used_ &&
+        UNLIKELY(IsCUDAGraphCapturing())) {
       WrapCUDAGraphAllocator();
     }
 #endif
@@ -343,6 +345,11 @@ class AllocatorFacadePrivate {
     return static_cast<Allocation*>(allocation.get())->base_ptr();
   }
 
+  bool IsStreamSafeCUDAAllocatorUsed() {
+    return is_stream_safe_cuda_allocator_used_ &&
+           LIKELY(FLAGS_use_system_allocator == false);
+  }
+
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   bool HasCUDAAllocator(const platform::CUDAPlace& place,
                         const gpuStream_t& stream) {
@@ -358,10 +365,12 @@ class AllocatorFacadePrivate {
   const std::shared_ptr<Allocator>& GetAllocator(
       const platform::CUDAPlace& place, const gpuStream_t& stream,
       bool create_if_not_found = false) {
+    if (LIKELY(!IsCUDAGraphCapturing())) {
       if (stream == GetDefaultStream(place)) {
        VLOG(7) << "Get Allocator by passing in a default stream";
        return GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
       }
+    }
 
     /* shared_lock_guard */ {
       std::shared_lock<std::shared_timed_mutex> lock_guard(
@@ -411,43 +420,31 @@ class AllocatorFacadePrivate {
             << place;
   }
 
-  void SetDefaultStreamFromDeviceContext() {
-    VLOG(8) << "Set default stream from DeviceContex";
-    for (auto& pair : default_stream_safe_cuda_allocators_) {
-      platform::DeviceContextPool& pool =
-          platform::DeviceContextPool::Instance();
-      pair.second->SetDefaultStream(
-          static_cast<phi::GPUContext*>(pool.Get(pair.first))->stream());
-    }
-  }
-
   void RecordStream(std::shared_ptr<phi::Allocation> allocation,
                     const gpuStream_t& stream) {
-    if (allocation->size() == 0) {
-      return;
-    }
-    StreamSafeCUDAAllocation* stream_safe_cuda_allocation =
-        dynamic_cast<StreamSafeCUDAAllocation*>(allocation.get());
-    PADDLE_ENFORCE_NOT_NULL(stream_safe_cuda_allocation,
-                            platform::errors::InvalidArgument(
-                                "Failed to dynamic cast %p from Allocation* to "
-                                "StreamSafeCUDAAllocation*",
-                                allocation.get()));
+    std::shared_ptr<StreamSafeCUDAAllocation> stream_safe_cuda_allocation =
+        std::dynamic_pointer_cast<StreamSafeCUDAAllocation>(allocation);
+    if (stream_safe_cuda_allocation != nullptr) {
       stream_safe_cuda_allocation->RecordStream(stream);
+    } else {
+      VLOG(6) << "RecordStream for a non-StreamSafeCUDAAllocation";
+    }
   }
 
-  const gpuStream_t& GetStream(
+  const gpuStream_t GetStream(
       const std::shared_ptr<phi::Allocation>& allocation) const {
-    const StreamSafeCUDAAllocation* stream_safe_cuda_allocation =
-        dynamic_cast<const StreamSafeCUDAAllocation*>(allocation.get());
-    PADDLE_ENFORCE_NOT_NULL(stream_safe_cuda_allocation,
-                            platform::errors::InvalidArgument(
-                                "Failed to dynamic cast %p from Allocation* to "
-                                "StreamSafeCUDAAllocation*",
-                                allocation.get()));
+    const std::shared_ptr<StreamSafeCUDAAllocation>
+        stream_safe_cuda_allocation =
+            std::dynamic_pointer_cast<StreamSafeCUDAAllocation>(allocation);
+    if (stream_safe_cuda_allocation != nullptr) {
       return stream_safe_cuda_allocation->GetOwningStream();
     }
+
+    VLOG(6) << "GetStream for a non-StreamSafeCUDAAllocation";
+    return static_cast<phi::GPUContext*>(
+               platform::DeviceContextPool::Instance().Get(allocation->place()))
+        ->stream();
+  }
 #endif
 
  private:
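The RecordStream/GetStream bodies in the hunk above swap a mandatory PADDLE_ENFORCE_NOT_NULL downcast for a tolerant std::dynamic_pointer_cast, so a non-stream-safe allocation (e.g. one from the NaiveBestFit strategy) takes a harmless default path instead of aborting. A self-contained sketch of that pattern, using stand-in types (int in place of gpuStream_t, trivial classes for the allocation hierarchy):

#include <iostream>
#include <memory>

// Stand-ins for phi::Allocation and StreamSafeCUDAAllocation.
struct Allocation {
  virtual ~Allocation() = default;
};
struct StreamSafeCUDAAllocation : Allocation {
  void RecordStream(int stream) { owning_stream_ = stream; }
  int GetOwningStream() const { return owning_stream_; }

 private:
  int owning_stream_ = 0;
};

// Tolerant downcast: only stream-safe allocations record the stream;
// anything else succeeds as a no-op instead of raising an error.
void RecordStream(const std::shared_ptr<Allocation>& allocation, int stream) {
  auto stream_safe =
      std::dynamic_pointer_cast<StreamSafeCUDAAllocation>(allocation);
  if (stream_safe != nullptr) {
    stream_safe->RecordStream(stream);
  } else {
    std::clog << "RecordStream for a non-StreamSafeCUDAAllocation\n";
  }
}

int main() {
  RecordStream(std::make_shared<StreamSafeCUDAAllocation>(), 1);  // records
  RecordStream(std::make_shared<Allocation>(), 1);                // no-op
}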
@@ -880,7 +877,7 @@ class AllocatorFacadePrivate {
     CheckAllocThreadSafe(zero_size_allocators_);
     CheckAllocThreadSafe(system_allocators_);
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-    if (FLAGS_use_stream_safe_cuda_allocator) {
+    if (is_stream_safe_cuda_allocator_used_) {
       CheckCUDAAllocThreadSafe(cuda_allocators_);
     }
 #endif
@@ -910,6 +907,7 @@ class AllocatorFacadePrivate {
   static AllocatorMap zero_size_allocators_;
   static AllocatorMap system_allocators_;
   bool allow_free_idle_chunk_;
+  bool is_stream_safe_cuda_allocator_used_;
 };
 
 AllocatorFacadePrivate::AllocatorMap
     AllocatorFacadePrivate::zero_size_allocators_;
@@ -928,7 +926,7 @@ AllocatorFacade& AllocatorFacade::Instance() {
 AllocatorFacadePrivate* AllocatorFacade::GetPrivate() const {
 #ifdef PADDLE_WITH_CUDA
-  if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
+  if (UNLIKELY(IsCUDAGraphCapturing())) {
     auto id = platform::CUDAGraph::CapturingID();
     auto iter = cuda_graph_map_.find(id);
     PADDLE_ENFORCE_NE(
@@ -986,35 +984,25 @@ uint64_t AllocatorFacade::Release(const platform::Place& place) {
 std::shared_ptr<phi::Allocation> AllocatorFacade::AllocShared(
     const platform::Place& place, size_t size, const phi::Stream& stream) {
-  PADDLE_ENFORCE_EQ(
-      FLAGS_use_stream_safe_cuda_allocator, true,
-      platform::errors::Unimplemented(
-          "StreamSafeCUDAAllocator is disabled, you should not call this "
-          "multi-stream 'AllocaShared' function. To enable it, you can enter"
-          "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
-          "terminal."));
   return std::shared_ptr<phi::Allocation>(Alloc(place, size, stream));
 }
 
 AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size,
                                      const phi::Stream& stream) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  PADDLE_ENFORCE_EQ(
-      FLAGS_use_stream_safe_cuda_allocator, true,
-      platform::errors::Unimplemented(
-          "StreamSafeCUDAAllocator is disabled, you should not call this "
-          "multi-stream 'Alloc' function. To enable it, you can enter"
-          "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
-          "terminal."));
+  AllocatorFacadePrivate* m = GetPrivate();
+  if (!m->IsStreamSafeCUDAAllocatorUsed()) {
+    VLOG(6) << "Warning: StreamSafeCUDAAllocator is not used!";
+    return Alloc(place, size);
+  }
+
   platform::CUDAPlace p(place.GetDeviceId());
   if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) {
     gpuStream_t s = reinterpret_cast<gpuStream_t>(stream.id());
-    return GetPrivate()
-        ->GetAllocator(p, s, /* create_if_not_found = */ true)
+    return m->GetAllocator(p, s, /* create_if_not_found = */ true)
         ->Allocate(size);
   } else {
-    return GetPrivate()->GetAllocator(p, size)->Allocate(size);
+    return m->GetAllocator(p, size)->Allocate(size);
   }
 #else
   PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU."));
@@ -1025,13 +1013,6 @@ bool AllocatorFacade::InSameStream(
     const std::shared_ptr<phi::Allocation>& allocation,
     const phi::Stream& stream) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  PADDLE_ENFORCE_EQ(
-      FLAGS_use_stream_safe_cuda_allocator, true,
-      platform::errors::Unimplemented(
-          "StreamSafeCUDAAllocator is disabled, you should not call this "
-          "multi-stream 'InSameStream' function. To enable it, you can enter"
-          "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
-          "terminal."));
   gpuStream_t s = reinterpret_cast<gpuStream_t>(stream.id());
   return s == GetStream(allocation);
 #else
@@ -1039,58 +1020,52 @@ bool AllocatorFacade::InSameStream(
 #endif
 }
+bool AllocatorFacade::IsStreamSafeCUDAAllocatorUsed() {
+  return GetPrivate()->IsStreamSafeCUDAAllocatorUsed();
+}
+
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place,
                                   const gpuStream_t& stream) {
-  PADDLE_ENFORCE_EQ(
-      FLAGS_use_stream_safe_cuda_allocator, true,
-      platform::errors::Unimplemented(
-          "StreamSafeCUDAAllocator is disabled, you should not call this "
-          "multi-stream 'Release' function. To enable it, you can enter"
-          "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
-          "terminal."));
-  return GetPrivate()->GetAllocator(place, stream)->Release(place);
+  AllocatorFacadePrivate* m = GetPrivate();
+  if (!m->IsStreamSafeCUDAAllocatorUsed()) {
+    VLOG(6) << "Warning: StreamSafeCUDAAllocator is not used!";
+    return Release(place);
+  }
+
+  return m->GetAllocator(place, stream)->Release(place);
 }
 
 void AllocatorFacade::RecordStream(std::shared_ptr<phi::Allocation> allocation,
                                    const gpuStream_t& stream) {
-  PADDLE_ENFORCE_EQ(
-      FLAGS_use_stream_safe_cuda_allocator, true,
-      platform::errors::Unimplemented(
-          "StreamSafeCUDAAllocator is disabled, you should not call this "
-          "'RecordStream' function. To enable it, you can enter"
-          "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
-          "terminal."));
   GetPrivate()->RecordStream(allocation, stream);
 }
 
 const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
     const platform::Place& place, const gpuStream_t& stream) {
-  if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
-      FLAGS_use_system_allocator == false) {
-    return GetPrivate()->GetAllocator(place, stream,
+  AllocatorFacadePrivate* m = GetPrivate();
+
+  if (!m->IsStreamSafeCUDAAllocatorUsed()) {
+    VLOG(6) << "Warning: StreamSafeCUDAAllocator is not used!";
+    return GetAllocator(place);
+  }
+
+  if (platform::is_gpu_place(place) && FLAGS_use_system_allocator == false) {
+    return m->GetAllocator(place, stream,
                            /*create_if_not_found=*/true);
   }
-  return GetPrivate()->GetAllocator(
-      place, /* A non-zero num to choose allocator_ */ 1);
+  return m->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
 }
 
-const gpuStream_t& AllocatorFacade::GetStream(
+const gpuStream_t AllocatorFacade::GetStream(
    const std::shared_ptr<phi::Allocation>& allocation) const {
-  PADDLE_ENFORCE_EQ(
-      FLAGS_use_stream_safe_cuda_allocator, true,
-      platform::errors::Unimplemented(
-          "StreamSafeCUDAAllocator is disabled, you should not call this "
-          "'GetStream' function. To enable it, you can enter"
-          "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
-          "terminal."));
   return GetPrivate()->GetStream(allocation);
 }
 
 void AllocatorFacade::SetDefaultStream(const platform::CUDAPlace& place,
                                        const gpuStream_t& stream) {
-  if (FLAGS_use_stream_safe_cuda_allocator) {
-    GetPrivate()->SetDefaultStream(place, stream);
+  if (m_->IsStreamSafeCUDAAllocatorUsed()) {
+    m_->SetDefaultStream(place, stream);
   }
 }
@@ -1109,7 +1084,6 @@ void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) {
           "The memory pool of the CUDA Graph with ID %d have been prepared.",
           id));
   allocator.reset(new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false));
-  allocator->SetDefaultStreamFromDeviceContext();
   VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id;
 }
......
@@ -76,6 +76,8 @@ class AllocatorFacade {
   bool InSameStream(const std::shared_ptr<Allocation>& allocation,
                     const phi::Stream& stream);
 
+  bool IsStreamSafeCUDAAllocatorUsed();
+
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   // TODO(zhiqiu): change gpuStream_t to phi::Stream if needed.
   uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream);
@@ -83,7 +85,7 @@ class AllocatorFacade {
                     const gpuStream_t& stream);
   const std::shared_ptr<Allocator>& GetAllocator(const platform::Place& place,
                                                  const gpuStream_t& stream);
-  const gpuStream_t& GetStream(
+  const gpuStream_t GetStream(
       const std::shared_ptr<Allocation>& allocation) const;
   void SetDefaultStream(const platform::CUDAPlace& place,
                         const gpuStream_t& stream);
......
@@ -67,7 +67,7 @@ void RecordStream(std::shared_ptr<Allocation> allocation,
                                                               stream);
 }
 
-const gpuStream_t& GetStream(const std::shared_ptr<Allocation>& allocation) {
+const gpuStream_t GetStream(const std::shared_ptr<Allocation>& allocation) {
   return allocation::AllocatorFacade::Instance().GetStream(allocation);
 }
......
@@ -56,7 +56,7 @@ extern uint64_t Release(const platform::CUDAPlace& place,
 void RecordStream(std::shared_ptr<Allocation> allocation,
                   const gpuStream_t& stream);
 
-const gpuStream_t& GetStream(const std::shared_ptr<Allocation>& allocation);
+const gpuStream_t GetStream(const std::shared_ptr<Allocation>& allocation);
 #endif
 
 }  // namespace memory
 }  // namespace paddle
@@ -12,6 +12,15 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <thread>  // NOLINT
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
+#include "paddle/fluid/memory/malloc.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/phi/core/stream.h"
+
 #ifdef PADDLE_WITH_CUDA
 #include <cuda.h>
 #include <cuda_runtime.h>
@@ -21,14 +30,6 @@
 #include <hip/hip_runtime.h>
 #endif
 
-#include <thread>  // NOLINT
-#include <vector>
-
-#include "gtest/gtest.h"
-#include "paddle/fluid/memory/allocation/allocator_facade.h"
-#include "paddle/fluid/memory/malloc.h"
-#include "paddle/fluid/platform/device_context.h"
-
 namespace paddle {
 namespace memory {
@@ -196,5 +197,12 @@ TEST(Malloc, AllocZero) {
   AllocationPtr allocation_ptr = Alloc(place, 0);
   EXPECT_GE(allocation_ptr->size(), 0);
 }
+
+TEST(Malloc, AllocWithStream) {
+  size_t size = 1024;
+  AllocationPtr allocation = Alloc(platform::CUDAPlace(), size, phi::Stream(0));
+  EXPECT_EQ(allocation->size(), 1024);
+}
+
 }  // namespace memory
 }  // namespace paddle
@@ -101,6 +101,19 @@ TEST(StreamSafeCUDAAllocInterfaceTest, GetAllocatorInterfaceTest) {
   CheckMemLeak(place);
 }
 
+TEST(StreamSafeCUDAAllocInterfaceTest, GetAllocatorWithDefaultStreamTest) {
+  auto &instance = allocation::AllocatorFacade::Instance();
+  platform::CUDAPlace place = platform::CUDAPlace();
+
+  const std::shared_ptr<Allocator> allocator_implicit_stream =
+      instance.GetAllocator(place);
+  const std::shared_ptr<Allocator> allocator_default_stream =
+      instance.GetAllocator(
+          place, static_cast<phi::GPUContext *>(
+                     platform::DeviceContextPool::Instance().Get(place))
+                     ->stream());
+  EXPECT_EQ(allocator_implicit_stream.get(), allocator_default_stream.get());
+}
+
 TEST(StreamSafeCUDAAllocInterfaceTest, ZeroSizeRecordStreamTest) {
   platform::CUDAPlace place = platform::CUDAPlace();
   std::shared_ptr<Allocation> zero_size_allocation = AllocShared(place, 0);
......