Unverified commit 0ad2e192 authored by From00, committed by GitHub

Make StreamSafeCUDAAllocator compatible with NaiveBestFit strategy (#40886)

* Make StreamSafeCUDAAllocator compatible with NaiveBestFit strategy

* Set FLAGS_use_stream_safe_cuda_allocator to false

* Update

* Remove unnecessary code

* Fix CI errors

* Add UT
Parent f6b6b057
@@ -32,7 +32,6 @@ PADDLE_DEFINE_EXPORTED_bool(new_executor_use_local_scope, true,
DECLARE_bool(check_nan_inf);
DECLARE_bool(benchmark);
DECLARE_bool(fast_eager_deletion_mode);
DECLARE_bool(use_stream_safe_cuda_allocator);
constexpr const char* kExceptionCaught = "ExceptionCaught";
constexpr const char* kTaskCompletion = "TaskCompletion";
@@ -44,7 +43,9 @@ static constexpr size_t kHostNumThreads = 4;
static constexpr size_t kDeviceNumThreads = 1;
bool IsInterpretercoreFastGCEnabled() {
return FLAGS_fast_eager_deletion_mode && FLAGS_use_stream_safe_cuda_allocator;
return memory::allocation::AllocatorFacade::Instance()
.IsStreamSafeCUDAAllocatorUsed() &&
FLAGS_fast_eager_deletion_mode;
}
InterpreterCore::InterpreterCore(const platform::Place& place,
......
@@ -145,6 +145,14 @@ class CUDAGraphAllocator
};
#endif
static bool IsCUDAGraphCapturing() {
#ifdef PADDLE_WITH_CUDA
return UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing());
#else
return false;
#endif
}
class AllocatorFacadePrivate {
public:
using AllocatorMap = std::map<platform::Place, std::shared_ptr<Allocator>>;
@@ -157,6 +165,8 @@ class AllocatorFacadePrivate {
explicit AllocatorFacadePrivate(bool allow_free_idle_chunk = true) {
strategy_ = GetAllocatorStrategy();
is_stream_safe_cuda_allocator_used_ = false;
switch (strategy_) {
case AllocatorStrategy::kNaiveBestFit: {
InitNaiveBestFitCPUAllocator();
@@ -166,12 +176,6 @@
}
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PADDLE_ENFORCE_EQ(
FLAGS_use_stream_safe_cuda_allocator, false,
paddle::platform::errors::Unimplemented(
"StreamSafeCUDAAllocator is only implemented for auto_growth "
"strategy, not support naive_best_fit strategy"));
for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) {
InitNaiveBestFitCUDAAllocator(platform::CUDAPlace(dev_id));
}
@@ -216,22 +220,25 @@ class AllocatorFacadePrivate {
allow_free_idle_chunk_);
}
// Note(Ruibiao): For GPU multi-stream case, the 'allocators_' map(place
// -> Allocator) holds the StreamSafeCUDAAllocator related to the default
// stream (i.e., the stream directly got from DeviceContext), while the
// 'cuda_allocators_' map(place -> map(stream -> Allocator)) holds the
// StreamSafeCUDAAllocator related to non-default streams (i.e., the
// streams users pass in). The default stream Allocator is built in the
// structure of AllocatorFacadePrivate, while the non-default stream is
// built in a delayed manner in the GetAllocator function with
// 'create_if_not_found = true'. We make special treatment for the
// default stream for performance reasons. Since most Alloc calls are
// for the default stream in applications, treating it separately can
// avoid lots of overhead of acquiring the default stream and applying
// the read-write lock.
// Note(Ruibiao): For GPU multi-stream case without CUDA graph
// capturing, the 'allocators_' map(place -> Allocator) holds the
// StreamSafeCUDAAllocator related to the default stream (i.e., the
// stream directly got from DeviceContext), while the 'cuda_allocators_'
// map (place -> map(stream -> Allocator)) holds the
// StreamSafeCUDAAllocator related to non-default streams (i.e., the
// streams users pass in). The default stream Allocator is built in the
// structure of AllocatorFacadePrivate, while the non-default stream is
// built in a delayed manner in the GetAllocator function with
// 'create_if_not_found = true'. We make special treatment for the
// default stream for performance reasons. Since most Alloc calls are
// for the default stream in applications, treating it separately can
// avoid lots of overhead of acquiring the default stream and applying
// the read-write lock.
if (FLAGS_use_stream_safe_cuda_allocator) {
if (LIKELY(!IsCUDAGraphCapturing())) {
WrapStreamSafeCUDAAllocatorForDefault();
}
is_stream_safe_cuda_allocator_used_ = true;
}
InitNaiveBestFitCUDAPinnedAllocator();
#endif
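To make the note above concrete, here is a minimal sketch of the dispatch it describes. This is not the literal implementation; 'LookupForSketch' is a hypothetical helper that only illustrates the two paths:
// Sketch only: default-stream requests take the lock-free fast path through
// 'allocators_'; any other stream goes through 'cuda_allocators_', which is
// populated lazily under a read-write lock.
const std::shared_ptr<Allocator>& LookupForSketch(
    const platform::CUDAPlace& place, const gpuStream_t& stream) {
  if (stream == GetDefaultStream(place)) {
    // Default-stream allocator, built eagerly in the
    // AllocatorFacadePrivate constructor.
    return GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
  }
  // Non-default stream: found or created on first use.
  return GetAllocator(place, stream, /*create_if_not_found=*/true);
}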
@@ -283,12 +290,6 @@ class AllocatorFacadePrivate {
}
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PADDLE_ENFORCE_EQ(
FLAGS_use_stream_safe_cuda_allocator, false,
paddle::platform::errors::Unimplemented(
"StreamSafeCUDAAllocator is only implemented for auto_growth "
"strategy, not support thread_local strategy"));
for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) {
InitThreadLocalCUDAAllocator(platform::CUDAPlace(dev_id));
}
@@ -317,8 +318,9 @@ class AllocatorFacadePrivate {
CheckAllocThreadSafe();
#ifdef PADDLE_WITH_CUDA
if (FLAGS_use_stream_safe_cuda_allocator == false &&
UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
// No need to wrap CUDAGraphAllocator for StreamSafeCUDAAllocator
if (!is_stream_safe_cuda_allocator_used_ &&
UNLIKELY(IsCUDAGraphCapturing())) {
WrapCUDAGraphAllocator();
}
#endif
@@ -343,6 +345,11 @@ class AllocatorFacadePrivate {
return static_cast<Allocation*>(allocation.get())->base_ptr();
}
bool IsStreamSafeCUDAAllocatorUsed() {
return is_stream_safe_cuda_allocator_used_ &&
LIKELY(FLAGS_use_system_allocator == false);
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
bool HasCUDAAllocator(const platform::CUDAPlace& place,
const gpuStream_t& stream) {
@@ -358,10 +365,12 @@ class AllocatorFacadePrivate {
const std::shared_ptr<Allocator>& GetAllocator(
const platform::CUDAPlace& place, const gpuStream_t& stream,
bool create_if_not_found = false) {
if (LIKELY(!IsCUDAGraphCapturing())) {
if (stream == GetDefaultStream(place)) {
VLOG(7) << "Get Allocator by passing in a default stream";
return GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
}
}
/* shared_lock_guard */ {
std::shared_lock<std::shared_timed_mutex> lock_guard(
@@ -411,43 +420,31 @@ class AllocatorFacadePrivate {
<< place;
}
void SetDefaultStreamFromDeviceContext() {
VLOG(8) << "Set default stream from DeviceContex";
for (auto& pair : default_stream_safe_cuda_allocators_) {
platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance();
pair.second->SetDefaultStream(
static_cast<phi::GPUContext*>(pool.Get(pair.first))->stream());
}
}
void RecordStream(std::shared_ptr<phi::Allocation> allocation,
const gpuStream_t& stream) {
if (allocation->size() == 0) {
return;
}
StreamSafeCUDAAllocation* stream_safe_cuda_allocation =
dynamic_cast<StreamSafeCUDAAllocation*>(allocation.get());
PADDLE_ENFORCE_NOT_NULL(stream_safe_cuda_allocation,
platform::errors::InvalidArgument(
"Failed to dynamic cast %p from Allocation* to "
"StreamSafeCUDAAllocation*",
allocation.get()));
std::shared_ptr<StreamSafeCUDAAllocation> stream_safe_cuda_allocation =
std::dynamic_pointer_cast<StreamSafeCUDAAllocation>(allocation);
if (stream_safe_cuda_allocation != nullptr) {
stream_safe_cuda_allocation->RecordStream(stream);
} else {
VLOG(6) << "RecordStream for a non-StreamSafeCUDAAllocation";
}
}
const gpuStream_t& GetStream(
const gpuStream_t GetStream(
const std::shared_ptr<phi::Allocation>& allocation) const {
const StreamSafeCUDAAllocation* stream_safe_cuda_allocation =
dynamic_cast<const StreamSafeCUDAAllocation*>(allocation.get());
PADDLE_ENFORCE_NOT_NULL(stream_safe_cuda_allocation,
platform::errors::InvalidArgument(
"Failed to dynamic cast %p from Allocation* to "
"StreamSafeCUDAAllocation*",
allocation.get()));
const std::shared_ptr<StreamSafeCUDAAllocation>
stream_safe_cuda_allocation =
std::dynamic_pointer_cast<StreamSafeCUDAAllocation>(allocation);
if (stream_safe_cuda_allocation != nullptr) {
return stream_safe_cuda_allocation->GetOwningStream();
}
VLOG(6) << "GetStream for a non-StreamSafeCUDAAllocation";
return static_cast<phi::GPUContext*>(
platform::DeviceContextPool::Instance().Get(allocation->place()))
->stream();
}
#endif
private:
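Taken together, the relaxed RecordStream and GetStream let callers mix stream-safe and plain allocations. Below is a minimal cross-stream usage sketch, assuming the facade-level free functions declared later in this diff; the stream setup and the kernel launch are placeholders:
gpuStream_t producer, consumer;
cudaStreamCreate(&producer);
cudaStreamCreate(&consumer);

// Allocate against the producer stream.
std::shared_ptr<phi::Allocation> buf =
    AllocShared(platform::CUDAPlace(), /*size=*/1024,
                phi::Stream(reinterpret_cast<phi::StreamId>(producer)));

// ... enqueue a kernel that reads `buf` on `consumer` ...

// Record the extra stream: with StreamSafeCUDAAllocator active, the real
// free is deferred until all recorded streams pass this point; for a
// non-stream-safe allocation this now just logs at VLOG(6) instead of
// throwing.
RecordStream(buf, consumer);
buf.reset();  // Safe either way.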
@@ -880,7 +877,7 @@ class AllocatorFacadePrivate {
CheckAllocThreadSafe(zero_size_allocators_);
CheckAllocThreadSafe(system_allocators_);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (FLAGS_use_stream_safe_cuda_allocator) {
if (is_stream_safe_cuda_allocator_used_) {
CheckCUDAAllocThreadSafe(cuda_allocators_);
}
#endif
@@ -910,6 +907,7 @@ class AllocatorFacadePrivate {
static AllocatorMap zero_size_allocators_;
static AllocatorMap system_allocators_;
bool allow_free_idle_chunk_;
bool is_stream_safe_cuda_allocator_used_;
};
AllocatorFacadePrivate::AllocatorMap
AllocatorFacadePrivate::zero_size_allocators_;
@@ -928,7 +926,7 @@ AllocatorFacade& AllocatorFacade::Instance() {
AllocatorFacadePrivate* AllocatorFacade::GetPrivate() const {
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
if (UNLIKELY(IsCUDAGraphCapturing())) {
auto id = platform::CUDAGraph::CapturingID();
auto iter = cuda_graph_map_.find(id);
PADDLE_ENFORCE_NE(
@@ -986,35 +984,25 @@ uint64_t AllocatorFacade::Release(const platform::Place& place) {
std::shared_ptr<phi::Allocation> AllocatorFacade::AllocShared(
const platform::Place& place, size_t size, const phi::Stream& stream) {
PADDLE_ENFORCE_EQ(
FLAGS_use_stream_safe_cuda_allocator, true,
platform::errors::Unimplemented(
"StreamSafeCUDAAllocator is disabled, you should not call this "
"multi-stream 'AllocaShared' function. To enable it, you can enter"
"'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
"terminal."));
return std::shared_ptr<phi::Allocation>(Alloc(place, size, stream));
}
AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size,
const phi::Stream& stream) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PADDLE_ENFORCE_EQ(
FLAGS_use_stream_safe_cuda_allocator, true,
platform::errors::Unimplemented(
"StreamSafeCUDAAllocator is disabled, you should not call this "
"multi-stream 'Alloc' function. To enable it, you can enter"
"'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
"terminal."));
AllocatorFacadePrivate* m = GetPrivate();
if (!m->IsStreamSafeCUDAAllocatorUsed()) {
VLOG(6) << "Warning: StreamSafeCUDAAllocator is not used!";
return Alloc(place, size);
}
platform::CUDAPlace p(place.GetDeviceId());
if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) {
gpuStream_t s = reinterpret_cast<gpuStream_t>(stream.id());
return GetPrivate()
->GetAllocator(p, s, /* create_if_not_found = */ true)
return m->GetAllocator(p, s, /* create_if_not_found = */ true)
->Allocate(size);
} else {
return GetPrivate()->GetAllocator(p, size)->Allocate(size);
return m->GetAllocator(p, size)->Allocate(size);
}
#else
PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU."));
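All the stream-aware entry points follow the same new pattern: probe the facade instead of enforcing the flag, and fall back to the single-stream path when the stream-safe allocator is not in use. A hedged illustration of what a caller now observes ('my_stream' is a placeholder handle):
// Valid under either allocator strategy after this PR. With auto_growth the
// allocation is bound to `my_stream`; with naive_best_fit the stream
// argument is ignored and this degrades to Alloc(place, size), where it
// previously threw Unimplemented.
phi::Stream s(reinterpret_cast<phi::StreamId>(my_stream));
AllocationPtr ptr = Alloc(platform::CUDAPlace(0), 4096, s);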
@@ -1025,13 +1013,6 @@ bool AllocatorFacade::InSameStream(
const std::shared_ptr<phi::Allocation>& allocation,
const phi::Stream& stream) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PADDLE_ENFORCE_EQ(
FLAGS_use_stream_safe_cuda_allocator, true,
platform::errors::Unimplemented(
"StreamSafeCUDAAllocator is disabled, you should not call this "
"multi-stream 'InSameStream' function. To enable it, you can enter"
"'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
"terminal."));
gpuStream_t s = reinterpret_cast<gpuStream_t>(stream.id());
return s == GetStream(allocation);
#else
@@ -1039,58 +1020,52 @@ bool AllocatorFacade::InSameStream(
#endif
}
bool AllocatorFacade::IsStreamSafeCUDAAllocatorUsed() {
return GetPrivate()->IsStreamSafeCUDAAllocatorUsed();
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place,
const gpuStream_t& stream) {
PADDLE_ENFORCE_EQ(
FLAGS_use_stream_safe_cuda_allocator, true,
platform::errors::Unimplemented(
"StreamSafeCUDAAllocator is disabled, you should not call this "
"multi-stream 'Release' function. To enable it, you can enter"
"'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
"terminal."));
return GetPrivate()->GetAllocator(place, stream)->Release(place);
AllocatorFacadePrivate* m = GetPrivate();
if (!m->IsStreamSafeCUDAAllocatorUsed()) {
VLOG(6) << "Warning: StreamSafeCUDAAllocator is not used!";
return Release(place);
}
return m->GetAllocator(place, stream)->Release(place);
}
void AllocatorFacade::RecordStream(std::shared_ptr<phi::Allocation> allocation,
const gpuStream_t& stream) {
PADDLE_ENFORCE_EQ(
FLAGS_use_stream_safe_cuda_allocator, true,
platform::errors::Unimplemented(
"StreamSafeCUDAAllocator is disabled, you should not call this "
"'RecordStream' function. To enable it, you can enter"
"'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
"terminal."));
GetPrivate()->RecordStream(allocation, stream);
}
const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
const platform::Place& place, const gpuStream_t& stream) {
if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
FLAGS_use_system_allocator == false) {
return GetPrivate()->GetAllocator(place, stream,
AllocatorFacadePrivate* m = GetPrivate();
if (!m->IsStreamSafeCUDAAllocatorUsed()) {
VLOG(6) << "Warning: StreamSafeCUDAAllocator is not used!";
return GetAllocator(place);
}
if (platform::is_gpu_place(place) && FLAGS_use_system_allocator == false) {
return m->GetAllocator(place, stream,
/*create_if_not_found=*/true);
}
return GetPrivate()->GetAllocator(
place, /* A non-zero num to choose allocator_ */ 1);
return m->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
}
const gpuStream_t& AllocatorFacade::GetStream(
const gpuStream_t AllocatorFacade::GetStream(
const std::shared_ptr<phi::Allocation>& allocation) const {
PADDLE_ENFORCE_EQ(
FLAGS_use_stream_safe_cuda_allocator, true,
platform::errors::Unimplemented(
"StreamSafeCUDAAllocator is disabled, you should not call this "
"'GetStream' function. To enable it, you can enter"
"'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
"terminal."));
return GetPrivate()->GetStream(allocation);
}
void AllocatorFacade::SetDefaultStream(const platform::CUDAPlace& place,
const gpuStream_t& stream) {
if (FLAGS_use_stream_safe_cuda_allocator) {
GetPrivate()->SetDefaultStream(place, stream);
if (m_->IsStreamSafeCUDAAllocatorUsed()) {
m_->SetDefaultStream(place, stream);
}
}
@@ -1109,7 +1084,6 @@ void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) {
"The memory pool of the CUDA Graph with ID %d have been prepared.",
id));
allocator.reset(new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false));
allocator->SetDefaultStreamFromDeviceContext();
VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id;
}
......
@@ -76,6 +76,8 @@ class AllocatorFacade {
bool InSameStream(const std::shared_ptr<Allocation>& allocation,
const phi::Stream& stream);
bool IsStreamSafeCUDAAllocatorUsed();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// TODO(zhiqiu): change gpuStream_t to phi::Stream if needed.
uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream);
@@ -83,7 +85,7 @@ class AllocatorFacade {
const gpuStream_t& stream);
const std::shared_ptr<Allocator>& GetAllocator(const platform::Place& place,
const gpuStream_t& stream);
const gpuStream_t& GetStream(
const gpuStream_t GetStream(
const std::shared_ptr<Allocation>& allocation) const;
void SetDefaultStream(const platform::CUDAPlace& place,
const gpuStream_t& stream);
......
@@ -67,7 +67,7 @@ void RecordStream(std::shared_ptr<Allocation> allocation,
stream);
}
const gpuStream_t& GetStream(const std::shared_ptr<Allocation>& allocation) {
const gpuStream_t GetStream(const std::shared_ptr<Allocation>& allocation) {
return allocation::AllocatorFacade::Instance().GetStream(allocation);
}
......
@@ -56,7 +56,7 @@ extern uint64_t Release(const platform::CUDAPlace& place,
void RecordStream(std::shared_ptr<Allocation> allocation,
const gpuStream_t& stream);
const gpuStream_t& GetStream(const std::shared_ptr<Allocation>& allocation);
const gpuStream_t GetStream(const std::shared_ptr<Allocation>& allocation);
#endif
} // namespace memory
} // namespace paddle
@@ -12,6 +12,15 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/core/stream.h"
#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
#include <cuda_runtime.h>
@@ -21,14 +30,6 @@
#include <hip/hip_runtime.h>
#endif
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace memory {
@@ -196,5 +197,12 @@ TEST(Malloc, AllocZero) {
AllocationPtr allocation_ptr = Alloc(place, 0);
EXPECT_GE(allocation_ptr->size(), 0);
}
TEST(Malloc, AllocWithStream) {
size_t size = 1024;
AllocationPtr allocation = Alloc(platform::CUDAPlace(), size, phi::Stream(0));
EXPECT_EQ(allocation->size(), 1024);
}
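A hedged companion check one could add alongside it: AllocShared with an explicit stream goes through the same fallback. The test name and body below are hypothetical, mirroring the style of the test above:
TEST(Malloc, AllocSharedWithStream) {
  // Same 1024-byte request as above, but through the shared-ownership path.
  std::shared_ptr<Allocation> allocation =
      AllocShared(platform::CUDAPlace(), 1024, phi::Stream(0));
  EXPECT_EQ(allocation->size(), 1024);
}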
} // namespace memory
} // namespace paddle
@@ -101,6 +101,19 @@ TEST(StreamSafeCUDAAllocInterfaceTest, GetAllocatorInterfaceTest) {
CheckMemLeak(place);
}
TEST(StreamSafeCUDAAllocInterfaceTest, GetAllocatorWithDefaultStreamTest) {
auto &instance = allocation::AllocatorFacade::Instance();
platform::CUDAPlace place = platform::CUDAPlace();
const std::shared_ptr<Allocator> allocator_implicit_stream =
instance.GetAllocator(place);
const std::shared_ptr<Allocator> allocator_default_stream =
instance.GetAllocator(
place, static_cast<phi::GPUContext *>(
platform::DeviceContextPool::Instance().Get(place))
->stream());
EXPECT_EQ(allocator_implicit_stream.get(), allocator_default_stream.get());
}
TEST(StreamSafeCUDAAllocInterfaceTest, ZeroSizeRecordStreamTest) {
platform::CUDAPlace place = platform::CUDAPlace();
std::shared_ptr<Allocation> zero_size_allocation = AllocShared(place, 0);
......