PaddlePaddle / Paddle
Commit c9beb885
Authored Mar 03, 2022 by phlrain

    Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into move_yolo_box_to_phi

Parents: 25fe41d0, 4c0511fa

Showing 8 changed files with 436 additions and 426 deletions (+436 -426)
Changed files:

    paddle/fluid/memory/allocation/CMakeLists.txt                  +1    -1
    paddle/fluid/memory/allocation/allocator_facade.cc             +101  -175
    paddle/fluid/memory/allocation/allocator_facade.h              +9    -2
    paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc   +81   -39
    paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h    +20   -8
    paddle/fluid/memory/malloc.cc                                  +5    -5
    paddle/fluid/memory/malloc.h                                   +3    -3
    paddle/fluid/memory/stream_safe_cuda_alloc_test.cu             +216  -193
paddle/fluid/memory/allocation/CMakeLists.txt

@@ -17,7 +17,7 @@ if (WITH_GPU)
   nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
   nv_library(cuda_managed_allocator SRCS cuda_managed_allocator.cc DEPS allocator cuda_device_guard gpu_info)
   nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
-  nv_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator)
+  nv_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator cuda_graph)
   nv_library(thread_local_allocator SRCS thread_local_allocator.cc DEPS allocator)
   cc_test(thread_local_allocator_test SRCS thread_local_allocator_test.cc DEPS thread_local_allocator)
paddle/fluid/memory/allocation/allocator_facade.cc

@@ -210,12 +210,7 @@ class AllocatorFacadePrivate {
     InitNaiveBestFitCPUAllocator();
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     allow_free_idle_chunk_ = allow_free_idle_chunk;
-    if (FLAGS_use_stream_safe_cuda_allocator) {
-      for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) {
-        InitStreamSafeCUDAAllocator(platform::CUDAPlace(dev_id), nullptr);
-      }
-    } else {
+    if (!FLAGS_use_stream_safe_cuda_allocator) {
       for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) {
         InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id),

@@ -298,6 +293,12 @@ class AllocatorFacadePrivate {
     }
     CheckAllocThreadSafe();
+
+#ifdef PADDLE_WITH_CUDA
+    if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
+      WrapCUDAGraphAllocator();
+    }
+#endif
   }

   inline const std::shared_ptr<Allocator>& GetAllocator(

@@ -388,39 +389,6 @@ class AllocatorFacadePrivate {
                           allocation.get()));
     return stream_safe_cuda_allocation->GetOwningStream();
   }

-#ifdef PADDLE_WITH_CUDA
-  void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) {
-    PADDLE_ENFORCE_EQ(strategy_, AllocatorStrategy::kAutoGrowth,
-                      platform::errors::InvalidArgument(
-                          "CUDA Graph is only supported when the "
-                          "FLAGS_allocator_strategy=\"auto_growth\", but got "
-                          "FLAGS_allocator_strategy=\"%s\"",
-                          FLAGS_allocator_strategy));
-    auto& allocator = cuda_graph_allocator_map_[id];
-    PADDLE_ENFORCE_EQ(
-        allocator.get(), nullptr,
-        platform::errors::InvalidArgument(
-            "The memory pool of the CUDA Graph with ID %d have been prepared.",
-            id));
-    allocator.reset(
-        new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false));
-    for (auto& item : allocator->allocators_) {
-      auto& old_allocator = item.second;
-      old_allocator = CUDAGraphAllocator::Create(old_allocator);
-    }
-    VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id;
-  }
-
-  void RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) {
-    auto iter = cuda_graph_allocator_map_.find(id);
-    PADDLE_ENFORCE_NE(iter, cuda_graph_allocator_map_.end(),
-                      platform::errors::InvalidArgument(
-                          "Cannot find CUDA Graph with ID = %d", id));
-    cuda_graph_allocator_map_.erase(iter);
-    VLOG(10) << "Remove memory pool of CUDA Graph with ID " << id;
-  }
-#endif
 #endif
@@ -439,24 +407,7 @@ class AllocatorFacadePrivate {
     platform::Place place_;
   };

-  const AllocatorMap& GetAllocatorMap() {
-#ifdef PADDLE_WITH_CUDA
-    if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
-      auto id = platform::CUDAGraph::CapturingID();
-      auto iter = cuda_graph_allocator_map_.find(id);
-      PADDLE_ENFORCE_NE(
-          iter, cuda_graph_allocator_map_.end(),
-          platform::errors::PermissionDenied(
-              "No memory pool is prepared for CUDA Graph capturing."));
-      VLOG(10) << "Choose CUDA Graph memory pool to allocate memory";
-      return iter->second->allocators_;
-    } else {
-      return allocators_;
-    }
-#else
-    return allocators_;
-#endif
-  }
+  const AllocatorMap& GetAllocatorMap() { return allocators_; }

   void InitNaiveBestFitCPUAllocator() {
     allocators_[platform::CPUPlace()] =

@@ -672,10 +623,10 @@ class AllocatorFacadePrivate {
   }

   void WrapStreamSafeCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) {
-    const std::shared_ptr<Allocator>& underlying_allocator =
-        cuda_allocators_[p][stream];
-    cuda_allocators_[p][stream] = std::make_shared<StreamSafeCUDAAllocator>(
-        underlying_allocator, p, stream);
+    std::shared_ptr<Allocator>& allocator = cuda_allocators_[p][stream];
+    allocator = std::make_shared<StreamSafeCUDAAllocator>(
+        allocator, p, stream,
+        /* in_cuda_graph_capturing = */ !allow_free_idle_chunk_);
   }

   void WrapCUDARetryAllocator(platform::CUDAPlace p, gpuStream_t stream,

@@ -684,10 +635,19 @@ class AllocatorFacadePrivate {
         retry_time, 0,
         platform::errors::InvalidArgument(
             "Retry time should be larger than 0, but got %d", retry_time));
-    std::shared_ptr<Allocator> allocator = cuda_allocators_[p][stream];
+    std::shared_ptr<Allocator>& allocator = cuda_allocators_[p][stream];
     allocator = std::make_shared<RetryAllocator>(allocator, retry_time);
   }

+#ifdef PADDLE_WITH_CUDA
+  void WrapCUDAGraphAllocator() {
+    for (auto& item : allocators_) {
+      auto& allocator = item.second;
+      allocator = CUDAGraphAllocator::Create(allocator);
+    }
+  }
+#endif
+
   static void CheckCUDAAllocThreadSafe(const CUDAAllocatorMap& allocators) {
     for (auto& place_pair : allocators) {
       for (auto& stream_pair : place_pair.second) {

@@ -864,10 +824,6 @@ class AllocatorFacadePrivate {
   // a standalone CUDA allocator to support multi-stream GC in new executor
   CUDAAllocatorMap cuda_allocators_;
   std::shared_timed_mutex cuda_allocator_mutex_;
-#ifdef PADDLE_WITH_CUDA
-  std::unordered_map<CUDAGraphID, std::unique_ptr<AllocatorFacadePrivate>>
-      cuda_graph_allocator_map_;
-#endif
 #endif
   AllocatorStrategy strategy_;
   AllocatorMap allocators_;

@@ -886,8 +842,24 @@ AllocatorFacade::AllocatorFacade() : m_(new AllocatorFacadePrivate()) {}
 AllocatorFacade::~AllocatorFacade() {}

 AllocatorFacade& AllocatorFacade::Instance() {
-  static AllocatorFacade instance;
-  return instance;
+  static AllocatorFacade* instance = new AllocatorFacade;
+  return *instance;
+}
+
+AllocatorFacadePrivate* AllocatorFacade::GetPrivate() const {
+#ifdef PADDLE_WITH_CUDA
+  if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
+    auto id = platform::CUDAGraph::CapturingID();
+    auto iter = cuda_graph_map_.find(id);
+    PADDLE_ENFORCE_NE(
+        iter, cuda_graph_map_.end(),
+        platform::errors::PermissionDenied(
+            "No memory pool is prepared for CUDA Graph capturing."));
+    VLOG(10) << "Choose CUDA Graph memory pool";
+    return iter->second.get();
+  }
+#endif
+  return m_;
 }

 const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
@@ -895,19 +867,14 @@ const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
       FLAGS_use_system_allocator == false) {
-#ifdef PADDLE_WITH_CUDA
-    if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
-      return m_->GetAllocator(place,
-                              /* A non-zero num to choose allocator_ */ 1);
-    }
-#endif
+    AllocatorFacadePrivate* m = GetPrivate();
     platform::CUDAPlace cuda_place(place.GetDeviceId());
-    return m_->GetAllocator(cuda_place, m_->GetDefaultStream(cuda_place));
+    return m->GetAllocator(cuda_place, m->GetDefaultStream(cuda_place));
   }
 #endif

-  return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
+  return GetPrivate()->GetAllocator(
+      place, /* A non-zero num to choose allocator_ */ 1);
 }

 void* AllocatorFacade::GetBasePtr(

@@ -922,7 +889,7 @@ void* AllocatorFacade::GetBasePtr(
           "GetBasePtr() is only implemented for CUDAPlace(), not "
           "suppot place: %s",
           allocation->place()));
-  return m_->GetBasePtr(allocation);
+  return GetPrivate()->GetBasePtr(allocation);
 }

 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)

@@ -930,21 +897,17 @@ const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
     const platform::Place& place, const gpuStream_t& stream) {
   if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
       FLAGS_use_system_allocator == false) {
-#ifdef PADDLE_WITH_CUDA
-    if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
-      return m_->GetAllocator(place,
-                              /* A non-zero num to choose allocator_ */ 1);
-    }
-#endif
-    return m_->GetAllocator(place, stream, /*create_if_not_found=*/true);
+    return GetPrivate()->GetAllocator(place, stream,
+                                      /*create_if_not_found=*/true);
   }
-  return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
+  return GetPrivate()->GetAllocator(
+      place, /* A non-zero num to choose allocator_ */ 1);
 }
 #endif

 const std::shared_ptr<Allocator>& AllocatorFacade::GetZeroAllocator(
     const platform::Place& place) {
-  return m_->GetAllocator(place, /* zero size */ 0);
+  return GetPrivate()->GetAllocator(place, /* zero size */ 0);
 }

 std::shared_ptr<phi::Allocation> AllocatorFacade::AllocShared(

@@ -957,43 +920,30 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
       size > 0 && FLAGS_use_system_allocator == false) {
-#ifdef PADDLE_WITH_CUDA
-    if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
-      return m_->GetAllocator(place, size)->Allocate(size);
-    }
-#endif
     platform::CUDAPlace cuda_place(place.GetDeviceId());
-    return Alloc(cuda_place, size, m_->GetDefaultStream(cuda_place));
+    phi::Stream default_stream = phi::Stream(reinterpret_cast<phi::StreamId>(
+        GetPrivate()->GetDefaultStream(cuda_place)));
+    return Alloc(cuda_place, size, default_stream);
   }
 #endif
-  return m_->GetAllocator(place, size)->Allocate(size);
+  return GetPrivate()->GetAllocator(place, size)->Allocate(size);
 }

 uint64_t AllocatorFacade::Release(const platform::Place& place) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
       FLAGS_use_system_allocator == false) {
-#ifdef PADDLE_WITH_CUDA
-    if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
-      return m_
-          ->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1)
-          ->Release(place);
-    }
-#endif
     platform::CUDAPlace cuda_place(place.GetDeviceId());
-    return Release(cuda_place, m_->GetDefaultStream(cuda_place));
+    return Release(cuda_place, GetPrivate()->GetDefaultStream(cuda_place));
   }
 #endif
-  return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1)
+  return GetPrivate()
+      ->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1)
       ->Release(place);
 }

 std::shared_ptr<phi::Allocation> AllocatorFacade::AllocShared(
     const platform::Place& place, size_t size, const phi::Stream& stream) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   PADDLE_ENFORCE_EQ(
       FLAGS_use_stream_safe_cuda_allocator, true,
       platform::errors::Unimplemented(

@@ -1001,71 +951,53 @@ std::shared_ptr<phi::Allocation> AllocatorFacade::AllocShared(
           "multi-stream 'AllocaShared' function. To enable it, you can enter"
           "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
           "terminal."));

-#ifdef PADDLE_WITH_CUDA
-  if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
-    PADDLE_THROW(platform::errors::Unavailable(
-        "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
-  }
-#endif
-  gpuStream_t s = reinterpret_cast<gpuStream_t>(stream.id());
-  return std::shared_ptr<phi::Allocation>(Alloc(place, size, s));
-#else
-  PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU."));
-#endif
+  return std::shared_ptr<phi::Allocation>(Alloc(place, size, stream));
 }

-bool AllocatorFacade::InSameStream(
-    const std::shared_ptr<phi::Allocation>& allocation,
-    const phi::Stream& stream) {
+AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size,
+                                     const phi::Stream& stream) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   PADDLE_ENFORCE_EQ(
       FLAGS_use_stream_safe_cuda_allocator, true,
       platform::errors::Unimplemented(
           "StreamSafeCUDAAllocator is disabled, you should not call this "
-          "multi-stream 'InSameStream' function. To enable it, you can enter"
+          "multi-stream 'Alloc' function. To enable it, you can enter"
           "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
           "terminal."));

-#ifdef PADDLE_WITH_CUDA
-  if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
-    PADDLE_THROW(platform::errors::Unavailable(
-        "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
-  }
-#endif
-  gpuStream_t s = reinterpret_cast<gpuStream_t>(stream.id());
-  return s == GetStream(allocation);
+  platform::CUDAPlace p(place.GetDeviceId());
+  if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) {
+    gpuStream_t s = reinterpret_cast<gpuStream_t>(stream.id());
+    return GetPrivate()
+        ->GetAllocator(p, s, /* create_if_not_found = */ true)
+        ->Allocate(size);
+  } else {
+    return GetPrivate()->GetAllocator(p, size)->Allocate(size);
+  }
 #else
   PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU."));
 #endif
 }

-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size,
-                                     const gpuStream_t& stream) {
-  PADDLE_ENFORCE_EQ(
-      FLAGS_use_stream_safe_cuda_allocator, true,
-      platform::errors::Unimplemented(
-          "StreamSafeCUDAAllocator is disabled, you should not call this "
-          "multi-stream 'Alloc' function. To enable it, you can enter"
-          "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
-          "terminal."));
-
-#ifdef PADDLE_WITH_CUDA
-  if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
-    PADDLE_THROW(platform::errors::Unavailable(
-        "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
-  }
-#endif
-  platform::CUDAPlace p(place.GetDeviceId());
-  if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) {
-    return m_->GetAllocator(p, stream, /* create_if_not_found = */ true)
-        ->Allocate(size);
-  } else {
-    return m_->GetAllocator(p, size)->Allocate(size);
-  }
-}
+bool AllocatorFacade::InSameStream(
+    const std::shared_ptr<phi::Allocation>& allocation,
+    const phi::Stream& stream) {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  PADDLE_ENFORCE_EQ(
+      FLAGS_use_stream_safe_cuda_allocator, true,
+      platform::errors::Unimplemented(
+          "StreamSafeCUDAAllocator is disabled, you should not call this "
+          "multi-stream 'InSameStream' function. To enable it, you can enter"
+          "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
+          "terminal."));
+
+  gpuStream_t s = reinterpret_cast<gpuStream_t>(stream.id());
+  return s == GetStream(allocation);
+#else
+  PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU."));
+#endif
+}

+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place,
                                   const gpuStream_t& stream) {
   PADDLE_ENFORCE_EQ(

@@ -1075,15 +1007,7 @@ uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place,
           "multi-stream 'Release' function. To enable it, you can enter"
           "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
           "terminal."));

-#ifdef PADDLE_WITH_CUDA
-  if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
-    PADDLE_THROW(platform::errors::Unavailable(
-        "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
-  }
-#endif
-  return m_->GetAllocator(place, stream)->Release(place);
+  return GetPrivate()->GetAllocator(place, stream)->Release(place);
 }

 void AllocatorFacade::RecordStream(std::shared_ptr<phi::Allocation> allocation,

@@ -1095,15 +1019,7 @@ void AllocatorFacade::RecordStream(std::shared_ptr<phi::Allocation> allocation,
           "'RecordStream' function. To enable it, you can enter"
           "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
           "terminal."));

-#ifdef PADDLE_WITH_CUDA
-  if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
-    PADDLE_THROW(platform::errors::Unavailable(
-        "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
-  }
-#endif
-  m_->RecordStream(allocation, stream);
+  GetPrivate()->RecordStream(allocation, stream);
 }

 const gpuStream_t& AllocatorFacade::GetStream(

@@ -1115,24 +1031,34 @@ const gpuStream_t& AllocatorFacade::GetStream(
           "'GetStream' function. To enable it, you can enter"
           "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
           "terminal."));

-#ifdef PADDLE_WITH_CUDA
-  if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
-    PADDLE_THROW(platform::errors::Unavailable(
-        "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
-  }
-#endif
-  return m_->GetStream(allocation);
+  return GetPrivate()->GetStream(allocation);
 }

 #ifdef PADDLE_WITH_CUDA
 void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) {
-  return m_->PrepareMemoryPoolForCUDAGraph(id);
+  PADDLE_ENFORCE_EQ(GetAllocatorStrategy(), AllocatorStrategy::kAutoGrowth,
+                    platform::errors::InvalidArgument(
+                        "CUDA Graph is only supported when the "
+                        "FLAGS_allocator_strategy=\"auto_growth\", but got "
+                        "FLAGS_allocator_strategy=\"%s\"",
+                        FLAGS_allocator_strategy));
+  auto& allocator = cuda_graph_map_[id];
+  PADDLE_ENFORCE_EQ(
+      allocator.get(), nullptr,
+      platform::errors::InvalidArgument(
+          "The memory pool of the CUDA Graph with ID %d have been prepared.",
+          id));
+  allocator.reset(new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false));
+  VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id;
 }

 void AllocatorFacade::RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) {
-  return m_->RemoveMemoryPoolOfCUDAGraph(id);
+  auto iter = cuda_graph_map_.find(id);
+  PADDLE_ENFORCE_NE(iter, cuda_graph_map_.end(),
+                    platform::errors::InvalidArgument(
+                        "Cannot find CUDA Graph with ID = %d", id));
+  cuda_graph_map_.erase(iter);
+  VLOG(10) << "Remove memory pool of CUDA Graph with ID " << id;
 }
 #endif
 #endif
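The net effect of the allocator_facade.cc changes: the per-graph memory pools moved from AllocatorFacadePrivate into AllocatorFacade itself (the new cuda_graph_map_), and every public entry point now routes through GetPrivate(), which returns the pool of the CUDA Graph the current thread is capturing, if any. A stripped-down sketch of that dispatch (illustrative only; GraphID and Pool are hypothetical stand-ins for CUDAGraphID and AllocatorFacadePrivate):

#include <cstdint>
#include <memory>
#include <stdexcept>
#include <unordered_map>

using GraphID = uint64_t;                  // stand-in for CUDAGraphID
struct Pool { /* per-pool allocator state */ };

class Facade {
 public:
  void PrepareMemoryPool(GraphID id) {
    if (pools_.count(id) != 0) {
      throw std::invalid_argument("pool already prepared for this graph");
    }
    pools_[id] = std::make_unique<Pool>();
  }
  void RemoveMemoryPool(GraphID id) {
    if (pools_.erase(id) == 0) {
      throw std::invalid_argument("no pool for this graph id");
    }
  }
  // In the real code the "is this thread capturing" query comes from the
  // CUDA runtime; here it is passed in explicitly for illustration.
  Pool* GetPrivate(const GraphID* capturing_id) {
    if (capturing_id != nullptr) {
      return pools_.at(*capturing_id).get();  // graph-specific pool
    }
    return &default_pool_;                    // the usual allocators
  }

 private:
  Pool default_pool_;
  std::unordered_map<GraphID, std::unique_ptr<Pool>> pools_;
};

int main() {
  Facade facade;
  facade.PrepareMemoryPool(42);
  GraphID capturing = 42;
  Pool* during_capture = facade.GetPrivate(&capturing);  // pool for graph 42
  Pool* normal = facade.GetPrivate(nullptr);             // default pool
  facade.RemoveMemoryPool(42);
  return (during_capture != normal) ? 0 : 1;
}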
paddle/fluid/memory/allocation/allocator_facade.h

@@ -49,6 +49,8 @@ class AllocatorFacade {
   static AllocatorFacade& Instance();

+  AllocatorFacadePrivate* GetPrivate() const;
+
   const std::shared_ptr<Allocator>& GetAllocator(const platform::Place& place);

   void* GetBasePtr(const std::shared_ptr<Allocation>& allocation);

@@ -73,13 +75,14 @@ class AllocatorFacade {
                                                size_t size,
                                                const phi::Stream& stream);

+  AllocationPtr Alloc(const platform::Place& place, size_t size,
+                      const phi::Stream& stream);
+
   bool InSameStream(const std::shared_ptr<Allocation>& allocation,
                     const phi::Stream& stream);

 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   // TODO(zhiqiu): change gpuStream_t to phi::Stream if needed.
-  AllocationPtr Alloc(const platform::Place& place, size_t size,
-                      const gpuStream_t& stream);
   uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream);
   void RecordStream(std::shared_ptr<Allocation> allocation,
                     const gpuStream_t& stream);

@@ -96,6 +99,10 @@ class AllocatorFacade {
  private:
   AllocatorFacade();
   AllocatorFacadePrivate* m_;
+#ifdef PADDLE_WITH_CUDA
+  std::unordered_map<CUDAGraphID, std::unique_ptr<AllocatorFacadePrivate>>
+      cuda_graph_map_;
+#endif
 };

 }  // namespace allocation
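The reinterpret_cast between the opaque gpuStream_t handle and phi::StreamId recurs throughout this diff (in the facade above and in the tests below). Stripped of the Paddle types, the round-trip looks like this; MyStream and MyStreamId are illustrative stand-ins for phi::Stream and phi::StreamId, and void* stands in for gpuStream_t:

#include <cstdint>

using MyStreamId = uint64_t;   // stand-in for phi::StreamId

class MyStream {               // stand-in for phi::Stream
 public:
  explicit MyStream(MyStreamId id) : id_(id) {}
  MyStreamId id() const { return id_; }

 private:
  MyStreamId id_;
};

int main() {
  void* raw_stream = nullptr;  // stand-in for a gpuStream_t handle
  // Wrap: gpuStream_t -> phi::Stream (as in the new Alloc/AllocShared calls).
  MyStream s(reinterpret_cast<MyStreamId>(raw_stream));
  // Unwrap: phi::Stream -> gpuStream_t (as done inside AllocatorFacade).
  void* back = reinterpret_cast<void*>(s.id());
  return back == raw_stream ? 0 : 1;
}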
paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc

@@ -15,56 +15,52 @@
 #include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h"
 #include "paddle/fluid/platform/profiler/event_tracing.h"

+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h"
+#endif
+
 namespace paddle {
 namespace memory {
 namespace allocation {

 StreamSafeCUDAAllocation::StreamSafeCUDAAllocation(
-    DecoratedAllocationPtr underlying_allocation, gpuStream_t owning_stream)
+    DecoratedAllocationPtr underlying_allocation, gpuStream_t owning_stream,
+    StreamSafeCUDAAllocator* allocator)
     : Allocation(underlying_allocation->ptr(),
                  underlying_allocation->base_ptr(),
                  underlying_allocation->size(),
                  underlying_allocation->place()),
       underlying_allocation_(std::move(underlying_allocation)),
-      owning_stream_(std::move(owning_stream)) {}
+      owning_stream_(std::move(owning_stream)),
+      allocator_(allocator->shared_from_this()) {}

 void StreamSafeCUDAAllocation::RecordStream(const gpuStream_t& stream) {
   VLOG(8) << "Try record stream " << stream << " for address " << ptr();
   if (stream == owning_stream_) {
     VLOG(9) << "Record the same stream of " << stream;
     return;
   }

   std::lock_guard<SpinLock> lock_guard(outstanding_event_map_lock_);
-  gpuEvent_t record_event;
-  auto it = outstanding_event_map_.find(stream);
-  if (it == outstanding_event_map_.end()) {
-    gpuEvent_t new_event;
-#ifdef PADDLE_WITH_CUDA
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        cudaEventCreateWithFlags(&new_event, cudaEventDisableTiming));
-#else
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        hipEventCreateWithFlags(&new_event, hipEventDisableTiming));
-#endif
-    outstanding_event_map_[stream] = new_event;
-    record_event = new_event;
-    VLOG(9) << "Create a new event " << new_event;
-  } else {
-    record_event = it->second;
-    VLOG(9) << "Reuse event " << record_event;
-  }
-
 #ifdef PADDLE_WITH_CUDA
-  PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(record_event, stream));
-#else
-  PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(record_event, stream));
+  if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
+    graph_capturing_stream_set_.insert(stream);
+    return;
+  }
 #endif
-  VLOG(8) << "Record event " << record_event << " to stream " << stream;
+
+  RecordStreamWithNoGraphCapturing(stream);
+  RecordGraphCapturingStreams();
 }

 bool StreamSafeCUDAAllocation::CanBeFreed() {
-  // NOTE(Ruibiao): This function will not execute concurrently,
-  // so outstanding_event_lock_ is not required here
+#ifdef PADDLE_WITH_CUDA
+  if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
+    return graph_capturing_stream_set_.empty() &&
+           outstanding_event_map_.empty();
+  }
+#endif
+
+  RecordGraphCapturingStreams();
+
   for (auto it = outstanding_event_map_.begin();
        it != outstanding_event_map_.end(); ++it) {
     gpuEvent_t& event = it->second;
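The rewritten RecordStream above splits into two paths: while this thread is capturing a CUDA Graph, the stream is only remembered in graph_capturing_stream_set_; the actual event creation and record happen later, through RecordGraphCapturingStreams(), once capture is no longer in progress. A compilable sketch of that defer-then-flush logic (illustrative stand-ins, not Paddle code):

#include <map>
#include <set>

using Stream = int;   // stand-in for gpuStream_t
struct Event {};      // stand-in for gpuEvent_t

class Allocation {
 public:
  // `capturing` stands in for platform::CUDAGraph::IsThisThreadCapturing().
  void RecordStream(Stream s, bool capturing) {
    if (capturing) {
      pending_.insert(s);  // defer: only remember the stream during capture
      return;
    }
    RecordNow(s);          // RecordStreamWithNoGraphCapturing analogue
    FlushPending();        // RecordGraphCapturingStreams analogue
  }
  bool CanBeFreed(bool capturing) const {
    if (capturing) return pending_.empty() && events_.empty();
    return true;  // the real code additionally queries each outstanding event
  }

 private:
  void RecordNow(Stream s) { events_[s] = Event{}; /* cudaEventRecord here */ }
  void FlushPending() {
    for (Stream s : pending_) RecordNow(s);
    pending_.clear();
  }
  std::set<Stream> pending_;        // graph_capturing_stream_set_ analogue
  std::map<Stream, Event> events_;  // outstanding_event_map_ analogue
};

int main() {
  Allocation a;
  a.RecordStream(1, /*capturing=*/true);   // deferred
  a.RecordStream(2, /*capturing=*/false);  // records 2, then flushes 1
  return a.CanBeFreed(/*capturing=*/true) ? 1 : 0;  // events outstanding
}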
@@ -98,21 +94,62 @@ const gpuStream_t& StreamSafeCUDAAllocation::GetOwningStream() const {
   return owning_stream_;
 }

+void StreamSafeCUDAAllocation::RecordGraphCapturingStreams() {
+  for (gpuStream_t stream : graph_capturing_stream_set_) {
+    RecordStreamWithNoGraphCapturing(stream);
+  }
+  graph_capturing_stream_set_.clear();
+}
+
+void StreamSafeCUDAAllocation::RecordStreamWithNoGraphCapturing(
+    const gpuStream_t& stream) {
+  gpuEvent_t record_event;
+  auto it = outstanding_event_map_.find(stream);
+  if (it == outstanding_event_map_.end()) {
+    gpuEvent_t new_event;
+#ifdef PADDLE_WITH_CUDA
+    PADDLE_ENFORCE_GPU_SUCCESS(
+        cudaEventCreateWithFlags(&new_event, cudaEventDisableTiming));
+#else
+    PADDLE_ENFORCE_GPU_SUCCESS(
+        hipEventCreateWithFlags(&new_event, hipEventDisableTiming));
+#endif
+    outstanding_event_map_[stream] = new_event;
+    record_event = new_event;
+    VLOG(9) << "Create a new event " << new_event;
+  } else {
+    record_event = it->second;
+    VLOG(9) << "Reuse event " << record_event;
+  }
+
+#ifdef PADDLE_WITH_CUDA
+  PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(record_event, stream));
+#else
+  PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(record_event, stream));
+#endif
+  VLOG(8) << "Record event " << record_event << " to stream " << stream;
+}
+
 StreamSafeCUDAAllocator::StreamSafeCUDAAllocator(
     std::shared_ptr<Allocator> underlying_allocator, platform::CUDAPlace place,
-    gpuStream_t default_stream)
+    gpuStream_t default_stream, bool in_cuda_graph_capturing)
     : underlying_allocator_(std::move(underlying_allocator)),
       place_(std::move(place)),
-      default_stream_(std::move(default_stream)) {
-  std::lock_guard<SpinLock> lock_guard(allocator_map_lock_);
-  allocator_map_[place].emplace_back(this);
+      default_stream_(std::move(default_stream)),
+      in_cuda_graph_capturing_(in_cuda_graph_capturing) {
+  if (LIKELY(!in_cuda_graph_capturing)) {
+    std::lock_guard<SpinLock> lock_guard(allocator_map_lock_);
+    allocator_map_[place].emplace_back(this);
+  }
 }

 StreamSafeCUDAAllocator::~StreamSafeCUDAAllocator() {
-  std::lock_guard<SpinLock> lock_guard(allocator_map_lock_);
-  std::vector<StreamSafeCUDAAllocator*>& allocators = allocator_map_[place_];
-  allocators.erase(std::remove(allocators.begin(), allocators.end(), this),
-                   allocators.end());
+  if (LIKELY(!in_cuda_graph_capturing_)) {
+    std::lock_guard<SpinLock> lock_guard(allocator_map_lock_);
+    std::vector<StreamSafeCUDAAllocator*>& allocators = allocator_map_[place_];
+    allocators.erase(std::remove(allocators.begin(), allocators.end(), this),
+                     allocators.end());
+  }
 }

 bool StreamSafeCUDAAllocator::IsAllocThreadSafe() const { return true; }

@@ -140,7 +177,7 @@ phi::Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) {
   }
   StreamSafeCUDAAllocation* allocation = new StreamSafeCUDAAllocation(
       static_unique_ptr_cast<Allocation>(std::move(underlying_allocation)),
-      default_stream_);
+      default_stream_, this);
   VLOG(8) << "Allocate " << allocation->size() << " bytes at address "
           << allocation->ptr();
   return allocation;

@@ -157,22 +194,27 @@ void StreamSafeCUDAAllocator::FreeImpl(phi::Allocation* allocation) {
           "StreamSafeCUDAAllocation*", allocation));
   VLOG(8) << "Try free allocation " << stream_safe_cuda_allocation->ptr();
+
+  std::lock_guard<SpinLock> lock_guard(unfreed_allocation_lock_);
   if (stream_safe_cuda_allocation->CanBeFreed()) {
     VLOG(9) << "Directly delete allocation";
     delete stream_safe_cuda_allocation;
   } else {
     VLOG(9) << "Put into unfreed_allocation list";
-    std::lock_guard<SpinLock> lock_guard(unfreed_allocation_lock_);
     unfreed_allocations_.emplace_back(stream_safe_cuda_allocation);
   }
 }

 uint64_t StreamSafeCUDAAllocator::ReleaseImpl(const platform::Place& place) {
+  if (UNLIKELY(in_cuda_graph_capturing_)) {
+    VLOG(7) << "Memory release forbidden in CUDA Graph Captruing";
+    return 0;
+  }
+
   std::lock_guard<SpinLock> lock_guard(allocator_map_lock_);
   std::vector<StreamSafeCUDAAllocator*>& allocators = allocator_map_[place];
   uint64_t released_size = 0;
   for (StreamSafeCUDAAllocator* allocator : allocators) {
-    released_size += allocator->ProcessUnfreedAllocationsWithRelease();
+    released_size += allocator->ProcessUnfreedAllocationsAndRelease();
   }
   VLOG(8) << "Release " << released_size << " bytes memory from all streams";
   return released_size;

@@ -191,7 +233,7 @@ void StreamSafeCUDAAllocator::ProcessUnfreedAllocations() {
   }
 }

-uint64_t StreamSafeCUDAAllocator::ProcessUnfreedAllocationsWithRelease() {
+uint64_t StreamSafeCUDAAllocator::ProcessUnfreedAllocationsAndRelease() {
   ProcessUnfreedAllocations();
   return underlying_allocator_->Release(place_);
 }
paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h

@@ -14,10 +14,9 @@
 #pragma once

 #include <deque>
 #include <list>
 #include <map>
-#include <mutex>
+#include <set>

 #include "paddle/fluid/memory/allocation/allocator.h"
 #include "paddle/fluid/memory/allocation/spin_lock.h"
 #include "paddle/fluid/platform/place.h"

@@ -32,27 +31,38 @@ namespace paddle {
 namespace memory {
 namespace allocation {

+class StreamSafeCUDAAllocator;
+
 class StreamSafeCUDAAllocation : public Allocation {
  public:
   StreamSafeCUDAAllocation(DecoratedAllocationPtr underlying_allocation,
-                           gpuStream_t owning_stream);
+                           gpuStream_t owning_stream,
+                           StreamSafeCUDAAllocator* allocator);
+
   void RecordStream(const gpuStream_t& stream);
   bool CanBeFreed();
   const gpuStream_t& GetOwningStream() const;

  private:
+  void RecordGraphCapturingStreams();
+  void RecordStreamWithNoGraphCapturing(const gpuStream_t& stream);
+
   DecoratedAllocationPtr underlying_allocation_;
+  std::set<gpuStream_t> graph_capturing_stream_set_;
   std::map<gpuStream_t, gpuEvent_t> outstanding_event_map_;
   gpuStream_t owning_stream_;
   SpinLock outstanding_event_map_lock_;
+  // To compatiable with CUDA Graph, hold the allocator shared_ptr so that
+  // Allocator will not deconstruct before Allocation
+  std::shared_ptr<Allocator> allocator_;
 };

-class StreamSafeCUDAAllocator : public Allocator {
+class StreamSafeCUDAAllocator
+    : public Allocator,
+      public std::enable_shared_from_this<StreamSafeCUDAAllocator> {
  public:
   StreamSafeCUDAAllocator(std::shared_ptr<Allocator> underlying_allocator,
-                          platform::CUDAPlace place,
-                          gpuStream_t default_stream);
+                          platform::CUDAPlace place, gpuStream_t default_stream,
+                          bool in_cuda_graph_capturing = false);
+
   ~StreamSafeCUDAAllocator();
   bool IsAllocThreadSafe() const override;

@@ -63,7 +73,7 @@ class StreamSafeCUDAAllocator : public Allocator {
  private:
   void ProcessUnfreedAllocations();
-  uint64_t ProcessUnfreedAllocationsWithRelease();
+  uint64_t ProcessUnfreedAllocationsAndRelease();

   static std::map<platform::Place, std::vector<StreamSafeCUDAAllocator*>>
       allocator_map_;

@@ -74,6 +84,8 @@ class StreamSafeCUDAAllocator : public Allocator {
   gpuStream_t default_stream_;
   std::list<StreamSafeCUDAAllocation*> unfreed_allocations_;
   SpinLock unfreed_allocation_lock_;
+
+  bool in_cuda_graph_capturing_;
 };

 }  // namespace allocation
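The enable_shared_from_this base and the allocator_ member added above implement a keep-alive: each StreamSafeCUDAAllocation holds a shared_ptr back to the allocator that produced it, so the allocator cannot be destructed while any of its allocations is still outstanding (the comment in the diff ties this to CUDA Graph compatibility). The pattern in isolation (illustrative code, not Paddle's):

#include <iostream>
#include <memory>

class Allocator;  // forward declaration, as with StreamSafeCUDAAllocator

class Allocation {
 public:
  explicit Allocation(std::shared_ptr<Allocator> owner)
      : allocator_(std::move(owner)) {}

 private:
  std::shared_ptr<Allocator> allocator_;  // keeps the allocator alive
};

class Allocator : public std::enable_shared_from_this<Allocator> {
 public:
  ~Allocator() { std::cout << "allocator destroyed\n"; }
  Allocation Allocate() {
    // Valid only if *this is already owned by some shared_ptr, which is why
    // such allocators must be created via std::make_shared.
    return Allocation(shared_from_this());
  }
};

int main() {
  auto allocation = [] {
    auto allocator = std::make_shared<Allocator>();
    return allocator->Allocate();
  }();
  // The local shared_ptr is gone, yet the allocator survives because
  // `allocation` still references it; it is destroyed only at program end.
  std::cout << "allocation still alive\n";
  return 0;
}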
paddle/fluid/memory/malloc.cc

@@ -41,6 +41,11 @@ std::shared_ptr<Allocation> AllocShared(const platform::Place& place,
                                                      stream);
 }

+AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size,
+                    const phi::Stream& stream) {
+  return allocation::AllocatorFacade::Instance().Alloc(place, size, stream);
+}
+
 bool InSameStream(const std::shared_ptr<Allocation>& allocation,
                   const phi::Stream& stream) {
   return allocation::AllocatorFacade::Instance().InSameStream(allocation,

@@ -52,11 +57,6 @@ void* GetBasePtr(const std::shared_ptr<Allocation>& allocation) {
 }

 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size,
-                    const gpuStream_t& stream) {
-  return allocation::AllocatorFacade::Instance().Alloc(place, size, stream);
-}
-
 uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream) {
   return allocation::AllocatorFacade::Instance().Release(place, stream);
 }
paddle/fluid/memory/malloc.h

@@ -41,15 +41,15 @@ extern std::shared_ptr<Allocation> AllocShared(const platform::Place& place,
                                                size_t size,
                                                const phi::Stream& stream);

+extern AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size,
+                           const phi::Stream& stream);
+
 extern bool InSameStream(const std::shared_ptr<Allocation>& allocation,
                          const phi::Stream& stream);

 extern void* GetBasePtr(const std::shared_ptr<Allocation>& allocation);

 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-extern AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size,
-                           const gpuStream_t& stream);
-
 extern uint64_t Release(const platform::CUDAPlace& place,
                         const gpuStream_t& stream);
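For orientation, the reshaped public API is exercised like this in the test file below (a usage sketch that only compiles inside a Paddle build; `stream` is assumed to be a gpuStream_t created beforehand):

#include "paddle/fluid/memory/malloc.h"
#include "paddle/phi/core/stream.h"

void Demo(const paddle::platform::CUDAPlace& place, gpuStream_t stream,
          size_t size) {
  using namespace paddle::memory;
  // Wrap the raw handle once; both allocation paths now take phi::Stream.
  phi::Stream s(reinterpret_cast<phi::StreamId>(stream));
  auto shared_allocation = AllocShared(place, size, s);  // shared ownership
  auto unique_allocation = Alloc(place, size, s);        // unique ownership
  // Release still takes the raw gpuStream_t, per the #if block above.
  Release(place, stream);
}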
paddle/fluid/memory/stream_safe_cuda_alloc_test.cu

@@ -12,34 +12,35 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#ifdef PADDLE_WITH_CUDA
-#include <cuda.h>
-#include <cuda_runtime.h>
-#endif
-
-#ifdef PADDLE_WITH_HIP
-#include <hip/hip_runtime.h>
-#endif
-
 #include <thread>  // NOLINT
 #include <vector>

 #include "gtest/gtest.h"
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/fluid/memory/malloc.h"
-#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
+#include "paddle/fluid/memory/memory.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/phi/core/stream.h"

+#ifdef PADDLE_WITH_CUDA
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
+#endif
+
+#ifdef PADDLE_WITH_HIP
+#include <hip/hip_runtime.h>
+#endif
+
 namespace paddle {
 namespace memory {

-__global__ void add_kernel(int* x, int n) {
+// y += (x + 1)
+__global__ void add_kernel(int* x, int* y, int n) {
   int thread_num = gridDim.x * blockDim.x;
   int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
   for (int i = thread_id; i < n; i += thread_num) {
-    atomicAdd(x + i, thread_id);
+    y[i] += x[i] + 1;
   }
 }
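The kernel change above replaces the old atomicAdd checksum with a plain accumulation, y[i] += x[i] + 1. With x zero-initialized, each launch adds exactly 1 to every y[i]; the rewritten tests below launch it twice per result buffer (hence CheckResult() expecting 2) and once per CUDA Graph replay (hence the replay_times expectation). A CPU reference of the same update, for clarity:

#include <vector>

// Host-side reference for add_kernel: y[i] += x[i] + 1.
void add_reference(const std::vector<int>& x, std::vector<int>& y) {
  for (size_t i = 0; i < y.size(); ++i) {
    y[i] += x[i] + 1;
  }
}

int main() {
  std::vector<int> x(4, 0), y(4, 0);
  add_reference(x, y);  // y == {1, 1, 1, 1}
  add_reference(x, y);  // y == {2, 2, 2, 2}, matching EXPECT_EQ(result[j], 2)
  return 0;
}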
@@ -51,153 +52,6 @@ void CheckMemLeak(const platform::CUDAPlace &place) {
       << " there may be a memory leak problem";
 }

-class StreamSafeCUDAAllocTest : public ::testing::Test {
- protected:
-  void SetUp() override {
-    place_ = platform::CUDAPlace();
-    stream_num_ = 64;
-    grid_num_ = 1;
-    block_num_ = 32;
-    data_num_ = 131072;
-    workspace_size_ = data_num_ * sizeof(int);
-
-    // alloc workspace for each stream
-    for (size_t i = 0; i < stream_num_; ++i) {
-      gpuStream_t stream;
-#ifdef PADDLE_WITH_CUDA
-      PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&stream));
-#else
-      PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&stream));
-#endif
-
-      std::shared_ptr<Allocation> allocation = AllocShared(
-          place_, workspace_size_,
-          phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
-#ifdef PADDLE_WITH_CUDA
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          cudaMemset(allocation->ptr(), 0, allocation->size()));
-#else
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          hipMemset(allocation->ptr(), 0, allocation->size()));
-#endif
-
-      streams_.emplace_back(stream);
-      workspaces_.emplace_back(allocation);
-    }
-
-    result_ = Alloc(place_, stream_num_ * workspace_size_);
-  }
-
-  void SingleStreamRun(size_t idx) {
-    // for all stream i,
-    // stream idx lauch a kernel to add (j % thread_num) to workspaces_[i][j]
-    for (size_t i = 0; i < stream_num_; ++i) {
-      int *x = reinterpret_cast<int *>(workspaces_[i]->ptr());
-      add_kernel<<<grid_num_, block_num_, 0, streams_[idx]>>>(x, data_num_);
-      RecordStream(workspaces_[i], streams_[idx]);
-    }
-  }
-
-  void CopyResultAsync() {
-    for (size_t i = 0; i < stream_num_; ++i) {
-#ifdef PADDLE_WITH_CUDA
-      PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(
-          reinterpret_cast<int *>(result_->ptr()) + i * data_num_,
-          workspaces_[i]->ptr(), workspace_size_, cudaMemcpyDeviceToDevice));
-#else
-      PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(
-          reinterpret_cast<int *>(result_->ptr()) + i * data_num_,
-          workspaces_[i]->ptr(), workspace_size_, hipMemcpyDeviceToDevice));
-#endif
-    }
-  }
-
-  void MultiStreamRun() {
-    for (size_t i = 0; i < stream_num_; ++i) {
-      SingleStreamRun(i);
-    }
-    CopyResultAsync();
-    workspaces_.clear();  // fast_gc
-    cudaDeviceSynchronize();
-  }
-
-  void MultiThreadMUltiStreamRun() {
-    std::vector<std::thread> threads;
-    for (size_t i = 0; i < stream_num_; ++i) {
-      threads.push_back(
-          std::thread(&StreamSafeCUDAAllocTest::SingleStreamRun, this, i));
-    }
-    for (size_t i = 0; i < stream_num_; ++i) {
-      threads[i].join();
-    }
-    CopyResultAsync();
-    workspaces_.clear();  // fast_gc
-    cudaDeviceSynchronize();
-  }
-
-  void CheckResult() {
-    auto result_host = std::unique_ptr<int[]>(new int[result_->size()]);
-#ifdef PADDLE_WITH_CUDA
-    PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpy(result_host.get(), result_->ptr(),
-                                          result_->size(),
-                                          cudaMemcpyDeviceToHost));
-#else
-    PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpy(result_host.get(), result_->ptr(),
-                                         result_->size(),
-                                         hipMemcpyDeviceToHost));
-#endif
-    size_t thread_num = grid_num_ * block_num_;
-    for (size_t i = 0; i < stream_num_; ++i) {
-      for (size_t j = 0; j < data_num_; ++j) {
-        EXPECT_TRUE(result_host[i * stream_num_ + j] ==
-                    (j % thread_num) * stream_num_);
-      }
-    }
-    result_.reset();
-  }
-
-  void TearDown() override {
-#ifdef PADDLE_WITH_CUDA
-    PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize());
-#else
-    PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize());
-#endif
-    for (gpuStream_t stream : streams_) {
-      Release(place_, stream);
-    }
-
-    for (size_t i = 1; i < stream_num_; ++i) {
-#ifdef PADDLE_WITH_CUDA
-      PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(streams_[i]));
-#else
-      PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(streams_[i]));
-#endif
-    }
-
-    CheckMemLeak(place_);
-  }
-
-  size_t stream_num_;
-  size_t grid_num_;
-  size_t block_num_;
-  size_t data_num_;
-  size_t workspace_size_;
-  platform::CUDAPlace place_;
-  std::vector<gpuStream_t> streams_;
-  std::vector<std::shared_ptr<Allocation>> workspaces_;
-  allocation::AllocationPtr result_;
-};
-
-TEST_F(StreamSafeCUDAAllocTest, CUDAMutilStreamTest) {
-  MultiStreamRun();
-  CheckResult();
-}
-
-TEST_F(StreamSafeCUDAAllocTest, CUDAMutilThreadMutilStreamTest) {
-  MultiThreadMUltiStreamRun();
-  CheckResult();
-}
-
 TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) {
   platform::CUDAPlace place = platform::CUDAPlace();
   size_t alloc_size = 256;
@@ -214,7 +68,8 @@ TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) {
           paddle::platform::DeviceContextPool::Instance().Get(place))
           ->stream();
   allocation::AllocationPtr allocation_unique =
-      Alloc(place, alloc_size, default_stream);
+      Alloc(place, alloc_size,
+            phi::Stream(reinterpret_cast<phi::StreamId>(default_stream)));
   EXPECT_GE(allocation_unique->size(), alloc_size);
   EXPECT_EQ(allocation_unique->ptr(), address);
   allocation_unique.reset();

@@ -303,36 +158,6 @@ TEST(StreamSafeCUDAAllocInterfaceTest, GetStreamInterfaceTest) {
   CheckMemLeak(place);
 }

-#ifdef PADDLE_WITH_CUDA
-TEST(StreamSafeCUDAAllocInterfaceTest, CUDAGraphExceptionTest) {
-  platform::CUDAPlace place = platform::CUDAPlace();
-  size_t alloc_size = 1;
-  std::shared_ptr<Allocation> allocation = AllocShared(place, alloc_size);
-
-  platform::BeginCUDAGraphCapture(place, cudaStreamCaptureModeGlobal);
-  EXPECT_THROW(AllocShared(place, alloc_size), paddle::platform::EnforceNotMet);
-  EXPECT_THROW(Alloc(place, alloc_size), paddle::platform::EnforceNotMet);
-  EXPECT_THROW(Release(place), paddle::platform::EnforceNotMet);
-  EXPECT_THROW(allocation::AllocatorFacade::Instance().GetAllocator(place),
-               paddle::platform::EnforceNotMet);
-  EXPECT_THROW(
-      AllocShared(place, alloc_size,
-                  phi::Stream(reinterpret_cast<phi::StreamId>(nullptr))),
-      paddle::platform::EnforceNotMet);
-  EXPECT_THROW(Alloc(place, alloc_size, nullptr),
-               paddle::platform::EnforceNotMet);
-  EXPECT_THROW(Release(place, nullptr), paddle::platform::EnforceNotMet);
-  EXPECT_THROW(RecordStream(allocation, nullptr),
-               paddle::platform::EnforceNotMet);
-  EXPECT_THROW(GetStream(allocation), paddle::platform::EnforceNotMet);
-  platform::EndCUDAGraphCapture();
-
-  allocation.reset();
-  Release(place);
-  CheckMemLeak(place);
-}
-#endif
-
 TEST(StreamSafeCUDAAllocRetryTest, RetryTest) {
   platform::CUDAPlace place = platform::CUDAPlace();
   gpuStream_t stream1, stream2;

@@ -348,12 +173,14 @@ TEST(StreamSafeCUDAAllocRetryTest, RetryTest) {
   // so the second alloc will fail and retry
   size_t alloc_size = available_size / 4 * 3;

-  allocation::AllocationPtr allocation1 = Alloc(place, alloc_size, stream1);
+  allocation::AllocationPtr allocation1 = Alloc(
+      place, alloc_size, phi::Stream(reinterpret_cast<phi::StreamId>(stream1)));
   allocation::AllocationPtr allocation2;

   std::thread th([&allocation2, &place, &stream2, alloc_size]() {
     std::this_thread::sleep_for(std::chrono::seconds(1));
-    allocation2 = Alloc(place, alloc_size, stream2);
+    allocation2 = Alloc(
+        place, alloc_size,
+        phi::Stream(reinterpret_cast<phi::StreamId>(stream2)));
   });
   allocation1.reset();  // free but not release
   th.join();
@@ -371,5 +198,201 @@ TEST(StreamSafeCUDAAllocRetryTest, RetryTest) {
   CheckMemLeak(place);
 }

+class StreamSafeCUDAAllocTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    place_ = platform::CUDAPlace();
+    stream_num_ = 64;
+    grid_num_ = 1;
+    block_num_ = 32;
+    data_num_ = 131072;
+    workspace_size_ = data_num_ * sizeof(int);
+
+    for (size_t i = 0; i < stream_num_; ++i) {
+      gpuStream_t stream;
+#ifdef PADDLE_WITH_CUDA
+      PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&stream));
+#else
+      PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&stream));
+#endif
+
+      std::shared_ptr<phi::Allocation> workspace_allocation = AllocShared(
+          place_, workspace_size_,
+          phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
+      std::shared_ptr<phi::Allocation> result_allocation = AllocShared(
+          place_, workspace_size_,
+          phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
+      std::shared_ptr<phi::Allocation> host_result_allocation =
+          AllocShared(platform::CPUPlace(), workspace_size_);
+
+#ifdef PADDLE_WITH_CUDA
+      PADDLE_ENFORCE_GPU_SUCCESS(cudaMemset(workspace_allocation->ptr(), 0,
+                                            workspace_allocation->size()));
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          cudaMemset(result_allocation->ptr(), 0, result_allocation->size()));
+#else
+      PADDLE_ENFORCE_GPU_SUCCESS(hipMemset(workspace_allocation->ptr(), 0,
+                                           workspace_allocation->size()));
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          hipMemset(result_allocation->ptr(), 0, result_allocation->size()));
+#endif
+
+      streams_.emplace_back(stream);
+      workspaces_.emplace_back(workspace_allocation);
+      results_.emplace_back(result_allocation);
+      host_results_.emplace_back(host_result_allocation);
+    }
+  }
+
+  void SingleStreamRun(size_t idx) {
+    int* y = reinterpret_cast<int*>(results_[idx]->ptr());
+    int neighbouring_idx = idx > 0 ? idx - 1 : idx;
+
+    add_kernel<<<grid_num_, block_num_, 0, streams_[idx]>>>(
+        reinterpret_cast<int*>(workspaces_[idx]->ptr()), y, data_num_);
+    add_kernel<<<grid_num_, block_num_, 0, streams_[idx]>>>(
+        reinterpret_cast<int*>(workspaces_[neighbouring_idx]->ptr()), y,
+        data_num_);
+    RecordStream(workspaces_[neighbouring_idx], streams_[idx]);
+  }
+
+  void MultiStreamRun() {
+    // Must run in reverse order, or the workspace_[i - 1] will be released
+    // before streams_[i]'s kernel launch
+    for (int i = stream_num_ - 1; i >= 0; --i) {
+      SingleStreamRun(i);
+      workspaces_[i].reset();  // fast GC
+    }
+  }
+
+  void MultiThreadMultiStreamRun() {
+    std::vector<std::thread> threads;
+    for (size_t i = 0; i < stream_num_; ++i) {
+      threads.push_back(
+          std::thread(&StreamSafeCUDAAllocTest::SingleStreamRun, this, i));
+    }
+    for (size_t i = 0; i < stream_num_; ++i) {
+      threads[i].join();
+    }
+    workspaces_.clear();
+  }
+
+  void CUDAGraphRun() {
+    testing_cuda_graph_ = true;
+    platform::BeginCUDAGraphCapture(platform::CUDAPlace(),
+                                    cudaStreamCaptureModeGlobal);
+
+    std::shared_ptr<Allocation> data_allocation =
+        AllocShared(platform::CUDAPlace(), workspace_size_);
+    std::shared_ptr<Allocation> result_allocation =
+        AllocShared(platform::CUDAPlace(), workspace_size_);
+
+    int* data = static_cast<int*>(data_allocation->ptr());
+    int* result = static_cast<int*>(result_allocation->ptr());
+
+    gpuStream_t main_stream = GetStream(data_allocation);
+    gpuStream_t other_stream;
+    PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&other_stream));
+
+    add_kernel<<<grid_num_, block_num_, 0, main_stream>>>(data, result,
+                                                          data_num_);
+    RecordStream(data_allocation, other_stream);
+
+    std::unique_ptr<platform::CUDAGraph> cuda_graph =
+        platform::EndCUDAGraphCapture();
+
+    int replay_times = 10;
+    for (int i = 0; i < replay_times; ++i) {
+      cuda_graph->Replay();
+    }
+
+    std::shared_ptr<Allocation> host_result_allocation =
+        AllocShared(platform::CPUPlace(), workspace_size_);
+    Copy(host_result_allocation->place(), host_result_allocation->ptr(),
+         result_allocation->place(), result_allocation->ptr(),
+         workspace_size_, main_stream);
+    cudaStreamSynchronize(main_stream);
+
+    int* host_result = static_cast<int*>(host_result_allocation->ptr());
+    for (int i = 0; i < data_num_; ++i) {
+      EXPECT_EQ(host_result[i], replay_times);
+    }
+
+    data_allocation.reset();
+    result_allocation.reset();
+    cuda_graph.release();
+    PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(other_stream));
+  }
+
+  void CheckResult() {
+    for (size_t i = 0; i < stream_num_; ++i) {
+      Copy(host_results_[i]->place(), host_results_[i]->ptr(),
+           results_[i]->place(), results_[i]->ptr(), workspace_size_,
+           streams_[i]);
+    }
+    cudaDeviceSynchronize();
+
+    size_t thread_num = grid_num_ * block_num_;
+    for (size_t i = 0; i < stream_num_; ++i) {
+      int* result = static_cast<int*>(host_results_[i]->ptr());
+      for (size_t j = 0; j < data_num_; ++j) {
+        EXPECT_EQ(result[j], 2);
+      }
+    }
+  }
+
+  void TearDown() override {
+    workspaces_.clear();
+    results_.clear();
+    host_results_.clear();
+    for (gpuStream_t stream : streams_) {
+      Release(place_, stream);
+    }
+
+    for (size_t i = 0; i < stream_num_; ++i) {
+#ifdef PADDLE_WITH_CUDA
+      PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(streams_[i]));
+#else
+      PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(streams_[i]));
+#endif
+    }
+
+    // Memory release for CUDA Graph memory pool is forbidden
+    if (!testing_cuda_graph_) {
+      CheckMemLeak(place_);
+    }
+  }
+
+  bool testing_cuda_graph_{0};
+  size_t stream_num_;
+  size_t grid_num_;
+  size_t block_num_;
+  size_t data_num_;
+  size_t workspace_size_;
+  platform::CUDAPlace place_;
+  std::vector<gpuStream_t> streams_;
+  std::vector<std::shared_ptr<phi::Allocation>> workspaces_;
+  std::vector<std::shared_ptr<phi::Allocation>> results_;
+  std::vector<std::shared_ptr<phi::Allocation>> host_results_;
+};
+
+TEST_F(StreamSafeCUDAAllocTest, CUDAMutilStreamTest) {
+  MultiStreamRun();
+  CheckResult();
+}
+
+TEST_F(StreamSafeCUDAAllocTest, CUDAMutilThreadMutilStreamTest) {
+  MultiThreadMultiStreamRun();
+  CheckResult();
+}
+
+#ifdef PADDLE_WITH_CUDA
+TEST_F(StreamSafeCUDAAllocTest, CUDAGraphTest) {
+  MultiStreamRun();
+  CUDAGraphRun();
+  CheckResult();
+}
+#endif
+
 }  // namespace memory
 }  // namespace paddle
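For context on the CUDAGraphRun() test above: platform::BeginCUDAGraphCapture, platform::EndCUDAGraphCapture, and CUDAGraph::Replay are Paddle wrappers. Against the raw CUDA runtime API, the same capture-instantiate-replay cycle looks roughly like this (a sketch using the CUDA 11-era five-argument cudaGraphInstantiate; error handling reduced to a macro):

#include <cstdio>
#include <cuda_runtime.h>

#define CUDA_CHECK(expr)                                          \
  do {                                                            \
    cudaError_t err = (expr);                                     \
    if (err != cudaSuccess) {                                     \
      std::printf("CUDA error: %s\n", cudaGetErrorString(err));   \
      return 1;                                                   \
    }                                                             \
  } while (0)

__global__ void inc(int* p) { ++(*p); }

int main() {
  int* d = nullptr;
  CUDA_CHECK(cudaMalloc(&d, sizeof(int)));
  CUDA_CHECK(cudaMemset(d, 0, sizeof(int)));

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  // Capture: work launched on the stream is recorded, not executed.
  cudaGraph_t graph;
  CUDA_CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
  inc<<<1, 1, 0, stream>>>(d);
  CUDA_CHECK(cudaStreamEndCapture(stream, &graph));

  // Instantiate once, replay many times.
  cudaGraphExec_t exec;
  CUDA_CHECK(cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0));
  const int replay_times = 10;
  for (int i = 0; i < replay_times; ++i) {
    CUDA_CHECK(cudaGraphLaunch(exec, stream));
  }
  CUDA_CHECK(cudaStreamSynchronize(stream));

  int h = 0;
  CUDA_CHECK(cudaMemcpy(&h, d, sizeof(int), cudaMemcpyDeviceToHost));
  std::printf("value after %d replays: %d\n", replay_times, h);  // prints 10
  return 0;
}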