Paddle (fork of PaddlePaddle / Paddle)
Commit c9beb885
Authored Mar 03, 2022 by phlrain

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into move_yolo_box_to_phi

Parents: 25fe41d0 4c0511fa
Showing 8 changed files with 436 additions and 426 deletions (+436 −426)
paddle/fluid/memory/allocation/CMakeLists.txt                 +1   −1
paddle/fluid/memory/allocation/allocator_facade.cc            +101 −175
paddle/fluid/memory/allocation/allocator_facade.h             +9   −2
paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc  +81  −39
paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h   +20  −8
paddle/fluid/memory/malloc.cc                                 +5   −5
paddle/fluid/memory/malloc.h                                  +3   −3
paddle/fluid/memory/stream_safe_cuda_alloc_test.cu            +216 −193
paddle/fluid/memory/allocation/CMakeLists.txt

@@ -17,7 +17,7 @@ if (WITH_GPU)
     nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
     nv_library(cuda_managed_allocator SRCS cuda_managed_allocator.cc DEPS allocator cuda_device_guard gpu_info)
     nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
-    nv_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator)
+    nv_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator cuda_graph)
     nv_library(thread_local_allocator SRCS thread_local_allocator.cc DEPS allocator)
     cc_test(thread_local_allocator_test SRCS thread_local_allocator_test.cc DEPS thread_local_allocator)
paddle/fluid/memory/allocation/allocator_facade.cc
@@ -210,12 +210,7 @@ class AllocatorFacadePrivate {
     InitNaiveBestFitCPUAllocator();
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     allow_free_idle_chunk_ = allow_free_idle_chunk;
-    if (FLAGS_use_stream_safe_cuda_allocator) {
-      for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) {
-        InitStreamSafeCUDAAllocator(platform::CUDAPlace(dev_id), nullptr);
-      }
-    } else {
+    if (!FLAGS_use_stream_safe_cuda_allocator) {
       for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) {
         InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id),
@@ -298,6 +293,12 @@ class AllocatorFacadePrivate {
     }
     CheckAllocThreadSafe();
+#ifdef PADDLE_WITH_CUDA
+    if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
+      WrapCUDAGraphAllocator();
+    }
+#endif
   }

   inline const std::shared_ptr<Allocator>& GetAllocator(
@@ -388,39 +389,6 @@ class AllocatorFacadePrivate {
             allocation.get()));
     return stream_safe_cuda_allocation->GetOwningStream();
   }

-#ifdef PADDLE_WITH_CUDA
-  void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) {
-    PADDLE_ENFORCE_EQ(strategy_, AllocatorStrategy::kAutoGrowth,
-                      platform::errors::InvalidArgument(
-                          "CUDA Graph is only supported when the "
-                          "FLAGS_allocator_strategy=\"auto_growth\", but got "
-                          "FLAGS_allocator_strategy=\"%s\"",
-                          FLAGS_allocator_strategy));
-    auto& allocator = cuda_graph_allocator_map_[id];
-    PADDLE_ENFORCE_EQ(
-        allocator.get(), nullptr,
-        platform::errors::InvalidArgument(
-            "The memory pool of the CUDA Graph with ID %d have been prepared.",
-            id));
-    allocator.reset(
-        new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false));
-    for (auto& item : allocator->allocators_) {
-      auto& old_allocator = item.second;
-      old_allocator = CUDAGraphAllocator::Create(old_allocator);
-    }
-    VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id;
-  }
-
-  void RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) {
-    auto iter = cuda_graph_allocator_map_.find(id);
-    PADDLE_ENFORCE_NE(iter, cuda_graph_allocator_map_.end(),
-                      platform::errors::InvalidArgument(
-                          "Cannot find CUDA Graph with ID = %d", id));
-    cuda_graph_allocator_map_.erase(iter);
-    VLOG(10) << "Remove memory pool of CUDA Graph with ID " << id;
-  }
-#endif
 #endif

  private:
@@ -439,24 +407,7 @@ class AllocatorFacadePrivate {
     platform::Place place_;
   };

-  const AllocatorMap& GetAllocatorMap() {
-#ifdef PADDLE_WITH_CUDA
-    if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
-      auto id = platform::CUDAGraph::CapturingID();
-      auto iter = cuda_graph_allocator_map_.find(id);
-      PADDLE_ENFORCE_NE(
-          iter, cuda_graph_allocator_map_.end(),
-          platform::errors::PermissionDenied(
-              "No memory pool is prepared for CUDA Graph capturing."));
-      VLOG(10) << "Choose CUDA Graph memory pool to allocate memory";
-      return iter->second->allocators_;
-    } else {
-      return allocators_;
-    }
-#else
-    return allocators_;
-#endif
-  }
+  const AllocatorMap& GetAllocatorMap() { return allocators_; }

   void InitNaiveBestFitCPUAllocator() {
     allocators_[platform::CPUPlace()] =
@@ -672,10 +623,10 @@ class AllocatorFacadePrivate {
   }

   void WrapStreamSafeCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) {
-    const std::shared_ptr<Allocator>& underlying_allocator =
-        cuda_allocators_[p][stream];
-    cuda_allocators_[p][stream] = std::make_shared<StreamSafeCUDAAllocator>(
-        underlying_allocator, p, stream);
+    std::shared_ptr<Allocator>& allocator = cuda_allocators_[p][stream];
+    allocator = std::make_shared<StreamSafeCUDAAllocator>(
+        allocator, p, stream,
+        /* in_cuda_graph_capturing = */ !allow_free_idle_chunk_);
   }

   void WrapCUDARetryAllocator(platform::CUDAPlace p, gpuStream_t stream,
@@ -684,10 +635,19 @@ class AllocatorFacadePrivate {
         retry_time, 0,
         platform::errors::InvalidArgument(
             "Retry time should be larger than 0, but got %d", retry_time));
-    std::shared_ptr<Allocator> allocator = cuda_allocators_[p][stream];
+    std::shared_ptr<Allocator>& allocator = cuda_allocators_[p][stream];
     allocator = std::make_shared<RetryAllocator>(allocator, retry_time);
   }

+#ifdef PADDLE_WITH_CUDA
+  void WrapCUDAGraphAllocator() {
+    for (auto& item : allocators_) {
+      auto& allocator = item.second;
+      allocator = CUDAGraphAllocator::Create(allocator);
+    }
+  }
+#endif
+
   static void CheckCUDAAllocThreadSafe(const CUDAAllocatorMap& allocators) {
     for (auto& place_pair : allocators) {
       for (auto& stream_pair : place_pair.second) {
@@ -864,10 +824,6 @@ class AllocatorFacadePrivate {
   // a standalone CUDA allocator to support multi-stream GC in new executor
   CUDAAllocatorMap cuda_allocators_;
   std::shared_timed_mutex cuda_allocator_mutex_;
-#ifdef PADDLE_WITH_CUDA
-  std::unordered_map<CUDAGraphID, std::unique_ptr<AllocatorFacadePrivate>>
-      cuda_graph_allocator_map_;
-#endif
 #endif
   AllocatorStrategy strategy_;
   AllocatorMap allocators_;
@@ -886,8 +842,24 @@ AllocatorFacade::AllocatorFacade() : m_(new AllocatorFacadePrivate()) {}
 AllocatorFacade::~AllocatorFacade() {}

 AllocatorFacade& AllocatorFacade::Instance() {
-  static AllocatorFacade instance;
-  return instance;
+  static AllocatorFacade* instance = new AllocatorFacade;
+  return *instance;
 }

+AllocatorFacadePrivate* AllocatorFacade::GetPrivate() const {
+#ifdef PADDLE_WITH_CUDA
+  if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
+    auto id = platform::CUDAGraph::CapturingID();
+    auto iter = cuda_graph_map_.find(id);
+    PADDLE_ENFORCE_NE(
+        iter, cuda_graph_map_.end(),
+        platform::errors::PermissionDenied(
+            "No memory pool is prepared for CUDA Graph capturing."));
+    VLOG(10) << "Choose CUDA Graph memory pool";
+    return iter->second.get();
+  }
+#endif
+  return m_;
+}
+
 const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
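Note: two things change in the hunk above. Instance() now hands out a deliberately leaked heap object, a common way to sidestep static destruction-order problems at process exit, and the new GetPrivate() centralizes the choice of backing memory pool (normal pool vs. per-CUDA-Graph pool) that was previously repeated in every facade method. A minimal, self-contained sketch of both patterns, using stand-in types rather than Paddle's:

#include <cassert>

class Facade {
 public:
  static Facade& Instance() {
    // Leaked on purpose: never destroyed, so it stays valid for any
    // user that runs during static destruction at process exit.
    static Facade* instance = new Facade;
    return *instance;
  }

  // Redirects to an alternate state when a special mode is active,
  // mirroring how GetPrivate() picks a CUDA Graph pool while capturing.
  int* GetPrivate() { return special_mode_ ? &special_state_ : &state_; }

  bool special_mode_ = false;
  int state_ = 0;
  int special_state_ = 0;
};

int main() {
  Facade& f = Facade::Instance();
  *f.GetPrivate() = 1;  // writes state_
  f.special_mode_ = true;
  *f.GetPrivate() = 2;  // redirected, writes special_state_
  assert(f.state_ == 1 && f.special_state_ == 2);
  return 0;
}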
@@ -895,19 +867,14 @@ const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
       FLAGS_use_system_allocator == false) {
-#ifdef PADDLE_WITH_CUDA
-    if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
-      return m_->GetAllocator(place,
-                              /* A non-zero num to choose allocator_ */ 1);
-    }
-#endif
+    AllocatorFacadePrivate* m = GetPrivate();
     platform::CUDAPlace cuda_place(place.GetDeviceId());
-    return m_->GetAllocator(cuda_place, m_->GetDefaultStream(cuda_place));
+    return m->GetAllocator(cuda_place, m->GetDefaultStream(cuda_place));
   }
 #endif
-  return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
+  return GetPrivate()->GetAllocator(
+      place, /* A non-zero num to choose allocator_ */ 1);
 }

 void* AllocatorFacade::GetBasePtr(
@@ -922,7 +889,7 @@ void* AllocatorFacade::GetBasePtr(
           "GetBasePtr() is only implemented for CUDAPlace(), not "
           "suppot place: %s",
           allocation->place()));
-  return m_->GetBasePtr(allocation);
+  return GetPrivate()->GetBasePtr(allocation);
 }

 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
@@ -930,21 +897,17 @@ const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
     const platform::Place& place, const gpuStream_t& stream) {
   if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
       FLAGS_use_system_allocator == false) {
-#ifdef PADDLE_WITH_CUDA
-    if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
-      return m_->GetAllocator(place,
-                              /* A non-zero num to choose allocator_ */ 1);
-    }
-#endif
-    return m_->GetAllocator(place, stream, /*create_if_not_found=*/true);
+    return GetPrivate()->GetAllocator(place, stream,
+                                      /*create_if_not_found=*/true);
   }
-  return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
+  return GetPrivate()->GetAllocator(
+      place, /* A non-zero num to choose allocator_ */ 1);
 }
 #endif

 const std::shared_ptr<Allocator>& AllocatorFacade::GetZeroAllocator(
     const platform::Place& place) {
-  return m_->GetAllocator(place, /* zero size */ 0);
+  return GetPrivate()->GetAllocator(place, /* zero size */ 0);
 }

 std::shared_ptr<phi::Allocation> AllocatorFacade::AllocShared(
@@ -957,43 +920,30 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
       size > 0 && FLAGS_use_system_allocator == false) {
-#ifdef PADDLE_WITH_CUDA
-    if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
-      return m_->GetAllocator(place, size)->Allocate(size);
-    }
-#endif
     platform::CUDAPlace cuda_place(place.GetDeviceId());
-    return Alloc(cuda_place, size, m_->GetDefaultStream(cuda_place));
+    phi::Stream default_stream = phi::Stream(reinterpret_cast<phi::StreamId>(
+        GetPrivate()->GetDefaultStream(cuda_place)));
+    return Alloc(cuda_place, size, default_stream);
   }
 #endif
-  return m_->GetAllocator(place, size)->Allocate(size);
+  return GetPrivate()->GetAllocator(place, size)->Allocate(size);
 }

 uint64_t AllocatorFacade::Release(const platform::Place& place) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
       FLAGS_use_system_allocator == false) {
-#ifdef PADDLE_WITH_CUDA
-    if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
-      return m_
-          ->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1)
-          ->Release(place);
-    }
-#endif
     platform::CUDAPlace cuda_place(place.GetDeviceId());
-    return Release(cuda_place, m_->GetDefaultStream(cuda_place));
+    return Release(cuda_place, GetPrivate()->GetDefaultStream(cuda_place));
   }
 #endif
-  return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1)
+  return GetPrivate()
+      ->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1)
       ->Release(place);
 }

 std::shared_ptr<phi::Allocation> AllocatorFacade::AllocShared(
     const platform::Place& place, size_t size, const phi::Stream& stream) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   PADDLE_ENFORCE_EQ(
       FLAGS_use_stream_safe_cuda_allocator, true,
       platform::errors::Unimplemented(
@@ -1001,71 +951,53 @@ std::shared_ptr<phi::Allocation> AllocatorFacade::AllocShared(
           "multi-stream 'AllocaShared' function. To enable it, you can enter"
           "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
           "terminal."));
-#ifdef PADDLE_WITH_CUDA
-  if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
-    PADDLE_THROW(platform::errors::Unavailable(
-        "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
-  }
-#endif
-  gpuStream_t s = reinterpret_cast<gpuStream_t>(stream.id());
-  return std::shared_ptr<phi::Allocation>(Alloc(place, size, s));
 #else
   PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU."));
 #endif
+  return std::shared_ptr<phi::Allocation>(Alloc(place, size, stream));
 }

-bool AllocatorFacade::InSameStream(
-    const std::shared_ptr<phi::Allocation>& allocation,
-    const phi::Stream& stream) {
+AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size,
+                                     const phi::Stream& stream) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   PADDLE_ENFORCE_EQ(
       FLAGS_use_stream_safe_cuda_allocator, true,
       platform::errors::Unimplemented(
           "StreamSafeCUDAAllocator is disabled, you should not call this "
-          "multi-stream 'InSameStream' function. To enable it, you can enter"
+          "multi-stream 'Alloc' function. To enable it, you can enter"
           "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
           "terminal."));
-#ifdef PADDLE_WITH_CUDA
-  if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
-    PADDLE_THROW(platform::errors::Unavailable(
-        "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
-  }
-#endif
-  gpuStream_t s = reinterpret_cast<gpuStream_t>(stream.id());
-  return s == GetStream(allocation);
+  platform::CUDAPlace p(place.GetDeviceId());
+  if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) {
+    gpuStream_t s = reinterpret_cast<gpuStream_t>(stream.id());
+    return GetPrivate()
+        ->GetAllocator(p, s, /* create_if_not_found = */ true)
+        ->Allocate(size);
+  } else {
+    return GetPrivate()->GetAllocator(p, size)->Allocate(size);
+  }
 #else
   PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU."));
 #endif
 }

+bool AllocatorFacade::InSameStream(
+    const std::shared_ptr<phi::Allocation>& allocation,
+    const phi::Stream& stream) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size,
-                                     const gpuStream_t& stream) {
   PADDLE_ENFORCE_EQ(
       FLAGS_use_stream_safe_cuda_allocator, true,
       platform::errors::Unimplemented(
           "StreamSafeCUDAAllocator is disabled, you should not call this "
-          "multi-stream 'Alloc' function. To enable it, you can enter"
+          "multi-stream 'InSameStream' function. To enable it, you can enter"
           "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
           "terminal."));
 #ifdef PADDLE_WITH_CUDA
   if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
     PADDLE_THROW(platform::errors::Unavailable(
         "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
   }
 #endif
   gpuStream_t s = reinterpret_cast<gpuStream_t>(stream.id());
   return s == GetStream(allocation);
 #else
   PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU."));
 #endif
-  platform::CUDAPlace p(place.GetDeviceId());
-  if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) {
-    return m_->GetAllocator(p, stream, /* create_if_not_found = */ true)
-        ->Allocate(size);
-  } else {
-    return m_->GetAllocator(p, size)->Allocate(size);
-  }
 }

 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place,
                                   const gpuStream_t& stream) {
   PADDLE_ENFORCE_EQ(
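Note: after this hunk, every allocation request funnels through the phi::Stream overload of Alloc, which resolves an allocator per (place, stream) pair and can lazily create one on first use (create_if_not_found). A self-contained sketch of that per-stream lookup; the container and types are illustrative stand-ins, not Paddle's:

#include <map>
#include <memory>

using gpuStream_t = void*;  // opaque stream handle stand-in
struct Allocator { /* per-stream allocator state would live here */ };

class PerStreamAllocators {
 public:
  Allocator* Get(gpuStream_t stream, bool create_if_not_found) {
    auto it = allocators_.find(stream);
    if (it == allocators_.end()) {
      if (!create_if_not_found) return nullptr;
      // First request on this stream: build its allocator lazily.
      it = allocators_.emplace(stream, std::make_unique<Allocator>()).first;
    }
    return it->second.get();
  }

 private:
  std::map<gpuStream_t, std::unique_ptr<Allocator>> allocators_;
};

int main() {
  PerStreamAllocators allocators;
  gpuStream_t stream = nullptr;  // stand-in for a real stream
  Allocator* a = allocators.Get(stream, /*create_if_not_found=*/true);
  Allocator* b = allocators.Get(stream, /*create_if_not_found=*/false);
  return a == b ? 0 : 1;  // the same allocator is reused on the second lookup
}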
@@ -1075,15 +1007,7 @@ uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place,
           "multi-stream 'Release' function. To enable it, you can enter"
           "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
           "terminal."));
-#ifdef PADDLE_WITH_CUDA
-  if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
-    PADDLE_THROW(platform::errors::Unavailable(
-        "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
-  }
-#endif
-  return m_->GetAllocator(place, stream)->Release(place);
+  return GetPrivate()->GetAllocator(place, stream)->Release(place);
 }

 void AllocatorFacade::RecordStream(std::shared_ptr<phi::Allocation> allocation,
@@ -1095,15 +1019,7 @@ void AllocatorFacade::RecordStream(std::shared_ptr<phi::Allocation> allocation,
           "'RecordStream' function. To enable it, you can enter"
           "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
           "terminal."));
-#ifdef PADDLE_WITH_CUDA
-  if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
-    PADDLE_THROW(platform::errors::Unavailable(
-        "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
-  }
-#endif
-  m_->RecordStream(allocation, stream);
+  GetPrivate()->RecordStream(allocation, stream);
 }

 const gpuStream_t& AllocatorFacade::GetStream(
@@ -1115,24 +1031,34 @@ const gpuStream_t& AllocatorFacade::GetStream(
           "'GetStream' function. To enable it, you can enter"
           "'export FLAGS_use_stream_safe_cuda_allocator=true' in the "
           "terminal."));
-#ifdef PADDLE_WITH_CUDA
-  if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
-    PADDLE_THROW(platform::errors::Unavailable(
-        "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
-  }
-#endif
-  return m_->GetStream(allocation);
+  return GetPrivate()->GetStream(allocation);
 }

 #ifdef PADDLE_WITH_CUDA
 void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) {
-  return m_->PrepareMemoryPoolForCUDAGraph(id);
+  PADDLE_ENFORCE_EQ(GetAllocatorStrategy(), AllocatorStrategy::kAutoGrowth,
+                    platform::errors::InvalidArgument(
+                        "CUDA Graph is only supported when the "
+                        "FLAGS_allocator_strategy=\"auto_growth\", but got "
+                        "FLAGS_allocator_strategy=\"%s\"",
+                        FLAGS_allocator_strategy));
+  auto& allocator = cuda_graph_map_[id];
+  PADDLE_ENFORCE_EQ(
+      allocator.get(), nullptr,
+      platform::errors::InvalidArgument(
+          "The memory pool of the CUDA Graph with ID %d have been prepared.",
+          id));
+  allocator.reset(new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false));
+  VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id;
 }

 void AllocatorFacade::RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) {
-  return m_->RemoveMemoryPoolOfCUDAGraph(id);
+  auto iter = cuda_graph_map_.find(id);
+  PADDLE_ENFORCE_NE(iter, cuda_graph_map_.end(),
+                    platform::errors::InvalidArgument(
+                        "Cannot find CUDA Graph with ID = %d", id));
+  cuda_graph_map_.erase(iter);
+  VLOG(10) << "Remove memory pool of CUDA Graph with ID " << id;
 }
 #endif
 #endif
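Note: PrepareMemoryPoolForCUDAGraph / RemoveMemoryPoolOfCUDAGraph now live in the facade and manage one private AllocatorFacadePrivate per captured graph, keyed by CUDAGraphID in cuda_graph_map_. A self-contained sketch of that registry pattern with hypothetical types (not Paddle's):

#include <cstdint>
#include <memory>
#include <stdexcept>
#include <unordered_map>

using GraphID = uint64_t;
struct PrivatePool { /* per-graph allocator state */ };

class GraphPoolRegistry {
 public:
  void Prepare(GraphID id) {
    auto& pool = pools_[id];
    if (pool != nullptr) throw std::invalid_argument("pool already prepared");
    pool.reset(new PrivatePool);  // fresh pool dedicated to this graph
  }
  PrivatePool* Get(GraphID id) {
    auto it = pools_.find(id);
    if (it == pools_.end()) throw std::invalid_argument("no pool prepared");
    return it->second.get();
  }
  void Remove(GraphID id) {
    if (pools_.erase(id) == 0) throw std::invalid_argument("unknown graph id");
  }

 private:
  std::unordered_map<GraphID, std::unique_ptr<PrivatePool>> pools_;
};

int main() {
  GraphPoolRegistry registry;
  registry.Prepare(1);
  registry.Get(1);
  registry.Remove(1);
  return 0;
}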
paddle/fluid/memory/allocation/allocator_facade.h
@@ -49,6 +49,8 @@ class AllocatorFacade {
   static AllocatorFacade& Instance();

+  AllocatorFacadePrivate* GetPrivate() const;
+
   const std::shared_ptr<Allocator>& GetAllocator(const platform::Place& place);

   void* GetBasePtr(const std::shared_ptr<Allocation>& allocation);
@@ -73,13 +75,14 @@ class AllocatorFacade {
                                         size_t size,
                                         const phi::Stream& stream);

+  AllocationPtr Alloc(const platform::Place& place, size_t size,
+                      const phi::Stream& stream);
+
   bool InSameStream(const std::shared_ptr<Allocation>& allocation,
                     const phi::Stream& stream);

 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   // TODO(zhiqiu): change gpuStream_t to phi::Stream if needed.
-  AllocationPtr Alloc(const platform::Place& place, size_t size,
-                      const gpuStream_t& stream);
   uint64_t Release(const platform::CUDAPlace& place,
                    const gpuStream_t& stream);
   void RecordStream(std::shared_ptr<Allocation> allocation,
                     const gpuStream_t& stream);
@@ -96,6 +99,10 @@ class AllocatorFacade {
  private:
   AllocatorFacade();
   AllocatorFacadePrivate* m_;
+#ifdef PADDLE_WITH_CUDA
+  std::unordered_map<CUDAGraphID, std::unique_ptr<AllocatorFacadePrivate>>
+      cuda_graph_map_;
+#endif
 };

 }  // namespace allocation
paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc
@@ -15,56 +15,52 @@
 #include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h"
 #include "paddle/fluid/platform/profiler/event_tracing.h"

+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h"
+#endif
+
 namespace paddle {
 namespace memory {
 namespace allocation {

 StreamSafeCUDAAllocation::StreamSafeCUDAAllocation(
-    DecoratedAllocationPtr underlying_allocation, gpuStream_t owning_stream)
+    DecoratedAllocationPtr underlying_allocation, gpuStream_t owning_stream,
+    StreamSafeCUDAAllocator* allocator)
     : Allocation(underlying_allocation->ptr(),
                  underlying_allocation->base_ptr(),
                  underlying_allocation->size(),
                  underlying_allocation->place()),
       underlying_allocation_(std::move(underlying_allocation)),
-      owning_stream_(std::move(owning_stream)) {}
+      owning_stream_(std::move(owning_stream)),
+      allocator_(allocator->shared_from_this()) {}

 void StreamSafeCUDAAllocation::RecordStream(const gpuStream_t& stream) {
   VLOG(8) << "Try record stream " << stream << " for address " << ptr();
   if (stream == owning_stream_) {
     VLOG(9) << "Record the same stream of " << stream;
     return;
   }

   std::lock_guard<SpinLock> lock_guard(outstanding_event_map_lock_);
-  gpuEvent_t record_event;
-  auto it = outstanding_event_map_.find(stream);
-  if (it == outstanding_event_map_.end()) {
-    gpuEvent_t new_event;
-#ifdef PADDLE_WITH_CUDA
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        cudaEventCreateWithFlags(&new_event, cudaEventDisableTiming));
-#else
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        hipEventCreateWithFlags(&new_event, hipEventDisableTiming));
-#endif
-    outstanding_event_map_[stream] = new_event;
-    record_event = new_event;
-    VLOG(9) << "Create a new event " << new_event;
-  } else {
-    record_event = it->second;
-    VLOG(9) << "Reuse event " << record_event;
+#ifdef PADDLE_WITH_CUDA
+  if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
+    graph_capturing_stream_set_.insert(stream);
+    return;
   }
-
-#ifdef PADDLE_WITH_CUDA
-  PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(record_event, stream));
-#else
-  PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(record_event, stream));
 #endif
-  VLOG(8) << "Record event " << record_event << " to stream " << stream;
+
+  RecordStreamWithNoGraphCapturing(stream);
+  RecordGraphCapturingStreams();
 }

 bool StreamSafeCUDAAllocation::CanBeFreed() {
-  // NOTE(Ruibiao): This function will not execute concurrently,
-  // so outstanding_event_lock_ is not required here
+#ifdef PADDLE_WITH_CUDA
+  if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
+    return graph_capturing_stream_set_.empty() &&
+           outstanding_event_map_.empty();
+  }
+#endif
+
+  RecordGraphCapturingStreams();
+
   for (auto it = outstanding_event_map_.begin();
        it != outstanding_event_map_.end(); ++it) {
     gpuEvent_t& event = it->second;
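Note: the hunk above splits RecordStream() into a capture-time fast path (just remember the stream in graph_capturing_stream_set_, since recording CUDA events during graph capture is not permitted) and the normal path that records one event per extra stream touching the allocation; CanBeFreed() then treats the allocation as freeable once every recorded event has completed. A simplified, CUDA-only sketch of that event bookkeeping (illustrative names, not Paddle's exact logic):

#include <cuda_runtime.h>
#include <map>

struct TrackedBlock {
  std::map<cudaStream_t, cudaEvent_t> outstanding;

  // Called when a stream other than the owning one uses the memory.
  void RecordStream(cudaStream_t s) {
    auto it = outstanding.find(s);
    if (it == outstanding.end()) {
      cudaEvent_t e;
      cudaEventCreateWithFlags(&e, cudaEventDisableTiming);
      it = outstanding.emplace(s, e).first;
    }
    cudaEventRecord(it->second, s);  // re-arm the event on every use
  }

  // The block may be returned to the pool once all events have completed.
  bool CanBeFreed() {
    for (auto it = outstanding.begin(); it != outstanding.end();) {
      if (cudaEventQuery(it->second) != cudaSuccess) return false;  // busy
      cudaEventDestroy(it->second);
      it = outstanding.erase(it);
    }
    return true;
  }
};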
@@ -98,21 +94,62 @@ const gpuStream_t& StreamSafeCUDAAllocation::GetOwningStream() const {
   return owning_stream_;
 }

+void StreamSafeCUDAAllocation::RecordGraphCapturingStreams() {
+  for (gpuStream_t stream : graph_capturing_stream_set_) {
+    RecordStreamWithNoGraphCapturing(stream);
+  }
+  graph_capturing_stream_set_.clear();
+}
+
+void StreamSafeCUDAAllocation::RecordStreamWithNoGraphCapturing(
+    const gpuStream_t& stream) {
+  gpuEvent_t record_event;
+  auto it = outstanding_event_map_.find(stream);
+  if (it == outstanding_event_map_.end()) {
+    gpuEvent_t new_event;
+#ifdef PADDLE_WITH_CUDA
+    PADDLE_ENFORCE_GPU_SUCCESS(
+        cudaEventCreateWithFlags(&new_event, cudaEventDisableTiming));
+#else
+    PADDLE_ENFORCE_GPU_SUCCESS(
+        hipEventCreateWithFlags(&new_event, hipEventDisableTiming));
+#endif
+    outstanding_event_map_[stream] = new_event;
+    record_event = new_event;
+    VLOG(9) << "Create a new event " << new_event;
+  } else {
+    record_event = it->second;
+    VLOG(9) << "Reuse event " << record_event;
+  }
+
+#ifdef PADDLE_WITH_CUDA
+  PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(record_event, stream));
+#else
+  PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(record_event, stream));
+#endif
+  VLOG(8) << "Record event " << record_event << " to stream " << stream;
+}
+
 StreamSafeCUDAAllocator::StreamSafeCUDAAllocator(
     std::shared_ptr<Allocator> underlying_allocator, platform::CUDAPlace place,
-    gpuStream_t default_stream)
+    gpuStream_t default_stream, bool in_cuda_graph_capturing)
     : underlying_allocator_(std::move(underlying_allocator)),
       place_(std::move(place)),
-      default_stream_(std::move(default_stream)) {
-  std::lock_guard<SpinLock> lock_guard(allocator_map_lock_);
-  allocator_map_[place].emplace_back(this);
+      default_stream_(std::move(default_stream)),
+      in_cuda_graph_capturing_(in_cuda_graph_capturing) {
+  if (LIKELY(!in_cuda_graph_capturing)) {
+    std::lock_guard<SpinLock> lock_guard(allocator_map_lock_);
+    allocator_map_[place].emplace_back(this);
+  }
 }

 StreamSafeCUDAAllocator::~StreamSafeCUDAAllocator() {
-  std::lock_guard<SpinLock> lock_guard(allocator_map_lock_);
-  std::vector<StreamSafeCUDAAllocator*>& allocators = allocator_map_[place_];
-  allocators.erase(std::remove(allocators.begin(), allocators.end(), this),
-                   allocators.end());
+  if (LIKELY(!in_cuda_graph_capturing_)) {
+    std::lock_guard<SpinLock> lock_guard(allocator_map_lock_);
+    std::vector<StreamSafeCUDAAllocator*>& allocators = allocator_map_[place_];
+    allocators.erase(std::remove(allocators.begin(), allocators.end(), this),
+                     allocators.end());
+  }
 }

 bool StreamSafeCUDAAllocator::IsAllocThreadSafe() const { return true; }
@@ -140,7 +177,7 @@ phi::Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) {
   }
   StreamSafeCUDAAllocation* allocation = new StreamSafeCUDAAllocation(
       static_unique_ptr_cast<Allocation>(std::move(underlying_allocation)),
-      default_stream_);
+      default_stream_, this);
   VLOG(8) << "Allocate " << allocation->size() << " bytes at address "
           << allocation->ptr();
   return allocation;
@@ -157,22 +194,27 @@ void StreamSafeCUDAAllocator::FreeImpl(phi::Allocation* allocation) {
           "StreamSafeCUDAAllocation*",
           allocation));
   VLOG(8) << "Try free allocation " << stream_safe_cuda_allocation->ptr();
+
+  std::lock_guard<SpinLock> lock_guard(unfreed_allocation_lock_);
   if (stream_safe_cuda_allocation->CanBeFreed()) {
     VLOG(9) << "Directly delete allocation";
     delete stream_safe_cuda_allocation;
   } else {
     VLOG(9) << "Put into unfreed_allocation list";
-    std::lock_guard<SpinLock> lock_guard(unfreed_allocation_lock_);
     unfreed_allocations_.emplace_back(stream_safe_cuda_allocation);
   }
 }

 uint64_t StreamSafeCUDAAllocator::ReleaseImpl(const platform::Place& place) {
+  if (UNLIKELY(in_cuda_graph_capturing_)) {
+    VLOG(7) << "Memory release forbidden in CUDA Graph Captruing";
+    return 0;
+  }
+
   std::lock_guard<SpinLock> lock_guard(allocator_map_lock_);
   std::vector<StreamSafeCUDAAllocator*>& allocators = allocator_map_[place];
   uint64_t released_size = 0;
   for (StreamSafeCUDAAllocator* allocator : allocators) {
-    released_size += allocator->ProcessUnfreedAllocationsWithRelease();
+    released_size += allocator->ProcessUnfreedAllocationsAndRelease();
   }
   VLOG(8) << "Release " << released_size << " bytes memory from all streams";
   return released_size;
@@ -191,7 +233,7 @@ void StreamSafeCUDAAllocator::ProcessUnfreedAllocations() {
   }
 }

-uint64_t StreamSafeCUDAAllocator::ProcessUnfreedAllocationsWithRelease() {
+uint64_t StreamSafeCUDAAllocator::ProcessUnfreedAllocationsAndRelease() {
   ProcessUnfreedAllocations();
   return underlying_allocator_->Release(place_);
 }
paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h
@@ -14,10 +14,9 @@
 #pragma once

 #include <deque>
 #include <list>
 #include <map>
-#include <mutex>
+#include <set>

 #include "paddle/fluid/memory/allocation/allocator.h"
 #include "paddle/fluid/memory/allocation/spin_lock.h"
 #include "paddle/fluid/platform/place.h"
@@ -32,27 +31,38 @@ namespace paddle {
 namespace memory {
 namespace allocation {

+class StreamSafeCUDAAllocator;
+
 class StreamSafeCUDAAllocation : public Allocation {
  public:
   StreamSafeCUDAAllocation(DecoratedAllocationPtr underlying_allocation,
-                           gpuStream_t owning_stream);
+                           gpuStream_t owning_stream,
+                           StreamSafeCUDAAllocator* allocator);
+
   void RecordStream(const gpuStream_t& stream);
   bool CanBeFreed();
   const gpuStream_t& GetOwningStream() const;

  private:
+  void RecordGraphCapturingStreams();
+  void RecordStreamWithNoGraphCapturing(const gpuStream_t& stream);
+
   DecoratedAllocationPtr underlying_allocation_;
+  std::set<gpuStream_t> graph_capturing_stream_set_;
   std::map<gpuStream_t, gpuEvent_t> outstanding_event_map_;
   gpuStream_t owning_stream_;
   SpinLock outstanding_event_map_lock_;
+  // To compatiable with CUDA Graph, hold the allocator shared_ptr so that
+  // Allocator will not deconstruct before Allocation
+  std::shared_ptr<Allocator> allocator_;
 };

-class StreamSafeCUDAAllocator : public Allocator {
+class StreamSafeCUDAAllocator
+    : public Allocator,
+      public std::enable_shared_from_this<StreamSafeCUDAAllocator> {
  public:
   StreamSafeCUDAAllocator(std::shared_ptr<Allocator> underlying_allocator,
-                          platform::CUDAPlace place,
-                          gpuStream_t default_stream);
+                          platform::CUDAPlace place,
+                          gpuStream_t default_stream,
+                          bool in_cuda_graph_capturing = false);
   ~StreamSafeCUDAAllocator();

   bool IsAllocThreadSafe() const override;
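Note: deriving StreamSafeCUDAAllocator from std::enable_shared_from_this is what lets each allocation pin its allocator via the allocator_ member above, so an allocator owned elsewhere through shared_ptr cannot be destroyed while allocations it produced are still outstanding. A minimal sketch of the idiom with stand-in types:

#include <cassert>
#include <memory>

class Allocator : public std::enable_shared_from_this<Allocator> {
 public:
  struct Allocation {
    std::shared_ptr<Allocator> owner;  // pins the allocator alive
  };

  Allocation* Allocate() { return new Allocation{shared_from_this()}; }
};

int main() {
  auto allocation = [] {
    // shared_from_this() requires the object to be owned by a shared_ptr.
    auto allocator = std::make_shared<Allocator>();
    return std::unique_ptr<Allocator::Allocation>(allocator->Allocate());
  }();  // the local shared_ptr is gone, yet the allocator still exists
  assert(allocation->owner.use_count() == 1);
  return 0;
}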
@@ -63,7 +73,7 @@ class StreamSafeCUDAAllocator : public Allocator {
  private:
   void ProcessUnfreedAllocations();
-  uint64_t ProcessUnfreedAllocationsWithRelease();
+  uint64_t ProcessUnfreedAllocationsAndRelease();

   static std::map<platform::Place, std::vector<StreamSafeCUDAAllocator*>>
       allocator_map_;
@@ -74,6 +84,8 @@ class StreamSafeCUDAAllocator : public Allocator {
   gpuStream_t default_stream_;
   std::list<StreamSafeCUDAAllocation*> unfreed_allocations_;
   SpinLock unfreed_allocation_lock_;
+
+  bool in_cuda_graph_capturing_;
 };

 }  // namespace allocation
paddle/fluid/memory/malloc.cc
@@ -41,6 +41,11 @@ std::shared_ptr<Allocation> AllocShared(const platform::Place& place,
                                                      stream);
 }

+AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size,
+                    const phi::Stream& stream) {
+  return allocation::AllocatorFacade::Instance().Alloc(place, size, stream);
+}
+
 bool InSameStream(const std::shared_ptr<Allocation>& allocation,
                   const phi::Stream& stream) {
   return allocation::AllocatorFacade::Instance().InSameStream(allocation,
@@ -52,11 +57,6 @@ void* GetBasePtr(const std::shared_ptr<Allocation>& allocation) {
 }

 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size,
-                    const gpuStream_t& stream) {
-  return allocation::AllocatorFacade::Instance().Alloc(place, size, stream);
-}
-
 uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream) {
   return allocation::AllocatorFacade::Instance().Release(place, stream);
 }
paddle/fluid/memory/malloc.h
@@ -41,15 +41,15 @@ extern std::shared_ptr<Allocation> AllocShared(const platform::Place& place,
                                                size_t size,
                                                const phi::Stream& stream);

+extern AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size,
+                           const phi::Stream& stream);
+
 extern bool InSameStream(const std::shared_ptr<Allocation>& allocation,
                          const phi::Stream& stream);

 extern void* GetBasePtr(const std::shared_ptr<Allocation>& allocation);

 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-extern AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size,
-                           const gpuStream_t& stream);
-
 extern uint64_t Release(const platform::CUDAPlace& place,
                         const gpuStream_t& stream);
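Note: the public header now exposes only the phi::Stream overload; a raw gpuStream_t crosses the API boundary packed into phi::StreamId (an integer type), which keeps CUDA/HIP types out of the interface. A self-contained sketch of that round trip, assuming a 64-bit platform where a pointer fits in the id:

#include <cassert>
#include <cstdint>

using gpuStream_t = void*;  // opaque driver handle stand-in
using StreamId = uint64_t;

class Stream {
 public:
  explicit Stream(StreamId id) : id_(id) {}
  StreamId id() const { return id_; }

 private:
  StreamId id_;
};

int main() {
  gpuStream_t raw = nullptr;  // e.g. obtained from cudaStreamCreate
  Stream s(reinterpret_cast<StreamId>(raw));                 // wrap at the API edge
  gpuStream_t back = reinterpret_cast<gpuStream_t>(s.id());  // unwrap inside
  assert(back == raw);
  return 0;
}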
paddle/fluid/memory/stream_safe_cuda_alloc_test.cu
@@ -12,34 +12,35 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#ifdef PADDLE_WITH_CUDA
-#include <cuda.h>
-#include <cuda_runtime.h>
-#endif
-
-#ifdef PADDLE_WITH_HIP
-#include <hip/hip_runtime.h>
-#endif
-
 #include <thread>  // NOLINT
 #include <vector>

 #include "gtest/gtest.h"
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
-#include "paddle/fluid/memory/malloc.h"
-#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
+#include "paddle/fluid/memory/memory.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/phi/core/stream.h"

+#ifdef PADDLE_WITH_CUDA
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
+#endif
+
+#ifdef PADDLE_WITH_HIP
+#include <hip/hip_runtime.h>
+#endif
+
 namespace paddle {
 namespace memory {

-__global__ void add_kernel(int *x, int n) {
+// y += (x + 1)
+__global__ void add_kernel(int *x, int *y, int n) {
   int thread_num = gridDim.x * blockDim.x;
   int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
   for (int i = thread_id; i < n; i += thread_num) {
-    atomicAdd(x + i, thread_id);
+    y[i] += x[i] + 1;
   }
 }
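Note: the kernel now computes "y += x + 1" with a grid-stride loop instead of atomicAdd. A small CPU model of why the CUDAGraphTest added at the end of this file expects y[i] == replay_times after replaying the captured launch: x and y start zeroed, so every replay adds exactly 1 to each element.

#include <cassert>
#include <vector>

int main() {
  const int n = 4, replay_times = 10;
  std::vector<int> x(n, 0), y(n, 0);
  for (int k = 0; k < replay_times; ++k) {  // one iteration == one replay
    for (int i = 0; i < n; ++i) y[i] += x[i] + 1;
  }
  for (int i = 0; i < n; ++i) assert(y[i] == replay_times);
  return 0;
}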
@@ -51,153 +52,6 @@ void CheckMemLeak(const platform::CUDAPlace &place) {
       << " there may be a memory leak problem";
 }

-class StreamSafeCUDAAllocTest : public ::testing::Test {
- protected:
-  void SetUp() override {
-    place_ = platform::CUDAPlace();
-    stream_num_ = 64;
-    grid_num_ = 1;
-    block_num_ = 32;
-    data_num_ = 131072;
-    workspace_size_ = data_num_ * sizeof(int);
-
-    // alloc workspace for each stream
-    for (size_t i = 0; i < stream_num_; ++i) {
-      gpuStream_t stream;
-#ifdef PADDLE_WITH_CUDA
-      PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&stream));
-#else
-      PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&stream));
-#endif
-
-      std::shared_ptr<Allocation> allocation = AllocShared(
-          place_, workspace_size_,
-          phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
-#ifdef PADDLE_WITH_CUDA
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          cudaMemset(allocation->ptr(), 0, allocation->size()));
-#else
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          hipMemset(allocation->ptr(), 0, allocation->size()));
-#endif
-
-      streams_.emplace_back(stream);
-      workspaces_.emplace_back(allocation);
-    }
-
-    result_ = Alloc(place_, stream_num_ * workspace_size_);
-  }
-
-  void SingleStreamRun(size_t idx) {
-    // for all stream i,
-    // stream idx lauch a kernel to add (j % thread_num) to workspaces_[i][j]
-    for (size_t i = 0; i < stream_num_; ++i) {
-      int *x = reinterpret_cast<int *>(workspaces_[i]->ptr());
-      add_kernel<<<grid_num_, block_num_, 0, streams_[idx]>>>(x, data_num_);
-      RecordStream(workspaces_[i], streams_[idx]);
-    }
-  }
-
-  void CopyResultAsync() {
-    for (size_t i = 0; i < stream_num_; ++i) {
-#ifdef PADDLE_WITH_CUDA
-      PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(
-          reinterpret_cast<int *>(result_->ptr()) + i * data_num_,
-          workspaces_[i]->ptr(), workspace_size_, cudaMemcpyDeviceToDevice));
-#else
-      PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(
-          reinterpret_cast<int *>(result_->ptr()) + i * data_num_,
-          workspaces_[i]->ptr(), workspace_size_, hipMemcpyDeviceToDevice));
-#endif
-    }
-  }
-
-  void MultiStreamRun() {
-    for (size_t i = 0; i < stream_num_; ++i) {
-      SingleStreamRun(i);
-    }
-    CopyResultAsync();
-    workspaces_.clear();  // fast_gc
-    cudaDeviceSynchronize();
-  }
-
-  void MultiThreadMUltiStreamRun() {
-    std::vector<std::thread> threads;
-    for (size_t i = 0; i < stream_num_; ++i) {
-      threads.push_back(
-          std::thread(&StreamSafeCUDAAllocTest::SingleStreamRun, this, i));
-    }
-    for (size_t i = 0; i < stream_num_; ++i) {
-      threads[i].join();
-    }
-    CopyResultAsync();
-    workspaces_.clear();  // fast_gc
-    cudaDeviceSynchronize();
-  }
-
-  void CheckResult() {
-    auto result_host = std::unique_ptr<int[]>(new int[result_->size()]);
-#ifdef PADDLE_WITH_CUDA
-    PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpy(result_host.get(), result_->ptr(),
-                                          result_->size(),
-                                          cudaMemcpyDeviceToHost));
-#else
-    PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpy(result_host.get(), result_->ptr(),
-                                         result_->size(),
-                                         hipMemcpyDeviceToHost));
-#endif
-    size_t thread_num = grid_num_ * block_num_;
-    for (size_t i = 0; i < stream_num_; ++i) {
-      for (size_t j = 0; j < data_num_; ++j) {
-        EXPECT_TRUE(result_host[i * stream_num_ + j] ==
-                    (j % thread_num) * stream_num_);
-      }
-    }
-    result_.reset();
-  }
-
-  void TearDown() override {
-#ifdef PADDLE_WITH_CUDA
-    PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize());
-#else
-    PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize());
-#endif
-    for (gpuStream_t stream : streams_) {
-      Release(place_, stream);
-    }
-
-    for (size_t i = 1; i < stream_num_; ++i) {
-#ifdef PADDLE_WITH_CUDA
-      PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(streams_[i]));
-#else
-      PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(streams_[i]));
-#endif
-    }
-
-    CheckMemLeak(place_);
-  }
-
-  size_t stream_num_;
-  size_t grid_num_;
-  size_t block_num_;
-  size_t data_num_;
-  size_t workspace_size_;
-  platform::CUDAPlace place_;
-  std::vector<gpuStream_t> streams_;
-  std::vector<std::shared_ptr<Allocation>> workspaces_;
-  allocation::AllocationPtr result_;
-};
-
-TEST_F(StreamSafeCUDAAllocTest, CUDAMutilStreamTest) {
-  MultiStreamRun();
-  CheckResult();
-}
-
-TEST_F(StreamSafeCUDAAllocTest, CUDAMutilThreadMutilStreamTest) {
-  MultiThreadMUltiStreamRun();
-  CheckResult();
-}
-
 TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) {
   platform::CUDAPlace place = platform::CUDAPlace();
   size_t alloc_size = 256;
@@ -214,7 +68,8 @@ TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) {
           paddle::platform::DeviceContextPool::Instance().Get(place))
           ->stream();
   allocation::AllocationPtr allocation_unique =
-      Alloc(place, alloc_size, default_stream);
+      Alloc(place, alloc_size,
+            phi::Stream(reinterpret_cast<phi::StreamId>(default_stream)));
   EXPECT_GE(allocation_unique->size(), alloc_size);
   EXPECT_EQ(allocation_unique->ptr(), address);
   allocation_unique.reset();
@@ -303,36 +158,6 @@ TEST(StreamSafeCUDAAllocInterfaceTest, GetStreamInterfaceTest) {
   CheckMemLeak(place);
 }

-#ifdef PADDLE_WITH_CUDA
-TEST(StreamSafeCUDAAllocInterfaceTest, CUDAGraphExceptionTest) {
-  platform::CUDAPlace place = platform::CUDAPlace();
-  size_t alloc_size = 1;
-  std::shared_ptr<Allocation> allocation = AllocShared(place, alloc_size);
-
-  platform::BeginCUDAGraphCapture(place, cudaStreamCaptureModeGlobal);
-  EXPECT_THROW(AllocShared(place, alloc_size), paddle::platform::EnforceNotMet);
-  EXPECT_THROW(Alloc(place, alloc_size), paddle::platform::EnforceNotMet);
-  EXPECT_THROW(Release(place), paddle::platform::EnforceNotMet);
-  EXPECT_THROW(allocation::AllocatorFacade::Instance().GetAllocator(place),
-               paddle::platform::EnforceNotMet);
-  EXPECT_THROW(AllocShared(place, alloc_size,
-                           phi::Stream(reinterpret_cast<phi::StreamId>(nullptr))),
-               paddle::platform::EnforceNotMet);
-  EXPECT_THROW(Alloc(place, alloc_size, nullptr),
-               paddle::platform::EnforceNotMet);
-  EXPECT_THROW(Release(place, nullptr), paddle::platform::EnforceNotMet);
-  EXPECT_THROW(RecordStream(allocation, nullptr),
-               paddle::platform::EnforceNotMet);
-  EXPECT_THROW(GetStream(allocation), paddle::platform::EnforceNotMet);
-  platform::EndCUDAGraphCapture();
-
-  allocation.reset();
-  Release(place);
-  CheckMemLeak(place);
-}
-#endif
-
 TEST(StreamSafeCUDAAllocRetryTest, RetryTest) {
   platform::CUDAPlace place = platform::CUDAPlace();
   gpuStream_t stream1, stream2;
@@ -348,12 +173,14 @@ TEST(StreamSafeCUDAAllocRetryTest, RetryTest) {
   // so the second alloc will fail and retry
   size_t alloc_size = available_size / 4 * 3;

-  allocation::AllocationPtr allocation1 = Alloc(place, alloc_size, stream1);
+  allocation::AllocationPtr allocation1 = Alloc(
+      place, alloc_size, phi::Stream(reinterpret_cast<phi::StreamId>(stream1)));
   allocation::AllocationPtr allocation2;

   std::thread th([&allocation2, &place, &stream2, alloc_size]() {
     std::this_thread::sleep_for(std::chrono::seconds(1));
-    allocation2 = Alloc(place, alloc_size, stream2);
+    allocation2 = Alloc(
+        place, alloc_size, phi::Stream(reinterpret_cast<phi::StreamId>(stream2)));
   });
   allocation1.reset();  // free but not release
   th.join();
@@ -371,5 +198,201 @@ TEST(StreamSafeCUDAAllocRetryTest, RetryTest) {
   CheckMemLeak(place);
 }

+class StreamSafeCUDAAllocTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    place_ = platform::CUDAPlace();
+    stream_num_ = 64;
+    grid_num_ = 1;
+    block_num_ = 32;
+    data_num_ = 131072;
+    workspace_size_ = data_num_ * sizeof(int);
+
+    for (size_t i = 0; i < stream_num_; ++i) {
+      gpuStream_t stream;
+#ifdef PADDLE_WITH_CUDA
+      PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&stream));
+#else
+      PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&stream));
+#endif
+
+      std::shared_ptr<phi::Allocation> workspace_allocation = AllocShared(
+          place_, workspace_size_,
+          phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
+      std::shared_ptr<phi::Allocation> result_allocation = AllocShared(
+          place_, workspace_size_,
+          phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
+      std::shared_ptr<phi::Allocation> host_result_allocation =
+          AllocShared(platform::CPUPlace(), workspace_size_);
+
+#ifdef PADDLE_WITH_CUDA
+      PADDLE_ENFORCE_GPU_SUCCESS(cudaMemset(workspace_allocation->ptr(), 0,
+                                            workspace_allocation->size()));
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          cudaMemset(result_allocation->ptr(), 0, result_allocation->size()));
+#else
+      PADDLE_ENFORCE_GPU_SUCCESS(hipMemset(workspace_allocation->ptr(), 0,
+                                           workspace_allocation->size()));
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          hipMemset(result_allocation->ptr(), 0, result_allocation->size()));
+#endif
+
+      streams_.emplace_back(stream);
+      workspaces_.emplace_back(workspace_allocation);
+      results_.emplace_back(result_allocation);
+      host_results_.emplace_back(host_result_allocation);
+    }
+  }
+
+  void SingleStreamRun(size_t idx) {
+    int *y = reinterpret_cast<int *>(results_[idx]->ptr());
+    int neighbouring_idx = idx > 0 ? idx - 1 : idx;
+
+    add_kernel<<<grid_num_, block_num_, 0, streams_[idx]>>>(
+        reinterpret_cast<int *>(workspaces_[idx]->ptr()), y, data_num_);
+    add_kernel<<<grid_num_, block_num_, 0, streams_[idx]>>>(
+        reinterpret_cast<int *>(workspaces_[neighbouring_idx]->ptr()), y,
+        data_num_);
+    RecordStream(workspaces_[neighbouring_idx], streams_[idx]);
+  }
+
+  void MultiStreamRun() {
+    // Must run in reverse order, or the workspace_[i - 1] will be released
+    // before streams_[i]'s kernel launch
+    for (int i = stream_num_ - 1; i >= 0; --i) {
+      SingleStreamRun(i);
+      workspaces_[i].reset();  // fast GC
+    }
+  }
+
+  void MultiThreadMultiStreamRun() {
+    std::vector<std::thread> threads;
+    for (size_t i = 0; i < stream_num_; ++i) {
+      threads.push_back(
+          std::thread(&StreamSafeCUDAAllocTest::SingleStreamRun, this, i));
+    }
+    for (size_t i = 0; i < stream_num_; ++i) {
+      threads[i].join();
+    }
+    workspaces_.clear();
+  }
+
+  void CUDAGraphRun() {
+    testing_cuda_graph_ = true;
+    platform::BeginCUDAGraphCapture(platform::CUDAPlace(),
+                                    cudaStreamCaptureModeGlobal);
+
+    std::shared_ptr<Allocation> data_allocation =
+        AllocShared(platform::CUDAPlace(), workspace_size_);
+    std::shared_ptr<Allocation> result_allocation =
+        AllocShared(platform::CUDAPlace(), workspace_size_);
+
+    int *data = static_cast<int *>(data_allocation->ptr());
+    int *result = static_cast<int *>(result_allocation->ptr());
+
+    gpuStream_t main_stream = GetStream(data_allocation);
+    gpuStream_t other_stream;
+    PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&other_stream));
+
+    add_kernel<<<grid_num_, block_num_, 0, main_stream>>>(data, result,
+                                                          data_num_);
+    RecordStream(data_allocation, other_stream);
+
+    std::unique_ptr<platform::CUDAGraph> cuda_graph =
+        platform::EndCUDAGraphCapture();
+
+    int replay_times = 10;
+    for (int i = 0; i < replay_times; ++i) {
+      cuda_graph->Replay();
+    }
+
+    std::shared_ptr<Allocation> host_result_allocation =
+        AllocShared(platform::CPUPlace(), workspace_size_);
+    Copy(host_result_allocation->place(), host_result_allocation->ptr(),
+         result_allocation->place(), result_allocation->ptr(),
+         workspace_size_, main_stream);
+    cudaStreamSynchronize(main_stream);
+
+    int *host_result = static_cast<int *>(host_result_allocation->ptr());
+    for (int i = 0; i < data_num_; ++i) {
+      EXPECT_EQ(host_result[i], replay_times);
+    }
+
+    data_allocation.reset();
+    result_allocation.reset();
+    cuda_graph.release();
+    PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(other_stream));
+  }
+
+  void CheckResult() {
+    for (size_t i = 0; i < stream_num_; ++i) {
+      Copy(host_results_[i]->place(), host_results_[i]->ptr(),
+           results_[i]->place(), results_[i]->ptr(), workspace_size_,
+           streams_[i]);
+    }
+    cudaDeviceSynchronize();
+
+    size_t thread_num = grid_num_ * block_num_;
+    for (size_t i = 0; i < stream_num_; ++i) {
+      int *result = static_cast<int *>(host_results_[i]->ptr());
+      for (size_t j = 0; j < data_num_; ++j) {
+        EXPECT_EQ(result[j], 2);
+      }
+    }
+  }
+
+  void TearDown() override {
+    workspaces_.clear();
+    results_.clear();
+    host_results_.clear();
+    for (gpuStream_t stream : streams_) {
+      Release(place_, stream);
+    }
+
+    for (size_t i = 0; i < stream_num_; ++i) {
+#ifdef PADDLE_WITH_CUDA
+      PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(streams_[i]));
+#else
+      PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(streams_[i]));
+#endif
+    }
+
+    // Memory release for CUDA Graph memory pool is forbidden
+    if (!testing_cuda_graph_) {
+      CheckMemLeak(place_);
+    }
+  }
+
+  bool testing_cuda_graph_{0};
+  size_t stream_num_;
+  size_t grid_num_;
+  size_t block_num_;
+  size_t data_num_;
+  size_t workspace_size_;
+  platform::CUDAPlace place_;
+  std::vector<gpuStream_t> streams_;
+  std::vector<std::shared_ptr<phi::Allocation>> workspaces_;
+  std::vector<std::shared_ptr<phi::Allocation>> results_;
+  std::vector<std::shared_ptr<phi::Allocation>> host_results_;
+};
+
+TEST_F(StreamSafeCUDAAllocTest, CUDAMutilStreamTest) {
+  MultiStreamRun();
+  CheckResult();
+}
+
+TEST_F(StreamSafeCUDAAllocTest, CUDAMutilThreadMutilStreamTest) {
+  MultiThreadMultiStreamRun();
+  CheckResult();
+}
+
+#ifdef PADDLE_WITH_CUDA
+TEST_F(StreamSafeCUDAAllocTest, CUDAGraphTest) {
+  MultiStreamRun();
+  CUDAGraphRun();
+  CheckResult();
+}
+#endif
+
 }  // namespace memory
 }  // namespace paddle