BaiXuePrincess / Paddle (forked from PaddlePaddle / Paddle)
Commit 62b1f38c (unverified)
Authored on Dec 10, 2021 by sneaxiy; committed via GitHub on Dec 10, 2021

make cuda graph thread local allocator (#37814)

Parent: c732c831
Showing 4 changed files with 83 additions and 3 deletions (+83, -3):

- paddle/fluid/memory/allocation/allocator_facade.cc (+3, -2)
- paddle/fluid/platform/device/gpu/cuda/cuda_graph.cc (+14, -0)
- paddle/fluid/platform/device/gpu/cuda/cuda_graph.h (+22, -0)
- python/paddle/fluid/tests/unittests/test_cuda_graph.py (+44, -1)
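In short, the commit makes the CUDA Graph memory pool thread aware: allocator_facade.cc now asks CUDAGraph::IsThisThreadCapturing() instead of IsCapturing(), and cuda_graph.cc/.h record the capturing thread's id when the capture mode is cudaStreamCaptureModeThreadLocal, so only allocations issued by the capturing thread are served from the graph's pool while other threads (for example the DataLoader workers in the new test) keep using the regular allocators. The sketch below is standalone CUDA runtime code, not Paddle code, illustrating the thread-local capture mode this relies on: while one thread captures, another thread may still call cudaMalloc/cudaFree, which global capture mode would reject. The kernel, sizes, and CHECK macro are illustrative only, and the five-argument cudaGraphInstantiate is the CUDA 10/11 signature (CUDA 12 takes (exec, graph, flags)).

// Standalone sketch: thread-local stream capture with a concurrent allocating thread.
#include <cstdio>
#include <cstdlib>
#include <thread>

#include <cuda_runtime.h>

#define CHECK(expr)                                                      \
  do {                                                                   \
    cudaError_t err__ = (expr);                                          \
    if (err__ != cudaSuccess) {                                          \
      std::printf("%s failed: %s\n", #expr, cudaGetErrorString(err__));  \
      std::exit(1);                                                      \
    }                                                                    \
  } while (0)

__global__ void Square(float* x, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) x[i] *= x[i];
}

int main() {
  constexpr int n = 1024;
  float* x = nullptr;
  CHECK(cudaMalloc(&x, n * sizeof(float)));

  cudaStream_t stream;
  CHECK(cudaStreamCreate(&stream));

  // Thread-local mode restricts only this thread during the capture; other
  // threads may still allocate, which is what lets worker threads keep running.
  CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal));

  std::thread worker([] {
    void* p = nullptr;
    CHECK(cudaMalloc(&p, 1 << 20));  // legal during a thread-local capture
    CHECK(cudaFree(p));
  });

  Square<<<(n + 255) / 256, 256, 0, stream>>>(x, n);  // recorded into the graph
  worker.join();

  cudaGraph_t graph;
  CHECK(cudaStreamEndCapture(stream, &graph));

  cudaGraphExec_t exec;
  CHECK(cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0));
  CHECK(cudaGraphLaunch(exec, stream));
  CHECK(cudaStreamSynchronize(stream));

  CHECK(cudaGraphExecDestroy(exec));
  CHECK(cudaGraphDestroy(graph));
  CHECK(cudaStreamDestroy(stream));
  CHECK(cudaFree(x));
  return 0;
}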
paddle/fluid/memory/allocation/allocator_facade.cc @ 62b1f38c

@@ -348,13 +348,14 @@ class AllocatorFacadePrivate {
   const AllocatorMap& GetAllocatorMap() {
 #ifdef PADDLE_WITH_CUDA
-    if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
+    if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
       auto id = platform::CUDAGraph::CapturingID();
       auto iter = cuda_graph_allocator_map_.find(id);
       PADDLE_ENFORCE_NE(
           iter, cuda_graph_allocator_map_.end(),
           platform::errors::PermissionDenied(
               "No memory pool is prepared for CUDA Graph capturing."));
       VLOG(10) << "Choose CUDA Graph memory pool to allocate memory";
       return iter->second->allocators_;
     } else {
       return allocators_;

@@ -405,7 +406,7 @@ class AllocatorFacadePrivate {
 #if defined(PADDLE_WITH_HIP)
     auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
     cuda_allocators_[p][stream] = std::make_shared<AutoGrowthBestFitAllocator>(
-        cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk_);
+        cuda_allocator, platform::GpuMinChunkSize(), 0, allow_free_idle_chunk_);
 #endif
 #if defined(PADDLE_WITH_CUDA)
paddle/fluid/platform/device/gpu/cuda/cuda_graph.cc @ 62b1f38c

@@ -18,6 +18,7 @@ namespace paddle {
 namespace platform {

 std::unique_ptr<CUDAGraph> CUDAGraph::capturing_graph_{nullptr};
+paddle::optional<std::thread::id> CUDAGraph::capturing_thread_id_{paddle::none};

 void CUDAGraph::Reset() {
   if (is_reset_) return;

@@ -58,6 +59,13 @@ void CUDAGraph::BeginSegmentCapture() {
       IsCapturing(), true,
       errors::PermissionDenied("BeginSegmentCapture should be called when CUDA "
                                "Graph is capturing."));
+  if (IsThreadLocalCapturing()) {
+    PADDLE_ENFORCE_EQ(IsThisThreadCapturing(), true,
+                      platform::errors::PermissionDenied(
+                          "When capturing CUDA Graph in the thread local mode, "
+                          "you cannot begin segmented capturing in the thread "
+                          "which is not the one that starts the capturing."));
+  }
   PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamBeginCapture(
       capturing_graph_->stream_, capturing_graph_->capture_mode_));
   PADDLE_ENFORCE_EQ(IsValidCapturing(), true,

@@ -82,6 +90,11 @@ void CUDAGraph::BeginCapture(platform::CUDAPlace place, cudaStream_t stream,
   capturing_graph_->place_ = place;
   capturing_graph_->stream_ = stream;
   capturing_graph_->capture_mode_ = mode;
+  if (mode == cudaStreamCaptureModeThreadLocal) {
+    capturing_thread_id_ = std::this_thread::get_id();
+    VLOG(10) << "Capturing CUDA Graph in thread local mode, thread id: "
+             << capturing_thread_id_;
+  }
   BeginSegmentCapture();
 #endif
 }

@@ -115,6 +128,7 @@ void CUDAGraph::EndSegmentCapture() {
 std::unique_ptr<CUDAGraph> CUDAGraph::EndCapture() {
   EndSegmentCapture();
+  capturing_thread_id_ = paddle::none;
   return std::move(capturing_graph_);
 }
paddle/fluid/platform/device/gpu/cuda/cuda_graph.h @ 62b1f38c

@@ -18,6 +18,7 @@
 #include <functional>
 #include <memory>
 #include <mutex>
+#include <thread>
 #include <vector>

 #include "cuda.h"          // NOLINT
 #include "cuda_runtime.h"  // NOLINT

@@ -26,6 +27,7 @@
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/macros.h"
 #include "paddle/fluid/platform/place.h"
+#include "paddle/utils/optional.h"

 namespace paddle {
 namespace platform {

@@ -99,6 +101,25 @@ class CUDAGraph {
   // supported during capturing CUDA Graph.
   static bool IsValidCapturing();

+  static bool IsThreadLocalCapturing() {
+#if CUDA_VERSION >= 10010
+    return IsCapturing() &&
+           capturing_graph_->capture_mode_ == cudaStreamCaptureModeThreadLocal;
+#else
+    return false;
+#endif
+  }
+
+  static bool IsThisThreadCapturing() {
+    if (UNLIKELY(IsCapturing())) {
+      return IsThreadLocalCapturing()
+                 ? capturing_thread_id_.get() == std::this_thread::get_id()
+                 : true;
+    } else {
+      return false;
+    }
+  }
+
  private:
   static CUDAGraphID UniqueID() {
     static std::atomic<CUDAGraphID> id;

@@ -118,6 +139,7 @@ class CUDAGraph {
   bool is_reset_{false};
   std::mutex mtx_;

+  static paddle::optional<std::thread::id> capturing_thread_id_;
   static std::unique_ptr<CUDAGraph> capturing_graph_;
 };
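The new header methods distinguish "some thread is capturing" from "this thread is capturing": the capturing thread's id is stored (in a paddle::optional) only for thread-local captures, and IsThisThreadCapturing() compares it with std::this_thread::get_id(). Below is a minimal, self-contained sketch of that bookkeeping; it is an illustration of the pattern rather than the Paddle implementation, with std::optional standing in for paddle::optional and a plain struct standing in for the static CUDAGraph state.

// Minimal sketch of the thread-id bookkeeping behind IsThisThreadCapturing().
#include <cassert>
#include <optional>
#include <thread>

struct CaptureState {
  bool capturing = false;
  bool thread_local_mode = false;
  std::optional<std::thread::id> capturing_thread_id;

  void BeginCapture(bool use_thread_local_mode) {
    capturing = true;
    thread_local_mode = use_thread_local_mode;
    if (use_thread_local_mode) {
      capturing_thread_id = std::this_thread::get_id();  // remember the owner
    }
  }

  void EndCapture() {
    capturing = false;
    capturing_thread_id.reset();
  }

  bool IsThisThreadCapturing() const {
    if (!capturing) return false;
    // Thread-local mode: only the thread that started the capture counts.
    // Other modes: every thread observes the capture, as before this change.
    return !thread_local_mode ||
           capturing_thread_id == std::this_thread::get_id();
  }
};

int main() {
  CaptureState state;
  state.BeginCapture(/*use_thread_local_mode=*/true);
  assert(state.IsThisThreadCapturing());

  bool worker_sees_capture = true;
  std::thread worker(
      [&] { worker_sees_capture = state.IsThisThreadCapturing(); });
  worker.join();
  // A worker thread is not "capturing", so its allocations would be served by
  // the regular allocators rather than the CUDA Graph memory pool.
  assert(!worker_sees_capture);

  state.EndCapture();
  assert(!state.IsThisThreadCapturing());
  return 0;
}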
python/paddle/fluid/tests/unittests/test_cuda_graph.py @ 62b1f38c

@@ -34,7 +34,8 @@ class TestCUDAGraph(unittest.TestCase):
         paddle.set_flags({
             'FLAGS_allocator_strategy': 'auto_growth',
             'FLAGS_sync_nccl_allreduce': False,
-            'FLAGS_cudnn_deterministic': True
+            'FLAGS_cudnn_deterministic': True,
+            'FLAGS_use_stream_safe_cuda_allocator': False,
         })

     def random_tensor(self, shape):

@@ -187,6 +188,48 @@ class TestCUDAGraph(unittest.TestCase):
         finally:
             graph.reset()

+    def test_dataloader(self):
+        if not can_use_cuda_graph():
+            return
+
+        class AutoIncDataset(paddle.io.Dataset):
+            def __init__(self, n, dtype):
+                self.n = n
+                self.dtype = dtype
+
+            def __len__(self):
+                return self.n
+
+            def __getitem__(self, idx):
+                return np.array([idx]).astype(self.dtype)
+
+        n = 100
+        dtype = 'int64'
+        dataset = AutoIncDataset(n, dtype)
+        data_loader = paddle.io.DataLoader(
+            dataset, batch_size=1, num_workers=2, use_buffer_reader=True)
+
+        x = None
+        y = None
+        graph = None
+        for i, data in enumerate(data_loader):
+            if graph is None:
+                x = data
+                x = x.cuda()
+                graph = CUDAGraph()
+                graph.capture_begin()
+                y = x * x
+                graph.capture_end()
+            else:
+                x.copy_(data, False)
+                x = x.cuda()
+
+            graph.replay()
+
+            actual_x = np.array([[i]]).astype(dtype)
+            actual_y = np.array([[i * i]]).astype(dtype)
+            self.assertTrue(np.array_equal(actual_x, x.numpy()))
+            self.assertTrue(np.array_equal(actual_y, y.numpy()))


 if __name__ == "__main__":
     unittest.main()