Commit d83d59dd (unverified)
Authored on Jun 14, 2022 by Yuang Liu; committed via GitHub on Jun 14, 2022
Parent: db58dd27

[cuda graph] partial program with cuda graph under static mode (#43440)

Showing 5 changed files with 525 additions and 13 deletions (+525, -13).
paddle/fluid/framework/program_desc.cc                                              +18    -3
paddle/fluid/operators/run_program_op.h                                             +35   -10
paddle/fluid/platform/cuda_graph_with_memory_pool.cc                                +10    -0
python/paddle/device/cuda/graphs.py                                                +335    -0
python/paddle/fluid/tests/unittests/test_cuda_graph_partial_graph_static_run.py    +127    -0
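For orientation, a minimal usage sketch of the new static-mode flow, modeled on the unit test added at the end of this commit: wrap_cuda_graph marks the ops created by a layer, and cuda_graph_transform then rewrites every marked section into a run_program op that is captured and replayed as a CUDA graph. The tiny Linear model, shapes, and random input below are illustrative only, not part of the commit.

import numpy as np
import paddle
import paddle.nn as nn
from paddle.device.cuda.graphs import wrap_cuda_graph, cuda_graph_transform

paddle.enable_static()

main_prog, start_prog = paddle.static.Program(), paddle.static.Program()
with paddle.static.program_guard(main_prog, start_prog):
    x = paddle.static.data(shape=[3, 10], dtype='float32', name='x')
    model = wrap_cuda_graph(nn.Linear(10, 10))  # ops built by this layer carry a cuda graph attr
    loss = paddle.mean(model(x))
    paddle.optimizer.SGD().minimize(loss)

# Replace the marked sections with run_program ops; the caller must keep the
# returned section programs alive for as long as main_prog is used.
section_programs = cuda_graph_transform(main_prog)

exe = paddle.static.Executor(paddle.CUDAPlace(0))
exe.run(start_prog)
out = exe.run(main_prog,
              feed={'x': np.random.random((3, 10)).astype('float32')},
              fetch_list=[loss])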
paddle/fluid/framework/program_desc.cc

@@ -59,9 +59,12 @@ ProgramDesc::ProgramDesc() {
 ProgramDesc::ProgramDesc(const ProgramDesc &o) {
   desc_ = o.desc_;
+  std::vector<framework::BlockDesc *> old_block_desc;
   for (int i = 0; i < desc_.blocks_size(); ++i) {
     auto *block = desc_.mutable_blocks(i);
     blocks_.emplace_back(new BlockDesc(*o.blocks_[i], block, this));
+    // record all block desc's ptr from origin program
+    old_block_desc.emplace_back(o.blocks_[i].get());
   }
   for (size_t block_id = 0; block_id < blocks_.size(); ++block_id) {
     auto all_ops = blocks_[block_id]->AllOps();
@@ -70,9 +73,21 @@ ProgramDesc::ProgramDesc(const ProgramDesc &o) {
       for (const std::string &attr_name : op->AttrNames()) {
         if (op->GetAttrType(attr_name) == proto::AttrType::BLOCK) {
-          int sub_block_id =
-              o.Block(block_id).Op(op_id)->GetBlockAttrId(attr_name);
-          op->SetBlockAttr(attr_name, MutableBlock(sub_block_id));
+          framework::BlockDesc *block_desc =
+              BOOST_GET_CONST(framework::BlockDesc *, op->GetAttr(attr_name));
+          if (std::find(old_block_desc.begin(), old_block_desc.end(),
+                        block_desc) != old_block_desc.end()) {
+            // The block is owned by the origin program. Just use id to get
+            // the corresponding block.
+            int sub_block_id =
+                o.Block(block_id).Op(op_id)->GetBlockAttrId(attr_name);
+            op->SetBlockAttr(attr_name, MutableBlock(sub_block_id));
+          } else {
+            // The block is not owned by the origin program. Should copy
+            // the real block desc instead of logical block in the program.
+            VLOG(3) << "Set op's block attr with the original block";
+            op->SetBlockAttr(attr_name, block_desc);
+          }
         } else if (op->GetAttrType(attr_name) == proto::AttrType::BLOCKS) {
           std::vector<int> sub_block_ids =
               o.Block(block_id).Op(op_id)->GetBlocksAttrIds(attr_name);
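Why the copy constructor needs this: the run_program op that cuda_graph_transform (added later in this commit) inserts into the original program carries a 'global_block' attribute pointing into a separate section program, so when such a ProgramDesc is copied the block attr can no longer be resolved by block id within the copied program. A rough sketch of a path that would exercise the new branch, assuming Program.clone() copies the underlying ProgramDesc:

# Illustrative only: after the transform, main_prog's run_program op references
# a block owned by one of the returned section programs, not by main_prog itself.
section_programs = cuda_graph_transform(main_prog)
cloned = main_prog.clone()  # copies the ProgramDesc; the foreign BlockDesc pointer
                            # is kept as-is by the new else-branch above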
paddle/fluid/operators/run_program_op.h

@@ -257,7 +257,12 @@ class RunProgramOpKernel : public framework::OpKernel<T> {
     auto input_var_names = ctx.InputNames("X");
     auto output_var_names = ctx.OutputNames("Out");
-    auto dout_var_names = ctx.OutputNames("DOut");
+    std::vector<std::string> dout_var_names;
+    if (!dout_vars.empty()) {
+      // DOut is a dispensable out, only get the names when it exists.
+      // Otherwise, it will throw a NotFound error.
+      dout_var_names = ctx.OutputNames("DOut");
+    }

     // current program may not hold parameters
     std::vector<std::string> param_names;
@@ -272,10 +277,23 @@ class RunProgramOpKernel : public framework::OpKernel<T> {
     // NOTE(chenweihang): In order not to add new variable type, use vector
     // here. Originally, here can use scope directly.
     auto *out_scope_vec = ctx.Output<StepScopeVar>("OutScope");
-    PADDLE_ENFORCE_EQ(
-        out_scope_vec->size(), 1,
-        platform::errors::InvalidArgument(
-            "The OutScope of RunProgramGradOp should only hold one scope."));
+    std::unique_ptr<framework::Scope> inner_scope{nullptr};
+    if (out_scope_vec->size() == 0) {
+      // For cuda graph under static mode usage.
+      // For static mode, we cannot set value of a tensor before any run,
+      // the OutScope variable passed to the op actually contains nothing.
+      // Just create a tmp scope to run the program.
+      PADDLE_ENFORCE_EQ(
+          use_cuda_graph, true,
+          platform::errors::InvalidArgument(
+              "If not provide OutScope then must run under cuda graph mode."));
+      inner_scope = std::make_unique<framework::Scope>();
+    } else {
+      PADDLE_ENFORCE_EQ(
+          out_scope_vec->size(), 1,
+          platform::errors::InvalidArgument(
+              "The OutScope of RunProgramGradOp should only hold one scope."));
+    }

     // Step 2. prepare executor and init persistable variables
@@ -284,9 +302,10 @@ class RunProgramOpKernel : public framework::OpKernel<T> {
     // Learning. Tensor data in multi-step training should be saved into single
     // scope separately. Otherwise, the gradients can be miscalculated because
     // always using the Tensor data of the last step in forward.
-    framework::Scope *global_inner_scope = out_scope_vec->front();
+    framework::Scope *global_inner_scope =
+        out_scope_vec->size() == 0 ? inner_scope.get() : out_scope_vec->front();
     VLOG(2) << "The number of sub scopes before forward: "
-            << out_scope_vec->front()->kids().size();
+            << global_inner_scope->kids().size();
     framework::Scope &scope = global_inner_scope->NewScope();

     // share input_vars & parameters into scope
@@ -341,13 +360,19 @@ class RunProgramOpKernel : public framework::OpKernel<T> {
                 &scope);

     // Debug info: scope info when run end
-    VLOG(3) << framework::GenScopeTreeDebugInfo(out_scope_vec->front());
+    framework::Scope *target_scope{nullptr};
+    if (out_scope_vec->size() == 0) {
+      target_scope = inner_scope.get();
+    } else {
+      target_scope = out_scope_vec->front();
+    }
+    VLOG(3) << framework::GenScopeTreeDebugInfo(target_scope);

     // Step 5. Drop all children scopes while testing.
     if (is_test) {
-      out_scope_vec->front()->DropKids();
+      target_scope->DropKids();
     }
     VLOG(2) << "The number of sub scopes after forward: "
-            << out_scope_vec->front()->kids().size();
+            << target_scope->kids().size();
 #ifdef PADDLE_WITH_MKLDNN
     if (FLAGS_use_mkldnn) platform::DontClearMKLDNNCache(ctx.GetPlace());
 #endif
paddle/fluid/platform/cuda_graph_with_memory_pool.cc

@@ -28,6 +28,16 @@ void BeginCUDAGraphCapture(platform::CUDAPlace place,
   auto *dev_ctx = platform::DeviceContextPool::Instance().GetByPlace(place);
   dev_ctx->cudnn_workspace_handle().ResetWorkspace();
+  // After PR(#43206), cudnn related initializations will change to lazy mode.
+  // It will only be initialized when op calls them. But cuda graph not support
+  // capture such kind of init, need to init all these handle before cuda graph.
+  dev_ctx->cublas_handle();
+#if CUDA_VERSION >= 11060
+  dev_ctx->cublaslt_handle();
+#endif
+  dev_ctx->cudnn_handle();
+  dev_ctx->cusolver_dn_handle();
+
   auto stream = dev_ctx->stream();
   CUDAGraph::BeginCapture(place, stream, mode);
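These handles are created eagerly here because anything initialized lazily inside a capture would be recorded into the graph itself. As a reminder of the flow that reaches BeginCUDAGraphCapture, a hedged sketch using the existing dygraph-side CUDAGraph wrapper from this same module (illustrative, not part of this commit):

import paddle
from paddle.device.cuda.graphs import CUDAGraph

x = paddle.rand([3, 10])
g = CUDAGraph()
g.capture_begin()                  # capture entry point backed by BeginCUDAGraphCapture
y = paddle.nn.functional.relu(x)   # kernels launched here are recorded, not executed
g.capture_end()
g.replay()                         # re-launch the recorded kernels
g.reset()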
python/paddle/device/cuda/graphs.py

@@ -14,7 +14,10 @@
 import os
 import paddle
+from paddle.fluid import core
+from paddle.fluid.layers.utils import _hash_with_id
 from paddle.fluid.core import is_compiled_with_cuda, is_compiled_with_rocm, CUDAPlace
+import warnings

 if is_compiled_with_cuda() and not is_compiled_with_rocm():
     from paddle.fluid.core import CUDAGraph as CoreCUDAGraph

@@ -106,3 +109,335 @@ def wrap_cuda_graph(function, mode="thread_local", memory_pool="default"):
(the first three lines below are unchanged context from wrap_cuda_graph; everything after them is newly added)

    else:
        mock_func._cuda_graph_pool_id = memory_pool._cuda_graph_pool_id
    return new_function


def copy_var_desc(dst, src):
    """
    copy var desc from src to dst
    :param dst: framework.VarDesc(cpp), dst var desc, cpp VarDesc instance
    :param src: framework.VarDesc(cpp), src var desc, cpp VarDesc instance
    :return: no return
    """
    dst.set_shape(src.shape)
    dst.set_dtype(src.dtype)
    dst.set_lod_level(src.lod_level)
    dst.set_type(src.type)
    dst.set_persistable(src.persistable)
    dst.set_is_parameter(src.is_parameter)
    dst.set_stop_gradient(src.stop_gradient)


def all_inputs_of_later_op(block, begin_idx):
    """
    find all inputs of ops after an idx, used to determine the logical outputs of a cuda graph section
    :param block: framework.Block, the original block
    :param begin_idx: int, from which idx (not included) to find the later inputs
    :return: a list of input names for all ops behind begin_idx
    """
    ins = []
    for idx, op in enumerate(block.ops):
        if idx <= begin_idx:
            continue
        for in_name in op.input_arg_names:
            ins.append(in_name)
    return list(set(ins))


def construct_program_and_find_ins_outs(section, origin_program, section_idx):
    """
    1. Construct a new program for the corresponding section
    2. Find all the logical inputs and outputs of the program section
    :param section: list, one cuda graph section, a list of ops
    :param origin_program: framework.Program, the origin program
    :param section_idx: list, the idx of the ops belonging to this cuda graph section
    :return: a new program for the cuda graph section,
             the logical ins and outs of the cuda graph section
    """
    program = paddle.static.Program()
    block = program.global_block()
    origin_block = origin_program.global_block()
    ins = []
    outs = []
    op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName()
    later_ins = all_inputs_of_later_op(origin_block, section_idx[-1])

    for op in section:
        for in_name in op.input_arg_names:
            var = origin_block.var(in_name)
            new_var_desc = block.desc.var(var.name.encode("ascii"))
            copy_var_desc(new_var_desc, var)
            if outs.count(in_name) == 0 and ins.count(in_name) == 0:
                # This input var is generated by an op outside this section.
                # Only record it once for the same input.
                ins.append(in_name)
            elif later_ins.count(in_name) == 0:
                # This var is generated by an op inside this section and will
                # only be used inside this section.
                outs.remove(in_name)
        for out_name in op.output_arg_names:
            var = origin_block.var(out_name)
            new_var_desc = block.desc.var(var.name.encode("ascii"))
            copy_var_desc(new_var_desc, var)
            # for every output, we add it to the section's outs
            if outs.count(out_name) == 0:
                # Only record one out var even if it will be generated by multiple ops.
                # For a scenario like this:
                # A = op1(a)
                # A = op2(b)
                # B = op3(A)
                outs.append(out_name)
        new_op_desc = block.desc.append_op()
        new_op_desc.copy_from(op.desc)
        new_op_desc._set_attr(op_role_attr_name, op.attr(op_role_attr_name))
    program._sync_with_cpp()

    return program, [ins, outs]


def get_cuda_graph_sections(program):
    """
    get all sections that should run under cuda graph and the corresponding idx
    :param program: framework.Program, the original program
    :return: a list of cuda graph sections, the corresponding ops' idx in the block,
             and whether the program is under is_test or not
    """
    block = program.global_block()
    cuda_graph_sections = []  # all ops in every cuda graph section
    sections_idx = []  # idx of all ops in every cuda graph section
    is_test = False  # will be set to True if any op's 'is_test' attr is True

    # ops (and their idx) between cuda-graph-wrapped ops, which may belong to a section
    internal_section = []
    internal_idx = []

    current_section = []  # ops of the cuda graph section being recorded
    current_idx = []  # idx of the cuda graph ops being recorded
    current_cuda_graph_id = -1  # cuda graph id being recorded
    op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName()
    loss_op_role = int(core.op_proto_and_checker_maker.OpRole.Loss)
    backward_op_role = int(core.op_proto_and_checker_maker.OpRole.Backward)
    loss_grad_op_role = loss_op_role | backward_op_role

    for idx, op in enumerate(block.ops):
        if op.type == 'conditional_block' or op.type == 'while':
            assert op._cuda_graph_attr is None, "Cuda graph not support conditional block op and while op."
        if op.has_attr('is_test') and op.attr('is_test'):
            is_test = True
        # find cuda graph sections
        if op._cuda_graph_attr is not None:
            assert isinstance(op._cuda_graph_attr,
                              str), "cuda_graph_attr should be a str"
            cuda_graph_attrs = op._cuda_graph_attr.split(';')
            assert len(cuda_graph_attrs) == 3, "cuda graph attr should have three fields: " \
                                               "cuda graph mode, cuda graph memory pool id, cuda graph id"
            local_cuda_graph_id = int(cuda_graph_attrs[2])
            if local_cuda_graph_id == current_cuda_graph_id:
                if len(internal_section) > 0:
                    assert len(internal_section) == len(internal_idx), \
                        "len of internal section should be equal with len of internal idx"
                    for internal_op in internal_section:
                        loss_related = (int(internal_op.attr(op_role_attr_name)) == loss_op_role) or \
                                       (int(internal_op.attr(op_role_attr_name)) == loss_grad_op_role)
                        sub_block_related = (op.type == 'conditional_block'
                                             or op.type == 'while')
                        if loss_related or sub_block_related:
                            # If loss_related is True:
                            # the internal section contains loss related ops;
                            # although these ops sit between two cuda graph sections with the same graph id,
                            # they belong to neither of the two sections.
                            # Loss related ops should be wrapped by the user explicitly.

                            # If sub_block_related is True:
                            # the internal section contains a while op or conditional_block op.
                            # These two ops are not supported by cuda graph, so the section won't be extended.
                            internal_section = []
                            internal_idx = []
                            # Besides clearing the internal section, a new cuda graph section should be recorded.
                            assert len(current_section) == len(current_idx), \
                                "num of section's op is not equal with the idx"
                            if len(current_section) > 0:
                                # store previous section
                                cuda_graph_sections.append(current_section)
                                sections_idx.append(current_idx)
                            current_section = []
                            current_idx = []
                            break
                # ops inserted by some optimizer, should be added to the current section
                for i in range(len(internal_section)):
                    current_section.append(internal_section[i])
                    current_idx.append(internal_idx[i])
                internal_section = []
                current_section.append(op)
                current_idx.append(idx)
            else:
                # The current graph id differs from the previous one, so start a new cuda graph section.
                # The internal ops and idx belong to no section, just clear them.
                internal_section = []
                internal_idx = []
                current_cuda_graph_id = local_cuda_graph_id  # start recording a new section
                assert len(current_section) == len(current_idx), \
                    "num of section's op is not equal with num of idx"
                if len(current_section) > 0:
                    # store previous section
                    cuda_graph_sections.append(current_section)
                    sections_idx.append(current_idx)
                current_section = [op]
                current_idx = [idx]
        else:
            # record ops whose cuda_graph_attr is None; they may belong to a section
            internal_section.append(op)
            internal_idx.append(idx)

    # handle the last section
    assert len(current_section) == len(current_idx), \
        "num of section's op is not equal with num of idx"
    if len(current_section) > 0:
        # store the last section
        cuda_graph_sections.append(current_section)
        sections_idx.append(current_idx)

    return cuda_graph_sections, sections_idx, is_test


def replace_cuda_graph_section(ins_and_outs, section_program, section_idx,
                               origin_program, cuda_graph_section, order,
                               is_test):
    """
    Use section_program and ins_and_outs to initialize a run_program op,
    and replace the ops marked by section_idx in the origin program.
    :param ins_and_outs: list, the logical ins and outs of the section program
    :param section_program: framework.Program, the partial program to run under cuda graph
    :param section_idx: list, the idx to be removed from the origin program
    :param origin_program: framework.Program, the origin program
    :param cuda_graph_section: list, the ops in the current section, used to get the mode, memory pool id and is_test
    :param order: int, the order of the current section, used to create a unique cuda graph var
    :param is_test: bool, whether the program is running under is_test or not
    :return: no return
    """
    ins = ins_and_outs[0]
    outs = ins_and_outs[1]
    insert_idx = section_idx[0]
    origin_block = origin_program.global_block()

    for idx in reversed(section_idx):
        # remove all cuda graph marked ops from the origin block
        origin_block._remove_op(idx, sync=False)

    mode = None
    memory_pool_id = None

    for op in cuda_graph_section:
        # find the cuda graph mode and memory pool id
        if op._cuda_graph_attr is not None:
            attrs = op._cuda_graph_attr.split(';')
            mode = attrs[0]
            memory_pool_id = int(attrs[1])
            break

    assert mode is not None and memory_pool_id is not None, \
        "mode and memory pool id should be specified in cuda graph attr"

    cuda_graph_var = origin_block.create_var(
        name="cuda_graph_" + str(order),
        type=core.VarDesc.VarType.RAW,
        persistable=True,
        stop_gradient=True, )

    # not used by the run_program op itself, just required by the op, but won't be used
    out_scope_var = origin_block.create_var(
        name="program_out_scope_" + str(order),
        type=core.VarDesc.VarType.STEP_SCOPES,
        persistable=True,
        stop_gradient=True, )

    program_id = _hash_with_id(section_program, ins_and_outs)

    # insert the run_program op into the block
    origin_block._insert_op(
        insert_idx,
        type='run_program',
        inputs={'X': ins},
        outputs={
            'Out': outs,
            'OutScope': out_scope_var,
            'CUDAGraph': cuda_graph_var
        },
        attrs={
            'global_block': section_program.global_block(),
            'start_op_index': 0,
            'end_op_index': len(section_program.global_block().ops),
            'is_test': is_test,
            'program_id': program_id,
            'cuda_graph_capture_mode': mode,
            'cuda_graph_pool_id': memory_pool_id,
        })


def cuda_graph_transform(program):
    """
    replace the ops marked with cuda_graph_attr with run_program ops to use cuda graph
    :param program: framework.Program, the program to be transformed
    :return: the cuda graph section programs; the caller should hold these programs!
    """

    if len(program.blocks) > 1:
        # some sub blocks may be inserted by an optimizer but not used during training, just warn here
        warnings.warn(
            "Sub block(s) has been detected in the program. "
            "Cuda graph not support op with sub block, and it will only handle the global block."
        )

    # step 1: get all cuda graph sections.
    # A cuda graph section contains all ops marked with the same cuda graph id and
    # some ops inserted by optimizers (amp, sharding for example) between ops with the same id.
    cuda_graph_sections, sections_idx, is_test = get_cuda_graph_sections(
        program)
    assert len(cuda_graph_sections) == len(sections_idx), \
        "num of cuda graph sections is not equal with num of idx sections"

    # step 2: construct a new program for each section and find the inputs and outputs of each section.
    # The inputs are variables generated outside the section but used by this section.
    # The outputs are variables generated by this section and used after the end of the section.
    ins_and_outs = []
    section_programs = []
    for i in range(len(cuda_graph_sections)):
        # create a new program for the current section
        section_program, ins_outs = construct_program_and_find_ins_outs(
            cuda_graph_sections[i], program, sections_idx[i])
        ins_and_outs.append(ins_outs)
        section_programs.append(section_program)
    assert len(section_programs) == len(cuda_graph_sections), \
        "the num of cuda graph sections should be equal with the num of new programs"

    # step 3: replace the ops in the original program with run_program ops.
    # Remove all ops of each section from the origin program and replace them with a run_program op.
    for i in reversed(range(len(cuda_graph_sections))):
        # carry out the replacement in reversed order, to keep the earlier idx intact
        replace_cuda_graph_section(
            ins_and_outs[i],
            section_programs[i],
            sections_idx[i],
            program,
            cuda_graph_sections[i],
            order=i,
            is_test=is_test)

    # NOTE: the caller should hold these programs; for now just return them to the caller
    return section_programs
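For reference, derived from the asserts and parsing above (not a new API): each wrapped op carries a _cuda_graph_attr string of the form "<mode>;<memory_pool_id>;<cuda_graph_id>". A tiny illustrative reader; the helper name parse_cuda_graph_attr is made up here and does not exist in the module:

def parse_cuda_graph_attr(cuda_graph_attr):
    # e.g. "thread_local;0;1" -> ("thread_local", 0, 1)
    mode, pool_id, graph_id = cuda_graph_attr.split(';')
    return mode, int(pool_id), int(graph_id)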
python/paddle/fluid/tests/unittests/test_cuda_graph_partial_graph_static_run.py (new file, mode 0 → 100644)
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import unittest
import numpy as np
from paddle.device.cuda.graphs import wrap_cuda_graph, is_cuda_graph_supported, cuda_graph_transform

paddle.enable_static()


class SimpleModel(nn.Layer):
    def __init__(self, in_size, out_size):
        super(SimpleModel, self).__init__()
        self.linear = nn.Linear(in_size, out_size)
        self.dropout_1 = paddle.nn.Dropout(0.1)
        self.relu = nn.ReLU()
        self.dropout_2 = paddle.nn.Dropout(0.5)
        self.gelu = nn.GELU()

    def forward(self, x):
        x = self.linear(x)
        x = self.dropout_1(x)
        x = self.relu(x)
        x = self.dropout_2(x)
        x = self.gelu(x)
        return x


class TestCudaGraphAttrAll(unittest.TestCase):
    def setUp(self):
        paddle.set_flags({'FLAGS_eager_delete_tensor_gb': 0.0})

    def get_model(self, use_cuda_graph=False):
        x = paddle.static.data(shape=[3, 10], dtype='float32', name='x')

        model_start = SimpleModel(10, 20)
        if use_cuda_graph:
            model_start = wrap_cuda_graph(model_start)

        model_inter = SimpleModel(20, 20)

        model_end = SimpleModel(20, 10)
        if use_cuda_graph:
            model_end = wrap_cuda_graph(model_end, memory_pool='new')

        start_out = model_start(x)
        inter_out = model_inter(start_out)
        end_out = model_end(inter_out)
        loss = paddle.mean(end_out)

        opt = paddle.optimizer.SGD()
        opt.minimize(loss)

        return loss

    def run_with_cuda_graph(self, x_data):
        # run with cuda graph
        paddle.seed(1024)

        main_prog = paddle.static.Program()
        start_prog = paddle.static.Program()
        with paddle.static.program_guard(main_prog, start_prog):
            loss = self.get_model(use_cuda_graph=True)

        section_programs = cuda_graph_transform(main_prog)
        assert len(section_programs) == 4

        block = main_prog.global_block()
        run_program_op_num = 0
        for op in block.ops:
            if op.type == 'run_program':
                run_program_op_num += 1
        assert run_program_op_num == 4

        exe = paddle.static.Executor(paddle.CUDAPlace(0))
        exe.run(start_prog)

        for i in range(10):
            rst = exe.run(main_prog, feed={'x': x_data}, fetch_list=[loss])

        return rst

    def normal_run(self, x_data):
        # run without cuda graph
        paddle.seed(1024)

        main_prog = paddle.static.Program()
        start_prog = paddle.static.Program()
        with paddle.static.program_guard(main_prog, start_prog):
            loss = self.get_model()

        exe = paddle.static.Executor(paddle.CUDAPlace(0))
        exe.run(start_prog)

        for i in range(10):
            rst = exe.run(main_prog, feed={'x': x_data}, fetch_list=[loss])

        return rst

    def test_static_mode_cuda_graph(self):
        if not is_cuda_graph_supported():
            return
        x_data = np.random.random((3, 10)).astype('float32')
        cuda_graph_rst = self.run_with_cuda_graph(x_data)
        normal_run_rst = self.normal_run(x_data)
        assert np.array_equal(cuda_graph_rst, normal_run_rst)


if __name__ == "__main__":
    unittest.main()