Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
4d2994cb
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
4d2994cb
编写于
8月 11, 2021
作者:
Y
Yuang Liu
提交者:
GitHub
8月 11, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Optimize fused allreduce in raw program (#34509)
上级
6a9fac14
变更
1
隐藏空白更改
内联
并排
Showing
1 changed file
with
116 addition
and
60 deletion
+116
-60
python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py
...istributed/fleet/meta_optimizers/raw_program_optimizer.py
+116
-60
未找到文件。
python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py
浏览文件 @
4d2994cb
...
...
@@ -217,9 +217,13 @@ class RawProgramOptimizer(MetaOptimizerBase):
block
=
self
.
main_program
.
global_block
()
ring_id
=
self
.
global_ring_id
param_grads
=
[]
first_backward_idx
=
-
1
# find all grad params
for
op
in
reversed
(
block
.
ops
):
for
idx
,
op
in
enumerate
(
block
.
ops
):
if
first_backward_idx
==
-
1
and
\
is_backward_op
(
op
):
first_backward_idx
=
idx
if
is_backward_op
(
op
)
and
\
OP_ROLE_VAR_KEY
in
op
.
attr_names
:
op_role_var
=
op
.
attr
(
OP_ROLE_VAR_KEY
)
...
...
@@ -234,70 +238,100 @@ class RawProgramOptimizer(MetaOptimizerBase):
grad
=
block
.
var
(
grad_name
)
if
param
.
is_distributed
:
continue
param_grads
.
append
(
grad
)
param_grads
.
append
((
param
,
grad
))
outputs_name_to_idx
=
self
.
__get_ouputs_name_to_idx
(
first_backward_idx
,
block
)
segments
=
[]
# structure of grad_param_segments is
# [([grad0, grad1], [param0, param1]), ([grad2, grad3], [param2, param3])]
# each entry of the list is a tuple stores the grads segment list and
# the corresponding params segment list
grad_param_segments
=
[]
last_dtype
=
None
# split the grad based on dtype and fused size
for
var
in
param_grads
:
if
len
(
segments
)
==
0
\
or
len
(
segments
[
-
1
])
==
self
.
fuse_grad_size_in_num
\
or
var
.
dtype
!=
last_dtype
:
segments
.
append
([
var
]
)
last_dtype
=
var
.
dtype
for
param
,
grad
in
param_grads
:
if
len
(
grad_param_
segments
)
==
0
\
or
len
(
grad_param_segments
[
-
1
][
0
])
==
self
.
fuse_grad_size_in_num
\
or
grad
.
dtype
!=
last_dtype
:
grad_param_segments
.
append
(([
grad
],
[
param
])
)
last_dtype
=
grad
.
dtype
else
:
segments
[
-
1
].
append
(
var
)
grad_param_segments
[
-
1
][
0
].
append
(
grad
)
grad_param_segments
[
-
1
][
1
].
append
(
param
)
fused_vars
=
[]
for
idx
,
op
in
enumerate
(
block
.
ops
):
if
is_optimizer_op
(
op
):
for
segment
in
segments
:
# insert coalesce tensor
tmp_var
=
block
.
create_var
(
name
=
unique_name
.
generate
(
'FusedOutput_{}'
.
format
(
segment
[
0
].
name
)),
dtype
=
segment
[
0
].
dtype
,
persistable
=
True
,
stop_gradient
=
True
)
fused_vars
.
append
(
tmp_var
)
block
.
_insert_op_without_sync
(
idx
,
type
=
"coalesce_tensor"
,
inputs
=
{
"Input"
:
segment
},
outputs
=
{
"Output"
:
segment
,
"FusedOutput"
:
tmp_var
},
attrs
=
{
"copy_data"
:
True
,
"use_align"
:
True
,
"dtype"
:
segment
[
0
].
dtype
,
OP_ROLE_KEY
:
OpRole
.
Backward
})
break
if
len
(
grad_param_segments
)
==
0
:
return
# insert the allreduce_sum op
for
idx
,
op
in
enumerate
(
block
.
ops
):
if
is_optimizer_op
(
op
):
for
fused_var
in
fused_vars
:
block
.
_insert_op_without_sync
(
idx
,
type
=
'c_allreduce_sum'
,
inputs
=
{
'X'
:
fused_var
},
outputs
=
{
'Out'
:
fused_var
},
attrs
=
{
'ring_id'
:
ring_id
,
'use_calc_stream'
:
self
.
calc_comm_same_stream
,
OP_ROLE_KEY
:
OpRole
.
Backward
})
if
not
self
.
calc_comm_same_stream
:
block
.
_insert_op_without_sync
(
idx
,
type
=
'c_sync_calc_stream'
,
inputs
=
{
'X'
:
fused_var
},
outputs
=
{
'Out'
:
fused_var
},
attrs
=
{
OP_ROLE_KEY
:
OpRole
.
Backward
})
break
fused_vars
=
[
None
]
*
len
(
grad_param_segments
)
for
i
in
range
(
len
(
grad_param_segments
)
-
1
,
-
1
,
-
1
):
# travers the grad_param_segments in backward
# not to use reversed since needs the absolute index value
grad_segment
,
param_segment
=
grad_param_segments
[
i
]
# insert coalesce tensor
fused_var
=
block
.
create_var
(
name
=
unique_name
.
generate
(
'FusedOutput_{}'
.
format
(
grad_segment
[
0
].
name
)),
dtype
=
grad_segment
[
0
].
dtype
,
persistable
=
False
,
stop_gradient
=
True
)
fused_vars
[
i
]
=
fused_var
after_idx
=
outputs_name_to_idx
[
grad_segment
[
-
1
]][
1
]
block
.
_insert_op_without_sync
(
after_idx
+
1
,
type
=
'c_allreduce_sum'
,
inputs
=
{
'X'
:
fused_var
},
outputs
=
{
'Out'
:
fused_var
},
attrs
=
{
'ring_id'
:
ring_id
,
'use_calc_stream'
:
self
.
calc_comm_same_stream
,
OP_ROLE_KEY
:
OpRole
.
Backward
})
if
not
self
.
calc_comm_same_stream
:
block
.
_insert_op_without_sync
(
after_idx
+
1
,
type
=
'c_sync_calc_stream'
,
inputs
=
{
'X'
:
fused_var
},
outputs
=
{
'Out'
:
fused_var
},
attrs
=
{
OP_ROLE_KEY
:
OpRole
.
Backward
})
if
len
(
fused_vars
)
==
0
:
# update the outputs_name_to_idx after insertion of sync/allreduce ops
outputs_name_to_idx
=
self
.
__get_ouputs_name_to_idx
(
first_backward_idx
,
block
)
# the before_idx is not guaranteed sorted, therefore we have to find the
# topology to insert the coalesce ops
pos_for_coalesce
=
{}
for
i
in
range
(
len
(
grad_param_segments
)
-
1
,
-
1
,
-
1
):
# We separate the insertion of coalesce op and the insertion of sync/allreduce op,
# since that the coalesce op's insertion may invalidate the outputs_name_to_idx
grad_segment
,
param_segment
=
grad_param_segments
[
i
]
before_idx
=
len
(
block
.
ops
)
for
grad
in
outputs_name_to_idx
:
before_idx
=
min
(
before_idx
,
outputs_name_to_idx
[
grad
][
0
])
pos_for_coalesce
[
i
]
=
before_idx
# insert the coalesce op based on the sorted before_idx
pos_for_coalesce
=
sorted
(
pos_for_coalesce
.
items
(),
key
=
lambda
kv
:
(
kv
[
1
],
kv
[
0
]),
reverse
=
True
)
for
i
,
before_idx
in
pos_for_coalesce
:
grad_segment
,
param_segment
=
grad_param_segments
[
i
]
fused_var
=
fused_vars
[
i
]
block
.
_insert_op_without_sync
(
before_idx
,
type
=
"coalesce_tensor"
,
inputs
=
{
"Input"
:
param_segment
},
outputs
=
{
"Output"
:
grad_segment
,
"FusedOutput"
:
fused_var
},
attrs
=
{
"copy_data"
:
False
,
"use_align"
:
True
,
"dtype"
:
grad_segment
[
0
].
dtype
,
OP_ROLE_KEY
:
OpRole
.
Backward
})
if
self
.
calc_comm_same_stream
:
block
.
_sync_with_cpp
()
return
...
...
@@ -307,9 +341,31 @@ class RawProgramOptimizer(MetaOptimizerBase):
block
.
_insert_op_without_sync
(
idx
,
type
=
'c_sync_comm_stream'
,
inputs
=
{
'X'
:
fused_vars
[
0
]},
outputs
=
{
'Out'
:
fused_vars
[
0
]},
inputs
=
{
'X'
:
grad_segment
[
0
]},
outputs
=
{
'Out'
:
grad_segment
[
0
]},
attrs
=
{
'ring_id'
:
ring_id
,
OP_ROLE_KEY
:
OpRole
.
Backward
})
break
block
.
_sync_with_cpp
()
def
__get_ouputs_name_to_idx
(
self
,
first_backward_idx
,
block
):
# Each item of outputs_name_to_idx is a pair of idx.
# The first entry of this pair is the idx of the first op generates the grad,
# which is used to indicate the position to insert coalesce op.
# The second entry of this pair is the idx of the last op generates the grad,
# which is used to indicate the position to insert sync and allreduce op.
outputs_name_to_idx
=
{}
for
idx
in
range
(
first_backward_idx
,
len
(
block
.
ops
)):
op
=
block
.
ops
[
idx
]
if
is_optimizer_op
(
op
):
break
for
name
in
op
.
output_arg_names
:
var
=
block
.
var
(
name
)
if
not
outputs_name_to_idx
.
get
(
var
):
# if the grad only be generated by one op
# the first idx and the last ids are identical
outputs_name_to_idx
[
var
]
=
(
idx
,
idx
)
else
:
outputs_name_to_idx
[
var
]
=
(
outputs_name_to_idx
[
var
][
0
],
idx
)
return
outputs_name_to_idx
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录