PaddlePaddle / Paddle, commit 8c895085

Author: sandyhouse
Date: Feb 07, 2021
Parent: 920806db
Commit message: update, test=develop

Showing 2 changed files with 539 additions and 204 deletions:

    python/paddle/fluid/contrib/mixed_precision/fp16_utils.py  (+7, -3)
    python/paddle/fluid/optimizer.py                           (+532, -201)

python/paddle/fluid/contrib/mixed_precision/fp16_utils.py  (+7, -3)

@@ -123,7 +123,8 @@ def _insert_cast_op(block, op, idx, src_dtype, dest_dtype):
                 outputs={"Out": out_var},
                 attrs={
                     "in_dtype": in_var.dtype,
-                    "out_dtype": out_var.dtype
+                    "out_dtype": out_var.dtype,
+                    "op_device": op.attr("op_device")
                 })
             num_cast_ops += 1
         _rename_arg(op, in_var.name, out_var.name)

@@ -171,8 +172,11 @@ def _insert_cast_post_op(block, op, idx, src_dtype, dest_dtype, target_name,
         type="cast",
         inputs={"X": target_var},
         outputs={"Out": cast_var},
-        attrs={"in_dtype": target_var.dtype,
-               "out_dtype": cast_var.dtype})
+        attrs={
+            "in_dtype": target_var.dtype,
+            "out_dtype": cast_var.dtype,
+            "op_device": op.attr("op_device")
+        })
     num_cast_ops += 1
     op_var_rename_map[block.idx][target_var.name] = cast_var.name

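Illustrative note (not part of the commit): both hunks make the same change. The cast op that AMP inserts now also carries the op_device attribute of the op it was inserted for, so the pipeline pass can later place the cast on the same device. A minimal sketch of the resulting attrs dict, using a hypothetical helper name:

def build_cast_attrs(in_var, out_var, op):
    # hypothetical helper; mirrors the attrs built in both hunks above.
    # `op` is the op the cast serves, so the cast is pinned to its device.
    return {
        "in_dtype": in_var.dtype,
        "out_dtype": out_var.dtype,
        "op_device": op.attr("op_device"),
    }
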
python/paddle/fluid/optimizer.py  (+532, -201)

@@ -19,6 +19,7 @@ import six
 import os
 import logging
 from collections import defaultdict
+import time
 import paddle
 from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table

@@ -3759,15 +3760,21 @@ class PipelineOptimizer(object):
     def __init__(self, optimizer, num_microbatches=1, start_cpu_core_id=0):
         if framework.in_dygraph_mode():
             raise Exception("In dygraph, don't support PipelineOptimizer.")
-        if not isinstance(optimizer, Optimizer) and not isinstance(
-                optimizer, paddle.optimizer.Optimizer) and not isinstance(
-                    optimizer, paddle.fluid.contrib.mixed_precision.decorator.
-                    OptimizerWithMixedPrecision):
+        supported_opt_types = (
+            Optimizer,
+            paddle.fluid.contrib.mixed_precision.decorator.OptimizerWithMixedPrecision)
+        if not isinstance(optimizer, supported_opt_types):
             raise ValueError("The 'optimizer' parameter for "
-                             "PipelineOptimizer must be an instance of "
-                             "Optimizer, but the given type is {}.".format(
-                                 type(optimizer)))
+                             "PipelineOptimizer must be an instance of one of "
+                             "{}, but the type is {}.".format(
+                                 supported_opt_types, type(optimizer)))
         self._optimizer = optimizer
+        # Get the original optimizer defined by users, such as SGD
+        self._origin_optimizer = self._optimizer
+        while hasattr(self._origin_optimizer, "inner_opt"):
+            self._origin_optimizer = self._origin_optimizer.inner_opt
         assert num_microbatches >= 1, (
             "num_microbatches must be a positive value.")
         self._num_microbatches = num_microbatches

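Usage sketch under assumptions (not taken from the commit): PipelineOptimizer wraps one of the supported optimizer types, and the new _origin_optimizer loop unwraps any meta-optimizer that exposes an inner_opt attribute until it reaches the user-defined optimizer such as SGD, whose _param_device_map the pipeline pass needs later in minimize():

import paddle.fluid as fluid

sgd = fluid.optimizer.SGD(learning_rate=0.01)
# num_microbatches controls how each mini-batch is split across pipeline stages
pipe_opt = fluid.optimizer.PipelineOptimizer(sgd, num_microbatches=4)

# equivalent of the unwrapping loop added to __init__
origin = pipe_opt._optimizer
while hasattr(origin, "inner_opt"):
    origin = origin.inner_opt  # e.g. AMP or sharding wrappers expose inner_opt
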
@@ -3783,50 +3790,141 @@ class PipelineOptimizer(object):
         self._param_device_map = None
 
     def _create_vars(self, block, ori_block):
-        # Create vars for block, copied from main_program's global block
+        # Create vars for block, copied from ori_block
         used_var_set = set()
         for op_idx in range(block.desc.op_size()):
-            op_desc = block.desc.op(op_idx)
-            vars = op_desc.input_arg_names() + op_desc.output_arg_names()
+            # Whether to insert allreduce_sum or allreduce_max op?
+            # For amp and global gradient clip strategies, we should
+            # get the global infomation, so allreduce op is needed.
+            should_insert = False
+            op = block.ops[op_idx]
+            # For op process vars on all devices, remove its input
+            # vars not in this block
+            reserved_x = []
+            if op.type == 'reduce_any' and self._is_optimize_op(op):
+                should_insert = True
+            if op.type == 'concat' and self._is_optimize_op(op):
+                for input_name in op.desc.input("X"):
+                    if block._find_var_recursive(input_name):
+                        reserved_x.append(input_name)
+                op.desc.set_input('X', reserved_x)
+                print('reserved_x:', reserved_x)
+            if op.type == 'update_loss_scaling':
+                for input_name in op.desc.input("X"):
+                    if block._find_var_recursive(input_name):
+                        reserved_x.append(input_name)
+                op.desc.set_input('X', reserved_x)
+                op.desc.set_output('Out', reserved_x)
+            if op.type == 'sum' and self._is_gradient_clip_op(op):
+                for input_name in op.desc.input("X"):
+                    if block._find_var_recursive(input_name):
+                        reserved_x.append(input_name)
+                op.desc.set_input('X', reserved_x)
+                should_insert = True
+            vars = op.desc.input_arg_names() + op.desc.output_arg_names()
             for var in vars:
                 # a var whose name contains "blocking_queue"
                 # only exists in startup program
                 if var in used_var_set or "_blocking_queue" in var:
                     continue
                 used_var_set.add(var)
+                if block._find_var_recursive(str(var)): continue
                 source_var = ori_block._var_recursive(str(var))
                 if source_var.type == core.VarDesc.VarType.READER:
-                    block.create_var(
+                    dest_var = block.create_var(
                         name=var,
                         type=core.VarDesc.VarType.READER,
                         persistable=source_var.persistable)
                 else:
-                    block._clone_variable(source_var, False)
+                    dest_var = block._clone_variable(source_var, False)
+                dest_var.stop_gradient = source_var.stop_gradient
+
+            if not should_insert: continue
+
+            out_name = op.desc.output_arg_names()[0]
+            out_var = block.var(out_name)
+            offset = 0
+            if op.type == "reduce_any":
+                # cast the bool var to int32 to use allreduce op
+                temp_var_name = unique_name.generate(out_name + "_cast_int32")
+                temp_var = block.create_var(
+                    name=temp_var_name, shape=[1], dtype="int32")
+                block._insert_op(
+                    op_idx + 1 + offset,
+                    type='cast',
+                    inputs={'X': out_var},
+                    outputs={'Out': temp_var},
+                    attrs={
+                        'in_dtype': out_var.dtype,
+                        'out_dtype': temp_var.dtype,
+                        self._op_role_key:
+                        core.op_proto_and_checker_maker.OpRole.Optimize
+                    })
+                offset += 1
+            # block._insert_op(
+            #     op_idx + 1 + offset,
+            #     type='c_sync_calc_stream',
+            #     inputs={'X': temp_var if op.type == "reduce_any" else out_var},
+            #     outputs={
+            #         'Out': temp_var if op.type == "reduce_any" else out_var
+            #     },
+            #     attrs={
+            #         OP_ROLE_KEY:
+            #         core.op_proto_and_checker_maker.OpRole.Optimize,
+            #     })
+            # offset += 1
+            block._insert_op(
+                op_idx + 1 + offset,
+                type='c_allreduce_max'
+                if op.type == "reduce_any" else 'c_allreduce_sum',
+                inputs={'X': temp_var if op.type == "reduce_any" else out_var},
+                outputs={
+                    'Out': temp_var if op.type == "reduce_any" else out_var
+                },
+                attrs={
+                    'ring_id': self.ring_id,
+                    self._op_role_key:
+                    core.op_proto_and_checker_maker.OpRole.Optimize,
+                    'use_calc_stream': True
+                })
+            offset += 1
+            # block._insert_op(
+            #     # op_idx + 1 + extra_index,
+            #     op_idx + 1 + offset,
+            #     type='c_sync_comm_stream',
+            #     inputs={'X': temp_var if op.type == "reduce_any" else out_var},
+            #     outputs={
+            #         'Out': temp_var if op.type == "reduce_any" else out_var
+            #     },
+            #     attrs={
+            #         'ring_id': self.ring_id,
+            #         OP_ROLE_KEY:
+            #         core.op_proto_and_checker_maker.OpRole.Optimize,
+            #     })
+            # offset += 1
+            if op.type == "reduce_any":
+                block._insert_op(
+                    op_idx + 1 + offset,
+                    type='cast',
+                    inputs={'X': temp_var},
+                    outputs={'Out': out_var},
+                    attrs={
+                        'in_dtype': temp_var.dtype,
+                        'out_dtype': out_var.dtype,
+                        self._op_role_key:
+                        core.op_proto_and_checker_maker.OpRole.Optimize
+                    })
 
     def _is_loss_grad_op(self, op):
-        if self._op_role_key not in op.attr_names:
-            return False
-        op_role = int(op.all_attrs()[self._op_role_key])
+        assert self._op_role_key in op.attr_names
+        op_role = int(op.attr(self._op_role_key))
         return op_role & int(self._op_role.Backward) and op_role & int(
             self._op_role.Loss)
 
-    def _is_backward_op(self, op):
-        return self._op_role_key in op.attr_names and int(op.all_attrs()[
-            self._op_role_key]) & int(self._op_role.Backward)
-
-    def _is_optimize_op(self, op):
-        return self._op_role_key in op.attr_names and int(op.all_attrs()[
-            self._op_role_key]) & int(self._op_role.Optimize)
-
-    def _is_update_op(self, op):
-        return 'Param' in op.input_names and 'Grad' in op.input_names and (
-            "LearningRate" in op.input_names)
-
     def _split_program(self, main_program, devices):
         """
         Split a program into sections according to devices that ops run on.
-        The ops of the role LRSched are copied to all sections.
+        The op whose op_device attr is "gpu:all" is copied to all sections.
 
         Args:
             main_program (Program): the main program

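Framework-free sketch (not part of the commit): the allreduce insertion above exists because values such as AMP's found_inf (the reduce_any output) or a global gradient clip sum are computed per pipeline stage, while every stage needs the global value. Booleans cannot go through allreduce directly, so the pass casts to int32, applies c_allreduce_max (or c_allreduce_sum for the clip case) on the communication ring, and casts back. What that sequence computes:

import numpy as np

def global_found_inf(stage_flags):
    # stage_flags: one local found_inf bool per pipeline stage
    as_int32 = np.array(stage_flags, dtype=np.int32)  # cast: bool -> int32
    reduced = int(as_int32.max())                     # stands in for c_allreduce_max
    return bool(reduced)                              # cast back: int32 -> bool

assert global_found_inf([False, True, False]) is True
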
@@ -3842,27 +3940,20 @@ class PipelineOptimizer(object):
         block = main_program.block(0)
         for op in block.ops:
             device = op.attr(self._op_device_key)
-            op_role = op.attr(self._op_role_key)
-            if int(op_role) & int(self._op_role.LRSched):
-                # Copy ops of the role LRSched to all sections.
-                for device in device_program_map.keys():
-                    program = device_program_map[device]
-                    op_desc = op.desc
-                    ap_op = program["program"].block(0).desc.append_op()
-                    ap_op.copy_from(op_desc)
-                    # ap_op._set_attr(self._op_device_key, "")
-            elif op.type == "create_py_reader" or op.type == "read" or \
-                op.type == "create_double_buffer_reader":
-                # Copy read related ops to all section to make them exit after each epoch.
+            # Copy ops whose op_device set to "gpu:all" to all sections.
+            if device == "gpu:all":
                 for device in device_program_map.keys():
                     program = device_program_map[device]
                     op_desc = op.desc
                     ap_op = program["program"].block(0).desc.append_op()
                     ap_op.copy_from(op_desc)
                     ap_op._set_attr(self._op_device_key, "")
             else:
                 program = device_program_map[device]
                 op_desc = op.desc
                 ap_op = program["program"].block(0).desc.append_op()
                 ap_op.copy_from(op_desc)
                 ap_op._set_attr(self._op_device_key, "")
 
         for key in devices:
             program = device_program_map[key]

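Illustrative helper (not the Paddle API): the hunk above replaces the role-based copy rule with the "gpu:all" placement marker. An op tagged "gpu:all" is appended to every section program; any other op goes only to the section of its own device:

def sections_for(op_device, all_devices):
    # mirrors the branch in _split_program shown above
    return list(all_devices) if op_device == "gpu:all" else [op_device]

assert sections_for("gpu:all", ["gpu:0", "gpu:1"]) == ["gpu:0", "gpu:1"]
assert sections_for("gpu:1", ["gpu:0", "gpu:1"]) == ["gpu:1"]
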
@@ -3921,6 +4012,11 @@ class PipelineOptimizer(object):
             var_name as output.
             var_name (string): Variable name.
         """
+        # To skip the cast op added by amp which has no op_device set
+        if '.cast_fp32' in var_name:
+            var_name = var_name.replace('.cast_fp32', '')
+        if '.cast_fp16' in var_name:
+            var_name = var_name.replace('.cast_fp16', '')
         post_op = []
         before = True
         for op in ops:

@@ -3982,9 +4078,10 @@ class PipelineOptimizer(object):
             dtype=ref_var.dtype,
             type=ref_var.type,
             lod_level=ref_var.lod_level,
-            persistable=False,
-            is_data=False,
+            persistable=ref_var.persistable,
+            is_data=ref_var.is_data,
             need_check_feed=ref_var.desc.need_check_feed())
+        new_var.stop_gradient = ref_var.stop_gradient
         return new_var
 
     def _get_data_var_info(self, block):

@@ -4046,6 +4143,7 @@ class PipelineOptimizer(object):
                     self._op_role_key: self._op_role.Forward,
                     'use_calc_stream': True,
                     'peer': dev_index,
+                    'ring_id': self.ring_id,
                 })
         # Get the device that that data on
         assert device in devices

@@ -4070,6 +4168,7 @@ class PipelineOptimizer(object):
                     self._op_role_key: self._op_role.Forward,
                     'peer': first_dev_index,
                     'use_calc_stream': True,
+                    'ring_id': self.ring_id,
                 })
 
     def _strip_grad_suffix(self, name):

@@ -4085,79 +4184,178 @@ class PipelineOptimizer(object):
         """
         return name + core.grad_var_suffix()
 
-    def _add_opdevice_attr_for_regularization_clip(self, block):
-        """
-        Add op_device attribute for regulization and clip ops.
-        """
-        for op in block.ops:
-            # role for regularization and clip ops is optimize
-            if int(op.attr(self._op_role_key)) != int(self._op_role.Optimize):
-                continue
-            if op.has_attr(self._op_device_key) and (
-                    op.attr(self._op_device_key) != ""):
-                continue
-            assert self._op_role_var_key in op.attr_names
-            op_role_var = op.all_attrs()[self._op_role_var_key]
-            assert len(op_role_var) == 2
-            param_name = op_role_var[0]
-            device = self._param_device_map[param_name]
-            op._set_attr(self._op_device_key, device)
-
-    def _add_default_opdevice_attr(self, block):
-        """
-        1. Add default op_device attribute for lr-related ops.
-           The default value is the one that of the first place.
-        2. Add default op_device attribute for sum ops added during
-           backward. For these ops, we set the op_device attribute
-           as the one of its post op, i.e, which op has the output of the
-           sum op as an input.
-        """
-        first_devcie = ""
-
-        # Get the device spec of the first place.
-        # device_spec: 'cpu' for cpu device and 'gpu:id' for gpu device,
-        # e.g. 'gpu:0', 'gpu:1', etc.
-        for op in block.ops:
-            if op.has_attr(self._op_device_key) and (
-                    op.attr(self._op_device_key) != ""):
-                first_device = op.attr(self._op_device_key)
-                break
-        assert first_device
-        first_device_type = first_device.split(":")[0]
-        assert first_device_type == "gpu"
-
-        # set op_device attr for lr-related ops
-        lrsched_role = int(self._op_role.LRSched)
-        for op in block.ops:
-            if not op.has_attr(self._op_device_key) or (
-                    op.attr(self._op_device_key) == ""):
-                if op.type == "sum":
-                    # For sum ops that compute the sum of @RENAMED@ vars
-                    for name in op.desc.input_arg_names():
-                        assert '@RENAME@' in name
-                    assert len(op.desc.output_arg_names()) == 1
-                    out_name = op.desc.output_arg_names()[0]
-                    post_op = self._find_post_op(block.ops, op, out_name)
-                    device = post_op.attr(self._op_device_key)
-                    assert device
-                    op._set_attr(self._op_device_key, device)
-                    continue
-                assert op.attr(self._op_role_key) == lrsched_role, (
-                    "Op whose op_device attr has not been set for pipeline"
-                    " must be of the role LRSched.")
-                op._set_attr(self._op_device_key, first_device)
+    def _is_forward_op(self, op):
+        """
+        Is the op_role attribute of a op is Forward.
+        """
+        assert self._op_role_key in op.attr_names
+        return int(op.attr(self._op_role_key)) == int(self._op_role.Forward)
+
+    def _is_backward_op(self, op):
+        """
+        Is the op_role attribute of a op is Backward.
+        """
+        assert self._op_role_key in op.attr_names
+        return int(op.attr(self._op_role_key)) == int(self._op_role.Backward)
+
+    def _is_loss_op(self, op):
+        """
+        Is the op_role attribute of a op is Loss.
+        """
+        assert self._op_role_key in op.attr_names
+        return int(op.attr(self._op_role_key)) == int(self._op_role.Loss)
+
+    def _is_optimize_op(self, op):
+        """
+        Is the op_role attribute of a op is Optimize.
+        """
+        assert self._op_role_key in op.attr_names
+        return int(op.attr(self._op_role_key)) == int(self._op_role.Optimize)
+
+    def _is_lrsched_op(self, op):
+        """
+        Is the op_role attribute of a op is LRSched.
+        """
+        assert self._op_role_key in op.attr_names
+        return int(op.attr(self._op_role_key)) == int(self._op_role.LRSched)
+
+    def _is_update_op(self, op):
+        """
+        Is the op updates the parameter using gradient.
+        """
+        return 'Param' in op.input_names and 'Grad' in op.input_names and (
+            "LearningRate" in op.input_names)
+
+    def _get_op_device_attr(self, op):
+        """
+        Get the op_device attribute of a op.
+        """
+        device = op.attr(self._op_device_key) \
+            if op.has_attr(self._op_device_key) else None
+        if device:
+            assert device[0:3] == 'gpu', "Now, only gpu devices are " \
+                "supported in pipeline parallemism."
+        return device
+
+    def _add_op_device_attr_for_op(self, op, idx, block):
+        """
+        Add op_device attrribute for ops that have not that attribute set.
+        We use "gpu:all" to represent the op should be put on all
+        sub-programs, such as lr-related ops. Note that: "gpu:all"
+        is only used by pipeline as an indicator.
+        """
+        lrsched_role = int(self._op_role.LRSched)
+        if op.attr(self._op_role_key) == lrsched_role:
+            # For LRSched ops, we should put them on all sub-programs to
+            # make sure each sub-program update the lr correctly
+            op._set_attr(self._op_device_key, "gpu:all")
+        elif op.type == "sum" and self._is_backward_op(op):
+            # For sum ops that compute the sum of @RENAMED@ vars
+            for name in op.desc.input_arg_names():
+                assert '@RENAME@' in name, \
+                    "The op must be sum used to accumulate renamed vars."
+            assert len(op.desc.output_arg_names()) == 1
+            out_name = op.desc.output_arg_names()[0]
+            post_op = self._find_post_op(block.ops, op, out_name)
+            assert post_op.has_attr('op_device'), \
+                "{} has no op_device attr for var {}".format(post_op.type,
+                                                             out_name)
+            device = post_op.attr(self._op_device_key)
+            assert device, "The post op must have op_device set."
+            op._set_attr(self._op_device_key, device)
+        elif (op.type == "cast" or
+              op.type == "scale") and self._is_backward_op(op):
+            prev_op = self._find_real_prev_op(block.ops, op,
+                                              op.desc.input("X")[0])
+            op._set_attr('op_device', prev_op.attr('op_device'))
+        elif self._is_loss_op(op):
+            # For loss * loss_scaling op added by AMP
+            offset = 1
+            while (not block.ops[idx + offset].has_attr(self._op_device_key) or
+                   not block.ops[idx + offset].attr(self._op_device_key)):
+                offset += 1
+            # assert block.ops[idx + 1].type == "fill_constant"
+            # assert block.ops[idx + 2].type == "elementwise_mul_grad"
+            # assert block.ops[idx + 3].type == "elementwise_add_grad"
+            # assert block.ops[idx + 4].type == "mean_grad"
+            # device = block.ops[idx + 4].attr(self._op_device_key)
+            device = block.ops[idx + offset].attr(self._op_device_key)
+            assert device, "Please put you program within device_guard scope."
+            # op._set_attr(self._op_device_key, device)
+            # block.ops[idx + 1]._set_attr(self._op_device_key, device)
+            # block.ops[idx + 2]._set_attr(self._op_device_key, device)
+            # block.ops[idx + 2]._set_attr(self._op_device_key, device)
+            for i in range(offset):
+                block.ops[idx + i]._set_attr(self._op_device_key, device)
+        elif self._is_optimize_op(op) and op.type == "check_finite_and_unscale":
+            #op._set_attr(self._op_device_key, "gpu:all")
+            op_role_var = op.attr(self._op_role_var_key)
+            param_name = op_role_var[0]
+            device = self._param_device_map[param_name]
+            op._set_attr(self._op_device_key, device)
+        elif self._is_optimize_op(op) and op.type == "cast":
+            # For fp16-->fp32 cast added by AMP
+            grad_name = op.output('Out')
+            assert len(grad_name) == 1
+            param_name = grad_name[0].strip(core.grad_var_suffix())
+            device = self._param_device_map[param_name]
+            op._set_attr(self._op_device_key, device)
+        elif self._is_gradient_clip_op(op) or self._is_regularization_op(op):
+            # For gradient clip and regularization ops, we set their op_device
+            # attribute to the device where their corresponding parameters on.
+            assert self._op_role_var_key in op.attr_names, "gradient_clip " \
+                "and regularization ops must have op_role_var attribute."
+            op_role_var = op.attr(self._op_role_var_key)
+            assert len(op_role_var) == 2, "op_role_var for gradient_clip " \
+                "regularization ops must have two elements."
+            param_name = op_role_var[0]
+            device = self._param_device_map[param_name]
+            # For sum op added by global gradient clip, it must be
+            # put on all devices
+            if (op.type == 'sum' or op.type == 'sqrt' or
+                    op.type == 'fill_constant' or
+                    op.type == 'elementwise_max' or
+                    op.type == 'elementwise_div'):
+                device = "gpu:all"
+            op._set_attr(self._op_device_key, device)
+        else:
+            other_known_ops = [
+                'update_loss_scaling', 'reduce_any', 'concat', 'sum'
+            ]
+            assert op.type in other_known_ops, "For other ops without " \
+                "op_device set, they must be one of {}, but it " \
+                "is {}".format(other_known_ops, op.type)
+            assert self._is_optimize_op(op)
+            op._set_attr(self._op_device_key, "gpu:all")
+
+    def _add_op_device_attr(self, block):
+        """
+        Add op_device attrribute for ops in block that have
+        not that attribute set.
+        """
+        for idx, op in enumerate(list(block.ops)):
+            if (op.type == "create_py_reader" or op.type == "read" or
+                    op.type == "create_double_buffer_reader"):
+                # Copy read related ops to all section to make them exit
+                # after each epoch.
+                # We use "gpu:all" to represent the op should be put on all
+                # sub-programs, such as lr-related ops. Note that: "gpu:all"
+                # is only used by pipeline as an indicator.
+                op._set_attr(self._op_device_key, "gpu:all")
+                continue
+            # op_device attribute has been set
+            if self._get_op_device_attr(op): continue
+            self._add_op_device_attr_for_op(op, idx, block)
 
     def _check_validation(self, block):
         """
-        Check whether ops in a block are all validate (i.e., the
-        op_device attribute has been set).
-        Then, return all device specifications in order.
+        Check whether ops in a block have the op_device attribute set.
+        Then, return all devices in order.
         """
-        device_specs = []
+        device_list = []
         for op in block.ops:
-            type = op.type
-            if not op._has_kernel(type):
+            if not op._has_kernel(op.type):
                 assert op.type == "conditional_block" and (
                     op.attr(self._op_role_key) == int(self._op_role.LRSched)), (
                     "Now, the only supported op without kernel is "

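Sketch under assumptions: the new _is_forward_op, _is_backward_op, _is_optimize_op and _is_lrsched_op helpers test the op_role attribute for equality with a single role value, whereas _is_loss_grad_op keeps testing individual bits, because the op that starts the backward pass carries a combined Loss and Backward role. The role constants below are assumptions modeled on Paddle's OpRole of this era, used only to show the difference:

BACKWARD, OPTIMIZE, LOSS = 0x0001, 0x0002, 0x0100  # assumed values

def is_optimize(op_role):    # equality test, as in the new _is_optimize_op
    return op_role == OPTIMIZE

def is_loss_grad(op_role):   # bit test, as in _is_loss_grad_op
    return bool(op_role & BACKWARD) and bool(op_role & LOSS)

assert is_loss_grad(LOSS | BACKWARD) and not is_optimize(LOSS | BACKWARD)
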
@@ -4165,15 +4363,16 @@ class PipelineOptimizer(object):
             assert op.has_attr(self._op_device_key), (
                 "op ({}) has no {} attribute.".format(op.type,
                                                       self._op_device_key))
-            dev_spec = op.attr(self._op_device_key)
-            assert dev_spec, ("op_device attribute for op "
-                              "{} has not been set.".format(op.type))
-            dev_type = dev_spec.split(':')[0]
+            device = op.attr(self._op_device_key)
+            assert device, ("op_device attribute for op "
+                            "{} has not been set.".format(op.type))
+            if device == "gpu:all": continue
+            dev_type = device.split(':')[0]
             assert dev_type == "gpu", ("Now only gpu devices are supported "
                                        "for pipeline parallelism.")
-            if not dev_spec in device_specs:
-                device_specs.append(dev_spec)
-        return device_specs
+            if not device in device_list:
+                device_list.append(device)
+        return device_list
 
     def _insert_sendrecv_ops_for_boundaries(self, block):
         """

@@ -4182,49 +4381,44 @@ class PipelineOptimizer(object):
         """
         extra_index = 0
 
-        # A map from var to device spec where op takes it as input,
+        # A map from var to device where op takes it as input,
         # avoiding multiple send and recv ops.
-        var_devspec = dict()
+        var_dev_map = dict()
 
         for index, op in enumerate(list(block.ops)):
-            # skips lr-related ops and vars, as we will process them later.
-            if int(op.attr(self._op_role_key)) & int(self._op_role.LRSched):
-                continue
-            # skips update ops and vars, as we will process them later.
-            if self._is_update_op(op): continue
-
-            cur_device_spec = op.attr(self._op_device_key)
+            cur_device = op.attr(self._op_device_key)
+            if cur_device == "gpu:all": continue
             for var_name in op.input_arg_names:
                 # i.e., lod_tensor_blocking_queue created by DataLoader,
                 # which only exists in startup program.
-                if not var_name in block.vars: continue
+                # if not var_name in block.vars: continue
                 var = block.var(var_name)
                 # skip data, because we will process it later
                 if var.is_data: continue
                 prev_op = self._find_real_prev_op(block.ops, op, var_name)
-                if prev_op is None:
-                    continue
-                prev_device_spec = prev_op.attr(self._op_device_key)
+                prev_device = prev_op.attr(self._op_device_key) \
+                    if prev_op else None
+                if not prev_device or prev_device == 'gpu:all': continue
 
-                if prev_device_spec != cur_device_spec:
-                    if var_name not in var_devspec:
-                        var_devspec[var_name] = []
-                    if cur_device_spec in var_devspec[var_name]: continue
-                    var_devspec[var_name].append(cur_device_spec)
+                if prev_device != cur_device:
+                    if var_name not in var_dev_map: var_dev_map[var_name] = []
+                    if cur_device in var_dev_map[var_name]: continue
+                    var_dev_map[var_name].append(cur_device)
 
                     op_role = op.all_attrs()[self._op_role_key]
                     var = block.vars[var_name]
-                    prev_device_index = int(prev_device_spec.split(':')[1])
-                    cur_device_index = int(cur_device_spec.split(':')[1])
+                    prev_device_index = int(prev_device.split(':')[1])
+                    cur_device_index = int(cur_device.split(':')[1])
                     block._insert_op(
                         index=index + extra_index,
                         type='send_v2',
                         inputs={'X': var},
                         attrs={
-                            self._op_device_key: prev_device_spec,
+                            self._op_device_key: prev_device,
                             self._op_role_key: op_role,
                             'use_calc_stream': True,
                             'peer': cur_device_index,
+                            'ring_id': self.ring_id,
                         })
                     extra_index += 1
                     block._insert_op(

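Illustrative sketch (not the Paddle API): the boundary pass above inserts a send_v2 / recv_v2 pair whenever a variable produced on one stage is consumed on another, and the 'peer' attributes point the two ops at each other's device index:

def boundary_ops(var_name, prev_device, cur_device):
    # sketch of the decision made per input var in the loop above
    if not prev_device or prev_device == "gpu:all" or prev_device == cur_device:
        return []
    prev_id = int(prev_device.split(':')[1])
    cur_id = int(cur_device.split(':')[1])
    return [
        {"type": "send_v2", "X": var_name, "peer": cur_id},     # runs on prev stage
        {"type": "recv_v2", "Out": var_name, "peer": prev_id},  # runs on cur stage
    ]

assert boundary_ops("fc_0.tmp_1", "gpu:0", "gpu:1")[0]["peer"] == 1
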
@@ -4234,23 +4428,28 @@ class PipelineOptimizer(object):
                         attrs={
                             'out_shape': var.shape,
                             'dtype': var.dtype,
-                            self._op_device_key: cur_device_spec,
+                            self._op_device_key: cur_device,
                             self._op_role_key: op_role,
                             'use_calc_stream': True,
                             'peer': prev_device_index,
+                            'ring_id': self.ring_id,
                         })
                     extra_index += 1
 
-    def _clear_gradients(self, main_block, dev_spec):
+    def _clear_gradients(self, main_block, param_names):
         """
         Clear gradients at the begining of each run of a minibatch.
         """
-        for param_name in self._param_device_map:
-            device = self._param_device_map[param_name]
-            if device != dev_spec: continue
+        # for param_name in self._param_device_map:
+        print("param_names:", param_names)
+        for param_name in param_names:
+            # device = self._param_device_map[param_name]
+            # if device != dev_spec: continue
             grad_name = self._append_grad_suffix(param_name)
-            if not main_block.has_var(grad_name): continue
-            grad_var = main_block.vars[grad_name]
+            # if not main_block.has_var(grad_name): continue
+            assert main_block.has_var(grad_name)
+            grad_var = main_block.var(grad_name)
             grad_var.persistable = True
             main_block._insert_op(
                 index=0,
                 type='fill_constant',

@@ -4260,21 +4459,20 @@ class PipelineOptimizer(object):
                     'shape': grad_var.shape,
                     'dtype': grad_var.dtype,
                     'value': float(0),
-                    self._op_device_key: device,
+                    # self._op_device_key: device,
                     # a trick to run this op once per mini-batch
                     self._op_role_key: self._op_role.Optimize.LRSched,
                 })
 
-    def _accumulate_gradients(self, block):
+    def _insert_loss_scale(self, block):
         """
         Accumulate the gradients generated in microbatch to the one in mini-batch.
         We also scale the loss corresponding to number of micro-batches as well.
         """
+        if self._num_microbatches == 1: return
         for index, op in reversed(tuple(enumerate(list(block.ops)))):
             offset = index
-            device = op.attr(self._op_device_key)
+            # device = op.attr(self._op_device_key)
 
             # Backward pass
             if self._is_loss_grad_op(op):
                 loss_grad_var = block.vars[op.output_arg_names[0]]
                 scale_factor = self._num_microbatches

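Worked example (not part of the commit): the scale op inserted just below multiplies the loss gradient by 1.0 / scale_factor, with scale_factor = self._num_microbatches, because gradients accumulated over the microbatches should equal the gradient of the mini-batch mean loss:

num_microbatches = 4
micro_grads = [1.0, 2.0, 3.0, 4.0]                          # per-microbatch gradients
accumulated = sum(g / num_microbatches for g in micro_grads)
assert accumulated == sum(micro_grads) / num_microbatches   # == 2.5
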
@@ -4285,36 +4483,151 @@ class PipelineOptimizer(object):
                     outputs={'Out': loss_grad_var},
                     attrs={
                         'scale': 1.0 / scale_factor,
-                        self._op_device_key: device,
+                        # self._op_device_key: device,
                         self._op_role_key: self._op_role.Backward
                     })
                 break
 
+    def _rename_gradient_var_name(self, block):
+        for index, op in enumerate(block.ops):
             if self._is_backward_op(op) and (
                     self._op_role_var_key in op.attr_names):
-                op_role_var = op.all_attrs()[self._op_role_var_key]
+                op_role_var = op.attr(self._op_role_var_key)
                 if len(op_role_var) == 0:
                     continue
                 assert len(op_role_var) % 2 == 0
                 offset = index
                 for i in range(0, len(op_role_var), 2):
                     grad_name = op_role_var[i + 1]
                     grad_var = block.vars[grad_name]
+                    new_grad_var_name = unique_name.generate(grad_name)
+                    new_var = self._create_var(block, grad_var,
+                                               new_grad_var_name)
+                    new_var.persistable = False
+                    self._rename_arg(op, grad_name, new_grad_var_name)
+
+    def _accumulate_gradients(self, block):
+        """
+        Accumulate the gradients generated in microbatch to the one in mini-batch.
+        """
+        first_optimize_op_index = None
+        for index, op in reversed(tuple(enumerate(list(block.ops)))):
+            # device = op.attr(self._op_device_key)
+            if not self._is_optimize_op(op) and not first_optimize_op_index:
+                first_optimize_op_index = index + 1
+                if block.ops[first_optimize_op_index].type == 'c_sync_comm_stream':
+                    block.ops[first_optimize_op_index]._set_attr(
+                        self._op_role_key, self._op_role.Backward)
+                    first_optimize_op_index += 1
+            if self._is_backward_op(op) and (
+                    self._op_role_var_key in op.attr_names):
+                op_role_var = op.attr(self._op_role_var_key)
+                if len(op_role_var) == 0:
+                    continue
+                assert len(op_role_var) % 2 == 0
+                for i in range(0, len(op_role_var), 2):
+                    offset = 0
+                    param_name = op_role_var[i]
+                    if not block.has_var(param_name): continue
+                    # clear gradient
+                    param_grad_name = self._append_grad_suffix(param_name)
+                    # if not main_block.has_var(grad_name): continue
+                    if not block.has_var(param_grad_name):
+                        self._create_var(block, block.vars[param_name],
+                                         param_grad_name)
+                    assert block.has_var(param_grad_name)
+                    param_grad_var = block.var(param_grad_name)
+                    param_grad_var.persistable = True
                     block._insert_op(
-                        index=offset + 1,
-                        type='sum',
-                        inputs={'X': [grad_var, new_var]},
-                        outputs={'Out': grad_var},
+                        index=first_optimize_op_index + offset,
+                        type='fill_constant',
+                        inputs={},
+                        outputs={'Out': [param_grad_var]},
                         attrs={
-                            self._op_device_key: device,
-                            self._op_role_key: self._op_role.Backward,
-                            self._op_role_var_key: op_role_var
+                            'shape': param_grad_var.shape,
+                            'dtype': param_grad_var.dtype,
+                            'value': float(0),
+                            # self._op_device_key: device,
+                            # a trick to run this op once per mini-batch
+                            self._op_role_key: self._op_role.Optimize.LRSched,
                         })
                     offset += 1
+                    grad_name = op_role_var[i + 1]
+                    # with _0 suffix
+                    grad_var = block.vars[grad_name]
+                    # without _0 suffix
+                    real_grad_name = grad_name[0:grad_name.find(
+                        '@GRAD')] + '@GRAD'
+                    real_grad_var = block.vars[real_grad_name]
+                    # without _0 suffix
+                    # new_grad_var_name = unique_name.generate(grad_name)
+                    # new_var = self._create_var(block, grad_var,
+                    #                            new_grad_var_name)
+                    # new_var.persistable = False
+                    # self._rename_arg(op, grad_name, new_grad_var_name)
+                    if not 'cast_fp16' in grad_name:
+                        block._insert_op(
+                            index=first_optimize_op_index + offset,
+                            type='sum',
+                            inputs={'X': [grad_var, real_grad_var]},
+                            outputs={'Out': real_grad_var},
+                            attrs={
+                                #self._op_device_key: device,
+                                self._op_role_key: self._op_role.Backward,
+                                #self._op_role_var_key: op_role_var
+                            })
+                        offset += 1
+                    else:
+                        grad_name = op_role_var[i + 1]
+                        # with _0 suffix
+                        grad_var = block.vars[grad_name]
+                        # without _0 suffix
+                        fp32_grad_var_name = param_name + core.grad_var_suffix()
+                        fp32_grad_var = block.vars[fp32_grad_var_name]
+                        fp32_grad_var.persistable = True
+                        cast_grad_var_name = unique_name.generate(
+                            fp32_grad_var_name)
+                        cast_var = self._create_var(block, grad_var,
+                                                    cast_grad_var_name)
+                        cast_var.persistable = False
+                        real_grad_name = grad_name[0:grad_name.find(
+                            '@GRAD')] + '@GRAD'
+                        real_grad_var = block.vars[real_grad_name]
+                        # without _0 suffix
+                        block._insert_op(
+                            index=first_optimize_op_index + offset,
+                            type='cast',
+                            inputs={'X': fp32_grad_var},
+                            outputs={'Out': cast_var},
+                            attrs={
+                                'in_dtype': fp32_grad_var.dtype,
+                                'out_dtype': cast_var.dtype,
+                                # self._op_device_key: device,
+                                self._op_role_key: self._op_role.Backward,
+                                # self._op_role_var_key: op_role_var
+                            })
+                        offset += 1
+                        block._insert_op(
+                            index=first_optimize_op_index + offset,
+                            type='sum',
+                            inputs={'X': [grad_var, cast_var]},
+                            outputs={'Out': real_grad_var},
+                            attrs={
+                                # self._op_device_key: device,
+                                self._op_role_key: self._op_role.Backward,
+                                # self._op_role_var_key: op_role_var
+                            })
+                        offset += 1
+                        block._insert_op(
+                            index=first_optimize_op_index + offset,
+                            type='cast',
+                            inputs={'X': real_grad_var},
+                            outputs={'Out': fp32_grad_var},
+                            attrs={
+                                'in_dtype': real_grad_var.dtype,
+                                'out_dtype': fp32_grad_var.dtype,
+                                # self._op_device_key: device,
+                                self._op_role_key: self._op_role.Backward,
+                                # self._op_role_var_key: op_role_var
+                            })
 
     def _add_sub_blocks(self, main_block, program_list):
         main_program = main_block.program

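Hedged numpy sketch (not the Paddle API): for a gradient whose name contains cast_fp16, the accumulation path above keeps a persistable fp32 master gradient and folds each microbatch's fp16 gradient into it with a cast, a sum, and a cast back:

import numpy as np

fp32_grad = np.zeros([2], dtype=np.float32)                # persistable accumulator (param@GRAD)
micro_grad_fp16 = np.array([0.5, 0.25], dtype=np.float16)  # microbatch grad with the _0 suffix

cast_var = fp32_grad.astype(np.float16)   # cast: fp32 accumulator -> fp16
summed = micro_grad_fp16 + cast_var       # sum: grad_var + cast_var
fp32_grad = summed.astype(np.float32)     # cast back: fp16 -> fp32 accumulator
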
@@ -4372,7 +4685,7 @@ class PipelineOptimizer(object):
             block = prog.block(0)
             for op in block.ops:
                 if op.type == "recv_v2" or op.type == "create_py_reader" or \
-                    op.type == "read":
+                    op.type == "read" or op.type == "update_loss_scaling":
                     continue
                 # We have processed lr related vars
                 if op.attr(self._op_role_key) == int(

@@ -4412,6 +4725,7 @@ class PipelineOptimizer(object):
                         # microbatch
                         self._op_role_key: self._op_role.LRSched,
                         'peer': read_dev_index,
+                        'ring_id': self.ring_id,
                     })
                 read_block._insert_op(
                     index=0,

@@ -4425,9 +4739,18 @@ class PipelineOptimizer(object):
                         # A trick to make the role LRSched to avoid copy every
                         # microbatch
                         self._op_role_key: self._op_role.LRSched,
-                        'peer': write_dev_index
+                        'peer': write_dev_index,
+                        'ring_id': self.ring_id,
                     })
 
+    def _is_gradient_clip_op(self, op):
+        return op.desc.has_attr("op_namescope") \
+            and op.desc.attr("op_namescope").startswith("/gradient_clip")
+
+    def _is_regularization_op(self, op):
+        return op.desc.has_attr("op_namescope") \
+            and op.desc.attr("op_namescope").startswith("/regularization")
+
     def minimize(self, loss, startup_program=None,

@@ -4438,17 +4761,29 @@ class PipelineOptimizer(object):
             startup_program = default_startup_program()
         optimize_ops, params_grads = self._optimizer.minimize(
             loss, startup_program, parameter_list, no_grad_set)
-        self._param_device_map = self._optimizer._param_device_map
-
-        # Step1: add default op_device attribute for regulization and clip ops
-        self._add_opdevice_attr_for_regularization_clip(main_block)
-
-        # Step2: add default op_device attribute for ops whose op_device
-        # attribute have not been set yet. Then check all ops have the
-        # op_device attribute.
-        self._add_default_opdevice_attr(main_block)
-
-        device_specs = self._check_validation(main_block)
+        self._param_device_map = self._origin_optimizer._param_device_map
+
+        assert main_block.program._pipeline_opt \
+            and 'local_rank' in main_block.program._pipeline_opt, \
+            'Please use pipeline with fleet.'
+        local_rank = main_block.program._pipeline_opt['local_rank']
+        self.use_sharding = False
+        if 'use_sharding' in main_block.program._pipeline_opt:
+            self.use_sharding = main_block.program._pipeline_opt['use_sharding']
+        self.ring_id = 0
+        if 'ring_id' in main_block.program._pipeline_opt:
+            self.ring_id = main_block.program._pipeline_opt['ring_id']
+        if main_block.program._pipeline_opt['global_rank'] == 0:
+            with open("startup_raw", 'w') as f:
+                f.writelines(str(startup_program))
+            with open("main_raw", 'w') as f:
+                f.writelines(str(main_block.program))
+
+        # Step1: add default op_device attribute for ops.
+        self._add_op_device_attr(main_block)
+        device_list = self._check_validation(main_block)
 
         def device_cmp(device1, device2):
             dev1_id = int(device1.split(':')[1])

@@ -4460,66 +4795,62 @@ class PipelineOptimizer(object):
             else:
                 return 0
 
-        sorted_device_spec = sorted(device_specs, key=cmp_to_key(device_cmp))
-        assert sorted_device_spec == device_specs, ("With pipeline "
-            "parallelism, you must use gpu devices one after another "
-            "in the order of their ids.")
+        sorted_device_list = sorted(device_list, key=cmp_to_key(device_cmp))
+        assert sorted_device_list == device_list, (
+            "With pipeline parallelism, you must use gpu devices one after "
+            "another in the order of their ids.")
 
-        # Step3: add send and recv ops between section boundaries
+        # Step2: add send and recv ops between section boundaries
         self._insert_sendrecv_ops_for_boundaries(main_block)
 
-        # Step4: split program into sections and add pairs of
+        # Step3: split program into sections and add pairs of
         # send and recv ops for data var.
         main_program = main_block.program
-        program_list = self._split_program(main_program, device_specs)
+        program_list = self._split_program(main_program, device_list)
         for p in program_list:
-            self._create_vars(p["program"].block(0), main_program.global_block())
+            self._create_vars(p["program"].block(0), main_block)
         self._insert_sendrecv_for_data_var(main_block, program_list,
-                                           startup_program, device_specs)
+                                           startup_program, device_list)
 
-        # Step5: Special Case: process persistable vars that exist in
+        # Step4: Special Case: process persistable vars that exist in
         # multiple sections
         self._process_persistable_vars_in_multi_sections(
             main_program, startup_program, program_list)
 
-        # Step6: Add sub blocks for section programs
+        # Step5: Add sub blocks for section programs
         self._add_sub_blocks(main_block, program_list)
 
-        assert (main_program._pipeline_opt and
-                isinstance(main_program._pipeline_opt, dict) and
-                'local_rank' in main_program._pipeline_opt), \
-            "You must use pipeline with fleet"
-        local_rank = main_program._pipeline_opt['local_rank'] % len(device_specs)
+        local_rank = main_program._pipeline_opt['local_rank'] % len(device_list)
 
         place_list = []
-        for dev_spec in device_specs:
-            dev_index = dev_spec.split(":")[1]
-            place_list.append(core.CUDAPlace(local_rank))
+        for dev in device_list:
+            dev_index = int(dev.split(":")[1])
+            place_list.append(core.CUDAPlace(dev_index))
 
-        # Step7: Split startup program
+        # Step6: Split startup program
         new_startup_program = self._split_startup_program(startup_program,
                                                           local_rank)
 
-        # Step8: clear gradients before each mini-batch and
-        # accumulate gradients during backward
-        self._clear_gradients(
-            program_list[local_rank]['program'].global_block(),
-            dev_spec=device_specs[local_rank])
-        self._accumulate_gradients(program_list[local_rank]['program']
-                                   .global_block())
-
         startup_program._pipeline_opt = {
             "startup_program": new_startup_program,
         }
+        real_block = program_list[local_rank]['program'].global_block()
+        self._insert_loss_scale(real_block)
+        if not self.use_sharding:
+            # Step7: clear gradients before each mini-batch and
+            # accumulate gradients during backward
+            param_list = []
+            for param, grad in params_grads:
+                if real_block.has_var(param): param_list.append(param)
+            #self._clear_gradients(real_block, param_list)
+            self._rename_gradient_var_name(real_block)
+            self._accumulate_gradients(real_block)
 
         place_id = int(os.getenv("FLAGS_selected_gpus", "0"))
         main_program._pipeline_opt = {
             "trainer": "PipelineTrainer",
             "device_worker": "Section",
-            "inner_parallelism": len(device_specs),
+            "inner_parallelism": len(device_list),
             "section_program": program_list[local_rank],
             "place": place_list[local_rank],
             "place_id": place_id,

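Sketch mirroring the helper defined inside minimize above: _check_validation returns devices in the order ops appear, and minimize asserts that this order is already ascending by device id, which device_cmp makes explicit:

from functools import cmp_to_key

def device_cmp(device1, device2):
    dev1_id = int(device1.split(':')[1])
    dev2_id = int(device2.split(':')[1])
    if dev1_id < dev2_id:
        return -1
    elif dev1_id > dev2_id:
        return 1
    else:
        return 0

device_list = ["gpu:0", "gpu:1", "gpu:2"]
assert sorted(device_list, key=cmp_to_key(device_cmp)) == device_list
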
@@ -5487,7 +5818,7 @@ class GradientMergeOptimizer(object):
 
     def _is_the_backward_op(self, op):
         op_maker = core.op_proto_and_checker_maker
-        backward = core.op_proto_and_checker_maker.OpRole.Backward
+        backward = core.op_proto_and_checker_maker.OpRole.Bcackward
         if op_maker.kOpRoleVarAttrName() in op.attr_names and \
                 int(op.all_attrs()[op_maker.kOpRoleAttrName()]) == int(backward):
             return True
