Commit c4d789af
Authored on Mar 15, 2021 by sandyhouse

update

Parent: 1de80a5e
Showing 4 changed files with 40 additions and 177 deletions (+40 -177):

    python/paddle/distributed/fleet/meta_optimizers/sharding/fp16_helper.py             +3   -1
    python/paddle/distributed/fleet/meta_optimizers/sharding/gradient_clip_helper.py    +1   -1
    python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py               +12  -157
    python/paddle/fluid/optimizer.py                                                    +24  -18
python/paddle/distributed/fleet/meta_optimizers/sharding/fp16_helper.py

@@ -81,7 +81,9 @@ class FP16Utils(object):
             if not FP16Utils.is_fp32_cast_op(block, op):
                 continue
             output_name = op.desc.output_arg_names()[0]
-            param_name = output_name.strip("@GRAD")
+            param_name = output_name.strip(
+                "@GRAD@MERGED") if "@MERGED" in output_name else output_name.strip(
+                    "@GRAD")
             if param_name not in shard.global_params:
                 raise ValueError("Output 'X' of cast_op must be a grad of"
                                  "model param, but {} is not a grad".format(
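Editor's note (not part of the commit): the change above relies on the convention that a parameter's gradient variable is named `<param>@GRAD` and its accumulated gradient `<param>@GRAD@MERGED`. The plain-Python sketch below, with hypothetical names, illustrates that mapping using explicit suffix checks; it is an illustration only, not the patch's helper.

    # Hypothetical sketch (not from the patch): map a gradient variable name back
    # to its parameter name, handling both the plain and the merged-gradient suffix.
    def grad_to_param_name(grad_var_name):
        for suffix in ("@GRAD@MERGED", "@GRAD"):
            if grad_var_name.endswith(suffix):
                return grad_var_name[:-len(suffix)]
        return grad_var_name

    assert grad_to_param_name("fc_0.w_0@GRAD") == "fc_0.w_0"
    assert grad_to_param_name("fc_0.w_0@GRAD@MERGED") == "fc_0.w_0"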

python/paddle/distributed/fleet/meta_optimizers/sharding/gradient_clip_helper.py

@@ -41,7 +41,7 @@ class GradientClipHelper(object):
             for input_name in op.desc.input_arg_names():
                 if input_name in deperated_vars:
                     deperate_op = True
-                param_name = input_name.strip("@GRAD")
+                param_name = input_name.strip("@GRAD@MERGED")
                 if shard.is_param(param_name) and \
                         not shard.has_param(param_name):
                     deperate_op = True

python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py

@@ -188,13 +188,6 @@ class ShardingOptimizer(MetaOptimizerBase):
         # pp_optimizer._rename_gradient_var_name(main_block)
         # crop ops
         for idx, op in reversed(list(enumerate(main_block.ops))):
-            # if op.type == 'fill_constant' and int(op.attr('op_role')) == 16:
-            #     out_name = op.output_arg_names[0]
-            #     if not 'GRAD' in out_name: continue
-            #     param_name = out_name.strip("@GRAD")
-            #     #if main_block.has_var(out_name): continue
-            #     if self._shard.has_param(param_name): continue
-            #     main_block._remove_op(idx)
             if is_update_op(op):
                 op_role_var = op.attr('op_role_var')
                 param_name = op_role_var[0]

@@ -208,13 +201,6 @@ class ShardingOptimizer(MetaOptimizerBase):
                     #if self._shard.has_param(param_name): continue
                     if in_name not in main_block.vars:
                         main_block._remove_op(idx)
-        #param_list = []
-        #for param_name, grad_name in params_grads:
-        #    if self._shard.has_param(param_name):
-        #        param_list.append(param_name)
-        #pp_optimizer._clear_gradients(main_block, param_list)
-        #accumulated_grad_names = pp_optimizer._accumulate_gradients(
-        #    main_block)
         # accumulated_grad_names = sorted(accumulated_grad_names)
         if self.pp_allreduce_in_optimize:
             print("persistable FP32 grad: ")

@@ -229,149 +215,7 @@ class ShardingOptimizer(MetaOptimizerBase):
                     self._shard,
                     core.op_proto_and_checker_maker.OpRole.Optimize,
                     use_calc_stream=True)
-                #if not self._shard.has_param(param_name): continue
-                ##if not main_block.has_var(grad_name): continue
-                #assert main_block.has_var(grad_name)
-                #grad_var = main_block.vars[grad_name]
-                #grad_var.persistable = True
-                #main_block._insert_op(
-                #    index=0,
-                #    type='fill_constant',
-                #    inputs={},
-                #    outputs={'Out': [grad_var]},
-                #    attrs={
-                #        'shape': grad_var.shape,
-                #        'dtype': grad_var.dtype,
-                #        'value': float(0),
-                #        #self._op_device_key: device,
-                #        # a trick to run this op once per mini-batch
-                #        'op_role': core.op_proto_and_checker_maker.OpRole.LRSched,
-                #    })
-        #def _create_var(block, ref_var, name):
-        #    """
-        #    Create a new var for block, which has the same type,
-        #    shape and dtype as ref_var, then rename it with the
-        #    name `name`.
-        #    """
-        #    new_var = block.create_var(
-        #        name=name,
-        #        shape=ref_var.shape,
-        #        dtype=ref_var.dtype,
-        #        type=ref_var.type,
-        #        lod_level=ref_var.lod_level,
-        #        persistable=ref_var.persistable,
-        #        is_data=ref_var.is_data,
-        #        need_check_feed=ref_var.desc.need_check_feed())
-        #    new_var.stop_gradient = ref_var.stop_gradient
-        #    return new_var
-        #def _rename_arg(op, old_name, new_name):
-        #    op_desc = op.desc
-        #    if isinstance(op_desc, tuple):
-        #        op_desc = op_desc[0]
-        #    op_desc._rename_input(old_name, new_name)
-        #    op_desc._rename_output(old_name, new_name)
-        #print("params_grads:", params_grads)
-        #for param_name, grad_name in params_grads:
-        #    if not self._shard.has_param(param_name): continue
-        #    #if not main_block.has_var(grad_name): continue
-        #    assert main_block.has_var(grad_name)
-        #    use_fp16 = False
-        #    fp16_grad_name = param_name + '.cast_fp16@GRAD'
-        #    if main_block.has_var(grad_name):
-        #        fp16_grad_var = main_block.vars[fp16_grad_name]
-        #        use_fp16 = True
-        #    grad_var = main_block.vars[grad_name]
-        #    if use_fp16:
-        #        cast_grad_var_name = paddle.fluid.unique_name.generate(
-        #            grad_name)
-        #        cast_var = _create_var(main_block, fp16_grad_var,
-        #                               cast_grad_var_name)
-        #        cast_var.persistable = False
-        #        main_block.append_op(
-        #            #index=offset + 1,
-        #            type='cast',
-        #            inputs={'X': grad_var},
-        #            outputs={'Out': cast_var},
-        #            attrs={
-        #                'in_dtype': grad_var.dtype,
-        #                'out_dtype': cast_var.dtype,
-        #                'op_role':
-        #                core.op_proto_and_checker_maker.OpRole.Backward,
-        #            })
-        #        #offset += 1
-        #        main_block.append_op(
-        #            #index=offset + 1,
-        #            type='sum',
-        #            inputs={'X': [fp16_grad_var, cast_var]},
-        #            outputs={'Out': fp16_grad_var},
-        #            attrs={
-        #                'op_role':
-        #                core.op_proto_and_checker_maker.OpRole.Backward,
-        #                'op_role_var': op_role_var
-        #            })
-        # for index, op in reversed(tuple(enumerate(list(main_block.ops)))):
-        #     offset = index
-        #     if is_backward_op(op) and (
-        #             'op_role_var' in op.attr_names):
-        #         op_role_var = op.all_attrs()['op_role_var']
-        #         if len(op_role_var) == 0:
-        #             continue
-        #         assert len(op_role_var) % 2 == 0
-        #         offset = index
-        #         for i in range(0, len(op_role_var), 2):
-        #             grad_name = op_role_var[i + 1]
-        #             if not main_block.has_var(grad_name): continue
-        #             grad_var = main_block.vars[grad_name]
-        #             if not 'cast_fp16' in grad_name:
-        #                 new_grad_var_name = paddle.fluid.unique_name.generate(grad_name)
-        #                 new_var = _create_var(main_block, grad_var,
-        #                                       new_grad_var_name)
-        #                 new_var.persistable = False
-        #                 _rename_arg(op, grad_name, new_grad_var_name)
-        #                 main_block._insert_op(
-        #                     index=offset + 1,
-        #                     type='sum',
-        #                     inputs={'X': [grad_var, new_var]},
-        #                     outputs={'Out': grad_var},
-        #                     attrs={
-        #                         'op_role': core.op_proto_and_checker_maker.OpRole.Backward,
-        #                         'op_role_var': op_role_var
-        #                     })
-        #                 offset += 1
-        #             if 'cast_fp16' in grad_name:
-        #                 param_name = op_role_var[i]
-        #                 fp32_grad_var_name = param_name + "@GRAD"
-        #                 fp32_grad_var = main_block.vars[grad_name]
-        #                 cast_grad_var_name = paddle.fluid.unique_name.generate(
-        #                     fp32_grad_var_name)
-        #                 cast_var = _create_var(main_block, grad_var,
-        #                                        cast_grad_var_name)
-        #                 cast_var.persistable = False
-        #                 main_block._insert_op(
-        #                     index=offset + 1,
-        #                     type='cast',
-        #                     inputs={'X': fp32_grad_var},
-        #                     outputs={'Out': cast_var},
-        #                     attrs={
-        #                         'in_dtype': fp32_grad_var.dtype,
-        #                         'out_dtype': cast_var.dtype,
-        #                         'op_role': core.op_proto_and_checker_maker.OpRole.Backward,
-        #                         # self._op_role_var_key: op_role_var
-        #                     })
-        #                 offset += 1
-        #                 main_block._insert_op(
-        #                     index=offset + 1,
-        #                     type='sum',
-        #                     inputs={'X': [grad_var, cast_var]},
-        #                     outputs={'Out': grad_var},
-        #                     attrs={
-        #                         'op_role': core.op_proto_and_checker_maker.OpRole.Backward,
-        #                         'op_role_var': op_role_var})
         main_block._sync_with_cpp()

         # TODO(wangxi): add optimize offload

@@ -699,6 +543,17 @@ class ShardingOptimizer(MetaOptimizerBase):
         for idx in range(len(self._segments)):
             assert len(self._segments[idx]._allreduce_vars) == 0

+        # fix the _end_idx for segments[-1] if pp is used.
+        new_end_idx = self._segments[-1]._end_idx
+        for idx in range(self._segments[-1]._end_idx - 1,
+                         self._segments[-1]._start_idx - 1, -1):
+            op = block.ops[idx]
+            if op.type == "fill_constant" or op.type == "sum":
+                if "MERGED" in op.output_arg_names[0]: new_end_idx = idx + 1
+            elif op.type == "cast":
+                if "@TMP" in op.output_arg_names[0]: new_end_idx = idx + 1
+        self._segments[-1]._end_idx = new_end_idx
+
         if self._segments[-1]._allreduce_vars:
             shard_allredue_vars = self._shard.filter_grads(self._segments[-1]
                                                            ._allreduce_vars)
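Editor's note (not part of the commit): the block added above rescans the last pipeline segment backwards and moves its `_end_idx` past any fill_constant or sum op that writes a `...MERGED` output, and past any cast op that writes a `...@TMP` output. The standalone sketch below mirrors that scan with hypothetical stand-in op objects instead of Paddle operators.

    from collections import namedtuple

    # Hypothetical stand-in for a Paddle operator: a type string plus its output names.
    Op = namedtuple("Op", ["type", "output_arg_names"])

    def fix_end_idx(ops, start_idx, end_idx):
        # Walk the segment backwards; move the end index just past a matching op.
        new_end_idx = end_idx
        for idx in range(end_idx - 1, start_idx - 1, -1):
            op = ops[idx]
            if op.type in ("fill_constant", "sum") and "MERGED" in op.output_arg_names[0]:
                new_end_idx = idx + 1
            elif op.type == "cast" and "@TMP" in op.output_arg_names[0]:
                new_end_idx = idx + 1
        return new_end_idx

    ops = [
        Op("scale", ["loss"]),
        Op("sum", ["fc_0.w_0@GRAD@MERGED"]),
        Op("elementwise_add", ["tmp_0"]),
    ]
    print(fix_end_idx(ops, 0, len(ops)))  # 2: just past the sum writing the MERGED var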

python/paddle/fluid/optimizer.py

@@ -4293,7 +4293,7 @@ class PipelineOptimizer(object):
             input_name = op.input_arg_names[0]
             output_name = op.output_arg_names[0]
             if '@Fetch' in output_name:
-                post_op = self._find_real_post_op(block.ops, op, output_name)
+                post_op = self._find_post_op(block.ops, op, output_name)
                 op._set_attr('op_device', post_op.attr('op_device'))
             else:
                 prev_op = self._find_real_prev_op(block.ops, op,

@@ -4849,6 +4849,9 @@ class PipelineOptimizer(object):
         Create a new merged gradient for each parameter and accumulate the
         corresponding gradient to it.
         """
+        merged_gradient_names = []
+        first_opt_op_idx = None
+
         for index, op in reversed(tuple(enumerate(list(block.ops)))):
             # remove the cast op of fp16 grad to fp32 grad
             if self._is_optimize_op(op) and op.type == 'cast':

@@ -4859,6 +4862,9 @@ class PipelineOptimizer(object):
                 block._remove_op(index)
                 continue

+            if self._is_backward_op(op) and not first_opt_op_idx:
+                first_opt_op_idx = index + 1
+
             if self._is_backward_op(op) and (
                     self._op_role_var_key in op.attr_names):
                 op_role_var = op.attr(self._op_role_var_key)

@@ -4868,7 +4874,7 @@ class PipelineOptimizer(object):
                 assert len(op_role_var) % 2 == 0
                 op._remove_attr(self._op_role_var_key)
                 for i in range(0, len(op_role_var), 2):
-                    offset = 1
+                    offset = 0
                     param_name = op_role_var[i]
                     assert block.has_var(param_name), ("parameter {} not in "

@@ -4886,7 +4892,7 @@ class PipelineOptimizer(object):
                     merged_param_grad_var = block.var(merged_param_grad_name)
                     merged_param_grad_var.persistable = True
                     block._insert_op(
-                        index=index + offset,
+                        index=first_opt_op_idx + offset,
                         type='fill_constant',
                         inputs={},
                         outputs={'Out': [merged_param_grad_var]},

@@ -4902,7 +4908,7 @@ class PipelineOptimizer(object):
                     grad_var = block.vars[grad_name]
                     if not 'cast_fp16' in grad_name:
                         block._insert_op(
-                            index=index + offset,
+                            index=first_opt_op_idx + offset,
                             type='sum',
                             inputs={'X': [grad_var, merged_param_grad_var]},
                             outputs={'Out': merged_param_grad_var},

@@ -4918,7 +4924,7 @@ class PipelineOptimizer(object):
                             cast_grad_var_name)
                         cast_grad_var.persistable = False
                         block._insert_op(
-                            index=index + offset,
+                            index=first_opt_op_idx + offset,
                             type='cast',
                             inputs={'X': grad_var},
                             outputs={'Out': cast_grad_var},

@@ -4929,7 +4935,7 @@ class PipelineOptimizer(object):
                             })
                         offset += 1
                         block._insert_op(
-                            index=index + offset,
+                            index=first_opt_op_idx + offset,
                             type='sum',
                             inputs={'X': [merged_param_grad_var, cast_grad_var]
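Editor's note (not part of the commit): the hunks above switch the insertion index of the generated fill_constant, cast and sum ops from `index + offset` to `first_opt_op_idx + offset`, grouping gradient accumulation at the first optimizer op recorded earlier in the method. The framework-free sketch below, with hypothetical names, illustrates the accumulation scheme those ops implement: zero-initialize a merged gradient once, cast each micro-batch gradient as needed, and sum it into the merged value.

    # Hypothetical, framework-free sketch of the accumulation scheme (not Paddle code):
    # the merged gradient is zero-initialized once (the inserted 'fill_constant' op),
    # fp16 gradients are cast up before accumulation (the 'cast' op), and every
    # micro-batch gradient is then added into the merged value (the 'sum' op).
    def accumulate_gradients(micro_batch_grads):
        merged = 0.0                      # stands in for fill_constant(value=0.0)
        for grad in micro_batch_grads:
            grad_fp32 = float(grad)       # stands in for the cast op on fp16 grads
            merged = merged + grad_fp32   # stands in for the inserted sum op
        return merged

    # Three micro-batch gradients for one parameter:
    print(accumulate_gradients([0.1, 0.25, -0.05]))  # ~0.3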

@@ -5705,10 +5711,10 @@ class RecomputeOptimizer(Optimizer):
         for output_var in output_vars:
             if output_var in need_offload_checkpoint_names:
-                assert len(
-                    output_vars
-                ) == 1, "chekpoint should be the only Output of a certain op, but [{}] is from [{}]".format(
-                    output_var, op)
+                # assert len(
+                #     output_vars
+                # ) == 1, "chekpoint should be the only Output of a certain op, but [{}] is from [{}]".format(
+                #     output_var, op)
                 if output_var in self.un_offload_checkpoint_names:
                     # insert sync op if last checkpoint has not been sync

@@ -5733,14 +5739,14 @@ class RecomputeOptimizer(Optimizer):
                             format(output_var))
                 # need to sync the last need to offload checkpoint before the last checkpoint as output op
                 if output_var == last_checkpoint:
-                    assert len(
-                        output_vars
-                    ) == 1, "chekpoint should be the only Output of a certain op, but [{}] is from [{}]".format(
-                        output_var, op)
-                    assert last_offload_checkpoint == self.sorted_checkpoint_names[
-                        -2], "the last offload chekpoint before [{}] is suppose to be [{}], but got [{}]".format(
-                        last_checkpoint, self.sorted_checkpoint_names[-2],
-                        last_offload_checkpoint)
+                    # assert len(
+                    #     output_vars
+                    # ) == 1, "chekpoint should be the only Output of a certain op, but [{}] is from [{}]".format(
+                    #     output_var, op)
+                    # assert last_offload_checkpoint == self.sorted_checkpoint_names[
+                    #     -2], "the last offload chekpoint before [{}] is suppose to be [{}], but got [{}]".format(
+                    #     last_checkpoint, self.sorted_checkpoint_names[-2],
+                    #     last_offload_checkpoint)
                     # sync if last checkpoint has not been sync
                     if self.checkpoint_usage_count_and_idx[last_offload_checkpoint]['idx'] == 0:
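Editor's note (not part of the commit): the two hunks above only comment out consistency assertions; the surviving logic decides whether a sync op is needed by checking the bookkeeping dict `checkpoint_usage_count_and_idx`. The sketch below builds a hypothetical, hand-written version of that dict purely to illustrate the check; it is not the RecomputeOptimizer implementation.

    # Hypothetical illustration of the sync check kept by the patch: an offload
    # checkpoint whose recorded 'idx' is still 0 has not been synchronized yet,
    # so a sync op would be inserted before it is consumed.
    checkpoint_usage_count_and_idx = {
        "ckpt_a": {"count": 2, "idx": 0},   # offloaded, no sync recorded yet
        "ckpt_b": {"count": 1, "idx": 7},   # already synced at op index 7
    }

    def needs_sync(last_offload_checkpoint):
        return checkpoint_usage_count_and_idx[last_offload_checkpoint]["idx"] == 0

    print(needs_sync("ckpt_a"))  # True  -> insert a sync op
    print(needs_sync("ckpt_b"))  # False -> already synced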