Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
03d8304f
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
03d8304f
编写于
10月 14, 2021
作者:
Y
Yuang Liu
提交者:
GitHub
10月 14, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[hybrid enhance] add flag to control the avg position for grad merge under pipeline mode (#36384)
上级
b857d755
变更
4
显示空白变更内容
内联
并排
Showing
4 changed file
with
263 addition
and
2 deletion
+263
-2
paddle/fluid/framework/distributed_strategy.proto
paddle/fluid/framework/distributed_strategy.proto
+4
-0
python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py
...e/distributed/fleet/meta_optimizers/sharding_optimizer.py
+61
-1
python/paddle/fluid/optimizer.py
python/paddle/fluid/optimizer.py
+3
-1
python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py
...uid/tests/unittests/test_fleet_sharding_meta_optimizer.py
+195
-0
未找到文件。
paddle/fluid/framework/distributed_strategy.proto
浏览文件 @
03d8304f
...
...
@@ -133,6 +133,10 @@ message GradientScaleConfig {
// Else if sum, the gradient will accumulated among multiple
// devices.
optional
string
scale_strategy
=
1
[
default
=
'avg'
];
// The avg_loss flag is used to determine the position of average
// If scale_gradient is False, it will avg the loss@Grad before grad merge.
// Otherwise, it will do grad merge firstly, then avg the grad after merging.
optional
bool
scale_gradient
=
2
[
default
=
false
];
}
message
AsyncConfig
{
...
...
python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py
浏览文件 @
03d8304f
...
...
@@ -18,7 +18,7 @@ import paddle.fluid as fluid
from
paddle.static
import
default_startup_program
,
device_guard
from
paddle.fluid
import
layers
from
.common
import
OpRole
,
OP_ROLE_VAR_KEY
,
CollectiveHelper
from
.common
import
OpRole
,
OP_ROLE_VAR_KEY
,
CollectiveHelper
,
OP_ROLE_KEY
from
.common
import
is_backward_op
,
is_optimizer_op
,
is_update_op
from
.meta_optimizer_base
import
MetaOptimizerBase
from
.sharding.shard
import
Shard
,
ProgramSegment
...
...
@@ -193,6 +193,14 @@ class ShardingOptimizer(MetaOptimizerBase):
else
:
gm_mode
=
"pp_gm"
gm_acc_step
=
strategy
.
pipeline_configs
[
'accumulate_steps'
]
gradient_scale_configs
=
strategy
.
gradient_scale_configs
assert
gradient_scale_configs
[
'scale_strategy'
]
==
'avg'
,
\
'For pipeline mode, the '
'gradient scale mode should '
\
'be "avg", but got {}'
.
format
(
gradient_scale_configs
[
'scale_strategy'
])
# Note (Yuang Liu): this avg_loss flag determines where to do the average op for grad merge.
# If True, will do sum firstly for gradient merge, then do scale by gm_acc_step.
# If False, will scale loss by gm_acc_step first, then do sum for gradient merge.
self
.
scale_gradient
=
gradient_scale_configs
[
'scale_gradient'
]
if
gm_acc_step
>
1
:
logger
.
info
(
"Gradient merge in [{}], acc step = [{}]"
.
format
(
gm_mode
,
gm_acc_step
))
...
...
@@ -241,6 +249,7 @@ class ShardingOptimizer(MetaOptimizerBase):
'global_ring_id'
:
3
,
'mp_degree'
:
self
.
mp_degree
,
'mp_rank'
:
global_rank
%
self
.
mp_degree
,
'scale_gradient'
:
self
.
scale_gradient
}
main_program
=
loss
.
block
.
program
main_program
.
_pipeline_opt
=
pipeline_opt
...
...
@@ -362,6 +371,8 @@ class ShardingOptimizer(MetaOptimizerBase):
main_block
,
strategy
=
strategy
,
shard
=
shard
)
len_of_ops
=
len
(
main_block
.
ops
)
if
self
.
scale_gradient
:
self
.
_avg_grad_merge_after_sum
(
main_block
,
accumulated_grad_names
)
first_optimize_op_index
=
get_first_optimize_op_idx
(
main_block
)
if
self
.
pp_allreduce_in_optimize
:
...
...
@@ -429,6 +440,55 @@ class ShardingOptimizer(MetaOptimizerBase):
# FIXME(wangxi): if fp16_allreduce, put cast fp16->fp32 to there?
def
_avg_grad_merge_after_sum
(
self
,
main_block
,
accumulated_grad_names
):
if
self
.
user_defined_strategy
.
amp
and
\
self
.
user_defined_strategy
.
amp_configs
[
'use_dynamic_loss_scaling'
]:
# For AMP, if using dynamic loss scaling the avg
# operation can be simple done by modify the LossScaling op.
for
idx
,
op
in
enumerate
(
main_block
.
ops
):
if
op
.
type
==
'check_finite_and_unscale'
:
loss_scale_name
=
op
.
input
(
'Scale'
)[
0
]
loss_scaling_var
=
main_block
.
var
(
loss_scale_name
)
loss_scale_tmp_var_name
=
loss_scale_name
+
'@TMP'
loss_scale_tmp_var
=
main_block
.
create_var
(
name
=
loss_scale_tmp_var_name
,
shape
=
loss_scaling_var
.
shape
,
dtype
=
loss_scaling_var
.
dtype
)
main_block
.
_insert_op_without_sync
(
idx
,
type
=
'scale'
,
inputs
=
{
'X'
:
loss_scaling_var
},
outputs
=
{
'Out'
:
loss_scale_tmp_var
},
attrs
=
{
'scale'
:
self
.
_gradient_merge_acc_step
,
'bias'
:
0.0
,
'bias_after_scale'
:
False
,
OP_ROLE_KEY
:
OpRole
.
Optimize
})
op
.
_rename_input
(
loss_scale_name
,
loss_scale_tmp_var_name
)
break
else
:
# For pp, do the avg operation for gradient merge after merging
# the gradient to meet the logic for gradient merge under pure dp.
tmp_first_opt_idx
=
None
for
idx
,
op
in
enumerate
(
main_block
.
ops
):
if
is_optimizer_op
(
op
)
and
op
.
type
!=
'c_sync_comm_stream'
:
tmp_first_opt_idx
=
idx
break
assert
tmp_first_opt_idx
is
not
None
,
'Occurs some errors, no optimize ops'
for
grad
in
accumulated_grad_names
:
main_block
.
_insert_op_without_sync
(
tmp_first_opt_idx
,
type
=
'scale'
,
inputs
=
{
'X'
:
grad
},
outputs
=
{
'Out'
:
grad
},
attrs
=
{
'scale'
:
1.0
/
self
.
_gradient_merge_acc_step
,
'bias'
:
0.0
,
'bias_after_scale'
:
False
,
OP_ROLE_KEY
:
OpRole
.
Optimize
})
def
_adapt_amp_clip_without_sharding
(
self
):
# if not use sharding, adapt amp/clip, for remain parallelism.
# cast --> amp --> clip --> opt
...
...
python/paddle/fluid/optimizer.py
浏览文件 @
03d8304f
...
...
@@ -5820,6 +5820,7 @@ class PipelineOptimizer(object):
self
.
global_ring_id
=
pipeline_opt
[
'global_ring_id'
]
self
.
mp_degree
=
pipeline_opt
[
'mp_degree'
]
self
.
mp_rank
=
pipeline_opt
[
'mp_rank'
]
self
.
scale_gradient
=
pipeline_opt
.
get
(
'scale_gradient'
,
False
)
assert
self
.
mp_degree
>=
1
assert
0
<=
self
.
mp_rank
<
self
.
mp_degree
...
...
@@ -5886,6 +5887,7 @@ class PipelineOptimizer(object):
"startup_program"
:
new_startup_program
,
}
real_block
=
program_list
[
self
.
local_rank
].
global_block
()
if
not
self
.
scale_gradient
:
self
.
_insert_loss_scale
(
real_block
)
if
not
self
.
use_sharding
:
# Step7: clear gradients before each mini-batch and
...
...
python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py
浏览文件 @
03d8304f
...
...
@@ -1272,6 +1272,201 @@ class TestFleetShardingHybridOptimizer(TestFleetMetaOptimizer):
self
.
assertEqual
(
dp_group_waiting_ports
,
[
'127.0.0.1:36002'
])
def
test_hybrid_with_pp_dp_amp_with_gradient_fuse_and_avg_after_sum
(
self
):
train_prog
,
startup_prog
=
paddle
.
fluid
.
Program
(),
paddle
.
fluid
.
Program
(
)
avg_cost
,
strategy
=
self
.
pp_net
(
train_prog
,
startup_prog
)
strategy
.
amp
=
True
strategy
.
amp_configs
=
{
'custom_black_varnames'
:
[
'fc_6.b_0'
],
}
strategy
.
sharding
=
True
strategy
.
sharding_configs
=
{
"sharding_degree"
:
1
,
"mp_degree"
:
1
,
"pp_degree"
:
2
,
"dp_degree"
:
2
,
}
strategy
.
pipeline
=
True
strategy
.
pipeline_configs
=
{
"schedule_mode"
:
"1F1B"
,
"micro_batch_size"
:
2
,
"accumulate_steps"
:
4
}
strategy
.
gradient_scale_configs
=
{
'scale_strategy'
:
'avg'
,
'scale_gradient'
:
True
}
strategy
.
fuse_grad_merge
=
True
self
.
optimizer
(
avg_cost
,
strategy
,
train_prog
,
startup_prog
)
train_prog
=
train_prog
.
_pipeline_opt
[
'section_program'
]
startup_prog
=
startup_prog
.
_pipeline_opt
[
'startup_program'
]
startup_prog_ops
=
startup_prog
.
global_block
().
ops
main_prog_ops
=
train_prog
.
global_block
().
ops
# check program
startup_prog_op_types
=
[
op
.
type
for
op
in
startup_prog_ops
]
main_prog_op_types
=
[
op
.
type
for
op
in
main_prog_ops
]
self
.
assertEqual
(
startup_prog_op_types
,
[
'uniform_random'
,
'fill_constant'
,
'uniform_random'
,
'fill_constant'
,
'uniform_random'
,
'fill_constant'
,
'uniform_random'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'c_gen_nccl_id'
,
'c_comm_init'
,
'c_gen_nccl_id'
,
'c_comm_init'
,
'c_gen_nccl_id'
,
'c_comm_init'
,
'c_gen_nccl_id'
,
'c_comm_init'
,
'c_broadcast'
,
'c_broadcast'
,
'c_broadcast'
,
'c_broadcast'
,
'c_broadcast'
,
'c_broadcast'
,
'c_broadcast'
,
'c_broadcast'
])
self
.
assertEqual
(
main_prog_op_types
,
[
'recv_v2'
,
'cast'
,
'mul'
,
'cast'
,
'elementwise_add'
,
'tanh'
,
'cast'
,
'mul'
,
'cast'
,
'elementwise_add'
,
'tanh'
,
'cast'
,
'mul'
,
'cast'
,
'elementwise_add'
,
'tanh'
,
'cast'
,
'mul'
,
'cast'
,
'elementwise_add'
,
'softmax'
,
'cross_entropy2'
,
'mean'
,
'elementwise_mul'
,
'coalesce_tensor'
,
'coalesce_tensor'
,
'coalesce_tensor'
,
'coalesce_tensor'
,
'fill_constant'
,
'elementwise_mul_grad'
,
'mean_grad'
,
'cross_entropy_grad2'
,
'softmax_grad'
,
'elementwise_add_grad'
,
'cast'
,
'mul_grad'
,
'tanh_grad'
,
'elementwise_add_grad'
,
'mul_grad'
,
'tanh_grad'
,
'elementwise_add_grad'
,
'mul_grad'
,
'tanh_grad'
,
'elementwise_add_grad'
,
'mul_grad'
,
'c_sync_calc_stream'
,
'send_v2'
,
'cast'
,
'sum'
,
'sum'
,
'c_allreduce_sum'
,
'c_allreduce_sum'
,
'c_sync_comm_stream'
,
'scale'
,
'check_finite_and_unscale'
,
'cast'
,
'c_allreduce_max'
,
'cast'
,
'update_loss_scaling'
,
'momentum'
,
'momentum'
,
'momentum'
,
'momentum'
,
'momentum'
,
'momentum'
,
'momentum'
,
'momentum'
])
def
test_hybrid_with_pp_dp_with_gradient_fuse_and_avg_after_sum
(
self
):
train_prog
,
startup_prog
=
paddle
.
fluid
.
Program
(),
paddle
.
fluid
.
Program
(
)
avg_cost
,
strategy
=
self
.
pp_net
(
train_prog
,
startup_prog
)
strategy
.
sharding
=
True
strategy
.
sharding_configs
=
{
"sharding_degree"
:
1
,
"mp_degree"
:
1
,
"pp_degree"
:
2
,
"dp_degree"
:
2
,
}
strategy
.
pipeline
=
True
strategy
.
pipeline_configs
=
{
"schedule_mode"
:
"1F1B"
,
"micro_batch_size"
:
2
,
"accumulate_steps"
:
4
}
strategy
.
gradient_scale_configs
=
{
'scale_strategy'
:
'avg'
,
'scale_gradient'
:
True
}
strategy
.
fuse_grad_merge
=
True
self
.
optimizer
(
avg_cost
,
strategy
,
train_prog
,
startup_prog
)
train_prog
=
train_prog
.
_pipeline_opt
[
'section_program'
]
startup_prog
=
startup_prog
.
_pipeline_opt
[
'startup_program'
]
startup_prog_ops
=
startup_prog
.
global_block
().
ops
main_prog_ops
=
train_prog
.
global_block
().
ops
# check program
startup_prog_op_types
=
[
op
.
type
for
op
in
startup_prog_ops
]
main_prog_op_types
=
[
op
.
type
for
op
in
main_prog_ops
]
self
.
assertEqual
(
startup_prog_op_types
,
[
'uniform_random'
,
'fill_constant'
,
'uniform_random'
,
'fill_constant'
,
'uniform_random'
,
'fill_constant'
,
'uniform_random'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'c_gen_nccl_id'
,
'c_comm_init'
,
'c_gen_nccl_id'
,
'c_comm_init'
,
'c_gen_nccl_id'
,
'c_comm_init'
,
'c_gen_nccl_id'
,
'c_comm_init'
,
'c_broadcast'
,
'c_broadcast'
,
'c_broadcast'
,
'c_broadcast'
,
'c_broadcast'
,
'c_broadcast'
,
'c_broadcast'
,
'c_broadcast'
])
self
.
assertEqual
(
main_prog_op_types
,
[
'recv_v2'
,
'mul'
,
'elementwise_add'
,
'tanh'
,
'mul'
,
'elementwise_add'
,
'tanh'
,
'mul'
,
'elementwise_add'
,
'tanh'
,
'mul'
,
'elementwise_add'
,
'softmax'
,
'cross_entropy2'
,
'mean'
,
'coalesce_tensor'
,
'coalesce_tensor'
,
'fill_constant'
,
'mean_grad'
,
'cross_entropy_grad2'
,
'softmax_grad'
,
'elementwise_add_grad'
,
'mul_grad'
,
'tanh_grad'
,
'elementwise_add_grad'
,
'mul_grad'
,
'tanh_grad'
,
'elementwise_add_grad'
,
'mul_grad'
,
'tanh_grad'
,
'elementwise_add_grad'
,
'mul_grad'
,
'c_sync_calc_stream'
,
'send_v2'
,
'sum'
,
'c_allreduce_sum'
,
'c_sync_comm_stream'
,
'scale'
,
'momentum'
,
'momentum'
,
'momentum'
,
'momentum'
,
'momentum'
,
'momentum'
,
'momentum'
,
'momentum'
])
def
test_hybrid_with_pp_dp_with_amp_no_dynamic_gradient_fuse_and_avg_after_sum
(
self
):
train_prog
,
startup_prog
=
paddle
.
fluid
.
Program
(),
paddle
.
fluid
.
Program
(
)
avg_cost
,
strategy
=
self
.
pp_net
(
train_prog
,
startup_prog
)
strategy
.
sharding
=
True
strategy
.
sharding_configs
=
{
"sharding_degree"
:
1
,
"mp_degree"
:
1
,
"pp_degree"
:
2
,
"dp_degree"
:
2
,
}
strategy
.
amp
=
True
strategy
.
amp_configs
=
{
'custom_black_varnames'
:
[
'fc_6.b_0'
],
'use_dynamic_loss_scaling'
:
False
}
strategy
.
pipeline
=
True
strategy
.
pipeline_configs
=
{
"schedule_mode"
:
"1F1B"
,
"micro_batch_size"
:
2
,
"accumulate_steps"
:
4
}
strategy
.
gradient_scale_configs
=
{
'scale_strategy'
:
'avg'
,
'scale_gradient'
:
True
}
strategy
.
fuse_grad_merge
=
True
self
.
optimizer
(
avg_cost
,
strategy
,
train_prog
,
startup_prog
)
train_prog
=
train_prog
.
_pipeline_opt
[
'section_program'
]
startup_prog
=
startup_prog
.
_pipeline_opt
[
'startup_program'
]
startup_prog_ops
=
startup_prog
.
global_block
().
ops
main_prog_ops
=
train_prog
.
global_block
().
ops
# check program
startup_prog_op_types
=
[
op
.
type
for
op
in
startup_prog_ops
]
main_prog_op_types
=
[
op
.
type
for
op
in
main_prog_ops
]
self
.
assertEqual
(
startup_prog_op_types
,
[
'uniform_random'
,
'fill_constant'
,
'uniform_random'
,
'fill_constant'
,
'uniform_random'
,
'fill_constant'
,
'uniform_random'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'c_gen_nccl_id'
,
'c_comm_init'
,
'c_gen_nccl_id'
,
'c_comm_init'
,
'c_gen_nccl_id'
,
'c_comm_init'
,
'c_gen_nccl_id'
,
'c_comm_init'
,
'c_broadcast'
,
'c_broadcast'
,
'c_broadcast'
,
'c_broadcast'
,
'c_broadcast'
,
'c_broadcast'
,
'c_broadcast'
,
'c_broadcast'
])
self
.
assertEqual
(
main_prog_op_types
,
[
'recv_v2'
,
'cast'
,
'mul'
,
'cast'
,
'elementwise_add'
,
'tanh'
,
'cast'
,
'mul'
,
'cast'
,
'elementwise_add'
,
'tanh'
,
'cast'
,
'mul'
,
'cast'
,
'elementwise_add'
,
'tanh'
,
'cast'
,
'mul'
,
'cast'
,
'elementwise_add'
,
'softmax'
,
'cross_entropy2'
,
'mean'
,
'elementwise_mul'
,
'coalesce_tensor'
,
'coalesce_tensor'
,
'coalesce_tensor'
,
'coalesce_tensor'
,
'fill_constant'
,
'elementwise_mul_grad'
,
'mean_grad'
,
'cross_entropy_grad2'
,
'softmax_grad'
,
'elementwise_add_grad'
,
'cast'
,
'mul_grad'
,
'tanh_grad'
,
'elementwise_add_grad'
,
'mul_grad'
,
'tanh_grad'
,
'elementwise_add_grad'
,
'mul_grad'
,
'tanh_grad'
,
'elementwise_add_grad'
,
'mul_grad'
,
'c_sync_calc_stream'
,
'send_v2'
,
'cast'
,
'sum'
,
'sum'
,
'c_allreduce_sum'
,
'c_allreduce_sum'
,
'c_sync_comm_stream'
,
'scale'
,
'scale'
,
'check_finite_and_unscale'
,
'momentum'
,
'momentum'
,
'momentum'
,
'momentum'
,
'momentum'
,
'momentum'
,
'momentum'
,
'momentum'
])
if
__name__
==
"__main__"
:
unittest
.
main
()
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录