Unverified commit b0cca48e
Authored on Jan 25, 2022 by Haohongxiang; committed via GitHub on Jan 25, 2022
Parent: faf517b2

[Dygraph] Support param groups in grad_clip (#39175)

* support param groups in grad_clip
* update
* modify for review
Showing 6 changed files with 49 additions and 21 deletions (+49 -21)
python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py   +10 -7
python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py   +7  -0
python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py                         +4  -7
python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py                                   +8  -0
python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py                           +6  -7
python/paddle/fluid/tests/unittests/hybrid_parallel_pp_clip_grad.py                              +14 -0
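For context, the configuration this change enables is the one exercised by the new unit test added at the end of the diff (TestPPClipGradParamGroup): an optimizer whose parameters argument is a list of param-group dicts, combined with a grad_clip. The following is a minimal single-process sketch of that setup using only public Paddle APIs; the Linear model and tensor shapes are illustrative stand-ins, while the optimizer settings mirror the added test.

import paddle

# Minimal sketch: parameters passed as a list of param-group dicts
# together with a grad_clip. The Linear layer stands in for the MLP
# used by the actual unit tests.
model = paddle.nn.Linear(8, 8)

grad_clip = paddle.nn.ClipGradByGlobalNorm(0.5)
scheduler = paddle.optimizer.lr.PiecewiseDecay(
    boundaries=[2], values=[0.001, 0.002], verbose=True)
optimizer = paddle.optimizer.Momentum(
    learning_rate=scheduler,
    grad_clip=grad_clip,
    parameters=[{"params": model.parameters()}])

# Ordinary dygraph step; with this change, fleet's hybrid-parallel and
# sharding optimizer wrappers apply the clip for the param-group form too.
loss = model(paddle.randn([4, 8])).mean()
loss.backward()
optimizer.step()
optimizer.clear_grad()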
python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py  (+10 -7)

@@ -49,8 +49,6 @@ class HybridParallelClipGrad:
     @imperative_base.no_grad
     def _dygraph_clip(self, params_grads):
-        params_and_grads = []
         sum_square_dist_fp16 = []
         sum_square_dist_fp32 = []
         sum_square_not_dist_fp16 = []
@@ -153,15 +151,14 @@ class HybridParallelClipGrad:
             if g is None:
                 continue
             if getattr(p, 'need_clip', True) is False:
-                params_and_grads.append((p, g))
                 continue
             if p.dtype == paddle.float16:
-                new_grad = layers.elementwise_mul(x=g, y=clip_var_fp16)
+                g.scale_(clip_var_fp16)
             else:
-                new_grad = layers.elementwise_mul(x=g, y=clip_var)
-            params_and_grads.append((p, new_grad))
-        return params_and_grads
+                g.scale_(clip_var)
+            p._reset_grad_inplace_version(True)
+        return params_grads

     def __getattr__(self, item):
         return getattr(self._clip, item)
@@ -201,6 +198,12 @@ class HybridParallelOptimizer:
         else:
             self._inner_opt._grad_clip = HybridParallelClipGrad(
                 self._inner_opt._grad_clip, hcg)
+            if self._inner_opt._parameter_list and isinstance(
+                    self._inner_opt._parameter_list[0], dict):
+                for item in self._inner_opt._param_groups:
+                    if "grad_clip" in item.keys():
+                        item["grad_clip"] = HybridParallelClipGrad(
+                            self._inner_opt._grad_clip, hcg)

     @imperative_base.no_grad
     @framework.dygraph_only
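The hunks above make two changes. First, _dygraph_clip now scales each gradient in place (g.scale_(...) followed by p._reset_grad_inplace_version(True)) and returns params_grads unchanged, instead of rebuilding a (param, grad) list with layers.elementwise_mul. Second, when the inner optimizer was built from param-group dicts, every group that declares its own "grad_clip" is pointed at the hybrid-parallel wrapper as well. Below is a standalone sketch of that second pattern; the helper name propagate_grad_clip_wrapper is illustrative only and not part of Paddle's API.

# Illustrative helper only; not a Paddle API. It mirrors the loop added
# inside HybridParallelOptimizer.__init__.
def propagate_grad_clip_wrapper(inner_opt, wrapped_clip):
    """Point every param group that declares its own 'grad_clip' at the
    distributed wrapper (e.g. a HybridParallelClipGrad instance), so the
    global-norm clipping stays consistent across model-parallel ranks.

    inner_opt    -- a paddle.optimizer.Optimizer built with param-group dicts
    wrapped_clip -- the already-wrapped clip, e.g. HybridParallelClipGrad(...)
    """
    # Param groups are only in play when parameters were passed as dicts.
    if inner_opt._parameter_list and isinstance(inner_opt._parameter_list[0],
                                                dict):
        for group in inner_opt._param_groups:
            if "grad_clip" in group:
                group["grad_clip"] = wrapped_clip

In the actual diff this logic runs inline in HybridParallelOptimizer.__init__, immediately after self._inner_opt._grad_clip itself has been wrapped.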
python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py  (+7 -0)

@@ -109,6 +109,13 @@ class ShardingOptimizerStage2(Optimizer):
             self._optim._grad_clip = ShardingClipGrad(self._optim._grad_clip,
                                                       paddle.get_device(),
                                                       self.group)
+            if self._optim._parameter_list and isinstance(
+                    self._optim._parameter_list[0], dict):
+                for item in self._optim._param_groups:
+                    if "grad_clip" in item.keys():
+                        item["grad_clip"] = ShardingClipGrad(
+                            self._optim._grad_clip,
+                            paddle.get_device(), self.group)

         if offload:
             assert self._pfp16, "Only support offload strategy while using \'Adam\', \'AdamW\' and \'Momentum\' optimizer with AMP/Pure FP16"
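The same guard appears here for ShardingOptimizerStage2. It matters because Paddle optimizers accept a per-group "grad_clip" entry, so a parameter list like the illustrative one below would otherwise keep its plain ClipGradByGlobalNorm inside the group and miss the cross-rank norm handling that ShardingClipGrad provides. The layer names are hypothetical.

import paddle

# Hypothetical sub-layers standing in for parts of a real model.
encoder = paddle.nn.Linear(8, 8)
decoder = paddle.nn.Linear(8, 8)

param_groups = [
    {"params": encoder.parameters()},                    # uses the optimizer-level grad_clip
    {"params": decoder.parameters(),
     "grad_clip": paddle.nn.ClipGradByGlobalNorm(1.0)},  # group-local grad_clip
]
optimizer = paddle.optimizer.AdamW(learning_rate=0.001, parameters=param_groups)

# When such an optimizer is handed to ShardingOptimizerStage2, the loop added
# above replaces each group-local "grad_clip" with a ShardingClipGrad wrapper.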
python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py  (+4 -7)

@@ -57,8 +57,6 @@ class ShardingClipGrad:
     @imperative_base.no_grad
     def _dygraph_clip(self, params_grads):
-        params_and_grads = []
         sum_square_fp16 = []
         sum_square_fp32 = []
@@ -114,15 +112,14 @@ class ShardingClipGrad:
             if g is None:
                 continue
             if getattr(p, 'need_clip', True) is False:
-                params_and_grads.append((p, g))
                 continue
             if p.dtype == paddle.float16:
-                new_grad = layers.elementwise_mul(x=g, y=clip_var_fp16)
+                g.scale_(clip_var_fp16)
             else:
-                new_grad = layers.elementwise_mul(x=g, y=clip_var)
-            params_and_grads.append((p, new_grad))
-        return params_and_grads
+                g.scale_(clip_var)
+            p._reset_grad_inplace_version(True)
+        return params_grads

     def __getattr__(self, item):
         return getattr(self._clip, item)
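Both clip classes share the same _dygraph_clip rewrite: instead of building a fresh clipped gradient with layers.elementwise_mul and appending it to a new list, the existing gradient buffer is scaled in place, and p._reset_grad_inplace_version(True) is then called, which appears to keep dygraph's inplace-version tracking consistent after the write. A rough sketch of the old versus new tensor operation on a plain tensor, not the fleet internals; the clip_var value is just a stand-in for the computed clip coefficient.

import paddle

g = paddle.randn([4, 4])            # stands in for a parameter's gradient
clip_var = paddle.to_tensor(0.5)    # stands in for the computed clip coefficient

# Old style (removed above): allocate a new clipped tensor.
clipped = paddle.multiply(g, clip_var)

# New style: scale the same gradient buffer in place, no extra allocation,
# so the optimizer keeps seeing the original tensor object.
g.scale_(clip_var)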
python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py  (+8 -0)

@@ -159,10 +159,13 @@ def test_dp_stage2():
     mlp2 = MLP()
     mlp3 = MLP()
     mlp4 = MLP()
+    mlp5 = MLP()
     mlp1.set_state_dict(state_dict)
     mlp2.set_state_dict(state_dict)
     mlp3.set_state_dict(state_dict)
     mlp4.set_state_dict(state_dict)
+    mlp5.set_state_dict(state_dict)
     dp_params = train_mlp(
         mlp1, sharding_stage="dp", use_pure_fp16=False, opt_group=False)
     stage2_params = train_mlp(
@@ -181,6 +184,11 @@ def test_dp_stage2():
             rtol=1e-5,
             atol=1e-5)
+    stage2_params = train_mlp(
+        mlp2, sharding_stage=2, use_pure_fp16=False, opt_group=True)
+    for i in range(len(dp_params)):
+        np.testing.assert_allclose(
+            dp_params[i].numpy(), stage2_params[i].numpy(), rtol=1e-6)

     return
python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py  (+6 -7)

@@ -49,7 +49,7 @@ def train_mlp(model, offload=False):
     optimizer = ShardingOptimizerStage2(
         params=model.parameters(), optim=optimizer, offload=offload)
     model = ShardingStage2(
-        model, optimizer, buffer_max_size=2**21, accumulate_grads=True)
+        model, optimizer, buffer_max_size=2**21, accumulate_grads=False)

     train_reader = paddle.batch(
         reader_decorator(linear_size), batch_size=batch_size, drop_last=True)
@@ -98,12 +98,11 @@ def test_sharding_stage2_offload():
     mlp_offload_params = train_mlp(mlp_offload, offload=True)

     for i in range(len(mlp_params)):
-        for j in range(len(mlp_offload_params)):
-            if mlp_params[i].name == mlp_offload_params[j].name:
-                np.testing.assert_allclose(
-                    mlp_params[i].numpy(),
-                    mlp_offload_params[j].numpy(),
-                    rtol=1e-6)
+        np.testing.assert_allclose(
+            mlp_params[i].numpy(),
+            mlp_offload_params[i].numpy(),
+            rtol=5e-3,
+            atol=5e-3)

     return
python/paddle/fluid/tests/unittests/hybrid_parallel_pp_clip_grad.py  (+14 -0)

@@ -31,5 +31,19 @@ class TestPPClipGrad(TestDistPPTraning):
         return scheduler, optimizer


+class TestPPClipGradParamGroup(TestDistPPTraning):
+    def build_optimizer(self, model):
+        grad_clip = paddle.nn.ClipGradByGlobalNorm(0.5)
+        scheduler = paddle.optimizer.lr.PiecewiseDecay(
+            boundaries=[2], values=[0.001, 0.002], verbose=True)
+        optimizer = paddle.optimizer.Momentum(
+            learning_rate=scheduler,
+            grad_clip=grad_clip,
+            parameters=[{
+                "params": model.parameters()
+            }])
+        return scheduler, optimizer
+
+
 if __name__ == "__main__":
     unittest.main()