Crayon鑫 / Paddle · forked from PaddlePaddle / Paddle
Commit ffd8adca (unverified)
Authored on Aug 08, 2022 by Haohongxiang; committed via GitHub on Aug 08, 2022

fix_bugs_of_sharding (#44982)

Parent: 031debb7
Showing 4 changed files with 81 additions and 27 deletions (+81 -27)
python/paddle/distributed/fleet/base/fleet_base.py                        +2  -3
python/paddle/distributed/sharding/group_sharded.py                       +1  -1
python/paddle/fluid/tests/unittests/dygraph_group_sharded_api.py          +42 -11
python/paddle/fluid/tests/unittests/dygraph_group_sharded_api_eager.py    +36 -12
python/paddle/distributed/fleet/base/fleet_base.py

@@ -1856,9 +1856,8 @@ class Fleet(object):
                                               group=None)
             self._found_inf = is_found_inf.numpy()[0]
 
-        # Only tensor_parallel and pipeline_parallel need to modify scaler
-        if self._hcg.get_parallel_mode() in (ParallelMode.TENSOR_PARALLEL,
-                                             ParallelMode.PIPELINE_PARALLEL):
+        # Only data_parallel doesn't need to modify scaler
+        if self._hcg.get_parallel_mode() is not ParallelMode.DATA_PARALLEL:
             scaler._unscale = MethodType(unscale_method, scaler)
 
         return scaler
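Note: this change widens the condition for patching the scaler. Instead of listing TENSOR_PARALLEL and PIPELINE_PARALLEL explicitly, every parallel mode other than pure DATA_PARALLEL (so sharding as well) now gets the hybrid-parallel `unscale_method` bound onto the scaler. The snippet below is a minimal, self-contained sketch of the `types.MethodType` rebinding pattern used on the unchanged last lines of the hunk; `Scaler` and `custom_unscale` are hypothetical stand-ins, not Paddle classes.

```python
# Minimal sketch of per-instance method rebinding with types.MethodType.
# `Scaler` and `custom_unscale` are hypothetical stand-ins for
# paddle.amp.GradScaler and Fleet's unscale_method.
from types import MethodType


class Scaler:

    def _unscale(self, optimizer):
        print("default unscale")


def custom_unscale(self, optimizer):
    # In Fleet this is where the found-inf flag would be all-reduced
    # across ranks before unscaling; here we only mark that the
    # override ran.
    print("parallel-aware unscale")


scaler = Scaler()
scaler._unscale = MethodType(custom_unscale, scaler)  # bound to this instance only
scaler._unscale(optimizer=None)                       # prints "parallel-aware unscale"
```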
python/paddle/distributed/sharding/group_sharded.py

@@ -159,7 +159,7 @@ def group_sharded_parallel(model,
                                        sync_comm=sync_comm)
     else:
         raise ValueError("Please enter the correct level.")
-    if params_fp16 and isinstance(scaler, paddle.amp.GradScaler):
+    if isinstance(scaler, paddle.amp.GradScaler):
         if in_dygraph_mode():
             scaler = GroupShardedScaler(scaler)
         else:
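Dropping the `params_fp16 and` guard means the scaler is wrapped whenever a `paddle.amp.GradScaler` is passed in, even when the parameters stay FP32 (the multi-precision / O1 case), so gradient scaling keeps working under sharding. The following is a hedged usage sketch, assuming a multi-GPU run launched via `python -m paddle.distributed.launch`; the Linear model is a placeholder, not the test's MLP.

```python
# Usage sketch for group_sharded_parallel with a GradScaler (assumes a
# multi-GPU environment launched via `python -m paddle.distributed.launch`).
import paddle
from paddle.distributed import init_parallel_env
from paddle.distributed.sharding import group_sharded_parallel

init_parallel_env()

model = paddle.nn.Linear(1000, 1000)                     # placeholder network
optimizer = paddle.optimizer.Momentum(parameters=model.parameters(),
                                      learning_rate=0.001,
                                      multi_precision=True)
model = paddle.amp.decorate(models=model, level='O1', save_dtype='float32')
scaler = paddle.amp.GradScaler(init_loss_scaling=32768)

# After this fix the scaler is wrapped for sharding even though the
# parameters are still FP32 (O1 / multi-precision training).
model, optimizer, scaler = group_sharded_parallel(model=model,
                                                  optimizer=optimizer,
                                                  level="os_g",
                                                  scaler=scaler)
```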
python/paddle/fluid/tests/unittests/dygraph_group_sharded_api.py

@@ -61,7 +61,7 @@ def reader_decorator(linear_size=1000):
     return __reader__
 
 
-def optimizer_setting(model, use_pure_fp16, opt_group=False):
+def optimizer_setting(model, use_multi_precision, opt_group=False):
     clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
     optimizer = paddle.optimizer.Momentum(
         parameters=[{
@@ -70,16 +70,23 @@ def optimizer_setting(model, use_pure_fp16, opt_group=False):
         learning_rate=0.001,
         weight_decay=0.00001,
         grad_clip=clip,
-        multi_precision=use_pure_fp16)
+        multi_precision=use_multi_precision)
 
     return optimizer
 
 
-def train_mlp(model, shard_level, use_pure_fp16, output_dir):
+def train_mlp(model,
+              shard_level,
+              use_multi_precision,
+              output_dir,
+              amp_level='O1'):
     group = paddle.distributed.new_group([0, 1])
-    optimizer = optimizer_setting(model=model, use_pure_fp16=use_pure_fp16)
-    model = paddle.amp.decorate(models=model, level='O2', save_dtype='float32')
+    optimizer = optimizer_setting(model=model,
+                                  use_multi_precision=use_multi_precision)
+    model = paddle.amp.decorate(models=model,
+                                level=amp_level,
+                                save_dtype='float32')
 
     scaler = paddle.amp.GradScaler(init_loss_scaling=32768)
     model, optimizer, scaler = group_sharded_parallel(model=model,
@@ -104,13 +111,13 @@ def train_mlp(model, shard_level, use_pure_fp16, output_dir):
             img, label = data
             label.stop_gradient = True
             img.stop_gradient = True
 
-            with paddle.amp.auto_cast(True, level='O2'):
+            with paddle.amp.auto_cast(True, level=amp_level):
                 out = model(img)
                 loss = paddle.nn.functional.cross_entropy(input=out,
                                                           label=label)
 
             avg_loss = paddle.mean(x=loss.cast(dtype=paddle.float32))
-            if not use_pure_fp16:
+            if not use_multi_precision:
                 avg_loss.backward()
                 optimizer.step()
             else:
@@ -135,12 +142,36 @@ def test_sharding_api():
     # fp16
     stage2_params = train_mlp(mlp1,
                               shard_level="os_g",
-                              use_pure_fp16=True,
-                              output_dir=output_dir)
+                              use_multi_precision=True,
+                              output_dir=output_dir,
+                              amp_level='O2')
     stage3_params = train_mlp(mlp2,
                               shard_level="p_g_os",
-                              use_pure_fp16=True,
-                              output_dir=output_dir)
+                              use_multi_precision=True,
+                              output_dir=output_dir,
+                              amp_level='O2')
+    for i in range(len(stage3_params)):
+        np.testing.assert_allclose(stage2_params[i].numpy(),
+                                   stage3_params[i].numpy(),
+                                   rtol=1e-4,
+                                   atol=1e-3)
+
+    # AMP
+    mlp3, mlp4 = MLP(), MLP()
+    mlp3.set_state_dict(state_dict)
+    mlp4.set_state_dict(state_dict)
+    stage2_params = train_mlp(mlp3,
+                              shard_level="os_g",
+                              use_multi_precision=True,
+                              output_dir=output_dir,
+                              amp_level='O1')
+    stage3_params = train_mlp(mlp4,
+                              shard_level="p_g_os",
+                              use_multi_precision=True,
+                              output_dir=output_dir,
+                              amp_level='O1')
+
     for i in range(len(stage3_params)):
         np.testing.assert_allclose(stage2_params[i].numpy(),
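With the new `amp_level` argument the same `train_mlp` routine now covers both pure-FP16 runs (level 'O2') and multi-precision AMP runs (level 'O1'), and the renamed `use_multi_precision` flag selects the scaler-driven update path. The sketch below shows that update pattern in isolation (single process, no sharding); it relies only on documented `paddle.amp` APIs, and the model and data are stand-ins.

```python
# Single-process sketch of the AMP update pattern the updated test exercises:
# auto_cast for the forward pass, GradScaler.scale/step/update for the
# backward/optimizer step when multi-precision is enabled.
import paddle

model = paddle.nn.Linear(1000, 10)                       # stand-in network
optimizer = paddle.optimizer.Momentum(parameters=model.parameters(),
                                      learning_rate=0.001,
                                      multi_precision=True)
model = paddle.amp.decorate(models=model, level='O1', save_dtype='float32')
scaler = paddle.amp.GradScaler(init_loss_scaling=32768)

img = paddle.rand([4, 1000])                             # stand-in batch
label = paddle.randint(0, 10, shape=[4, 1])

with paddle.amp.auto_cast(True, level='O1'):             # 'O2' for pure FP16
    out = model(img)
    loss = paddle.nn.functional.cross_entropy(input=out, label=label)
avg_loss = paddle.mean(x=loss.cast(dtype=paddle.float32))

scaler.scale(avg_loss).backward()                        # scaled backward
scaler.step(optimizer)                                   # unscale + optimizer step
scaler.update()                                          # adjust loss scaling
optimizer.clear_grad()
```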
python/paddle/fluid/tests/unittests/dygraph_group_sharded_api_eager.py

@@ -61,7 +61,7 @@ def reader_decorator(linear_size=1000):
     return __reader__
 
 
-def optimizer_setting(model, use_pure_fp16, opt_group=False):
+def optimizer_setting(model, use_multi_precision, opt_group=False):
     clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
     optimizer = paddle.optimizer.Momentum(
         parameters=[{
@@ -70,14 +70,21 @@ def optimizer_setting(model, use_pure_fp16, opt_group=False):
         learning_rate=0.001,
         weight_decay=0.00001,
         grad_clip=clip,
-        multi_precision=use_pure_fp16)
+        multi_precision=use_multi_precision)
 
     return optimizer
 
 
-def train_mlp(model, shard_level, use_pure_fp16, output_dir):
-    optimizer = optimizer_setting(model=model, use_pure_fp16=use_pure_fp16)
-    model = paddle.amp.decorate(models=model, level='O2', save_dtype='float32')
+def train_mlp(model,
+              shard_level,
+              use_multi_precision,
+              output_dir,
+              amp_level='O1'):
+    optimizer = optimizer_setting(model=model,
+                                  use_multi_precision=use_multi_precision)
+    model = paddle.amp.decorate(models=model,
+                                level=amp_level,
+                                save_dtype='float32')
 
     scaler = paddle.amp.GradScaler(init_loss_scaling=32768)
     model, optimizer, scaler = group_sharded_parallel(model=model,
@@ -102,13 +109,13 @@ def train_mlp(model, shard_level, use_pure_fp16, output_dir):
             img, label = data
             label.stop_gradient = True
             img.stop_gradient = True
 
-            with paddle.amp.auto_cast(True, level='O2'):
+            with paddle.amp.auto_cast(True, level=amp_level):
                 out = model(img)
                 loss = paddle.nn.functional.cross_entropy(input=out,
                                                           label=label)
 
             avg_loss = paddle.mean(x=loss.cast(dtype=paddle.float32))
-            if not use_pure_fp16:
+            if not use_multi_precision:
                 avg_loss.backward()
                 optimizer.step()
             else:
@@ -134,19 +141,36 @@ def test_sharding_api():
     # fp16
     stage2_params = train_mlp(mlp1,
                               shard_level="os_g",
-                              use_pure_fp16=True,
-                              output_dir=output_dir)
+                              use_multi_precision=True,
+                              output_dir=output_dir,
+                              amp_level='O2')
     stage3_params = train_mlp(mlp2,
                               shard_level="p_g_os",
-                              use_pure_fp16=True,
-                              output_dir=output_dir)
+                              use_multi_precision=True,
+                              output_dir=output_dir,
+                              amp_level='O2')
     for i in range(len(stage3_params)):
         np.testing.assert_allclose(stage2_params[i].numpy(),
                                    stage3_params[i].numpy(),
                                    rtol=1e-4,
                                    atol=1e-3)
 
-    shutil.rmtree(output_dir)
+    # AMP
+    mlp3, mlp4 = MLP(), MLP()
+    mlp3.set_state_dict(state_dict)
+    mlp4.set_state_dict(state_dict)
+    stage2_params = train_mlp(mlp3,
+                              shard_level="os_g",
+                              use_multi_precision=True,
+                              output_dir=output_dir,
+                              amp_level='O1')
+    stage3_params = train_mlp(mlp4,
+                              shard_level="p_g_os",
+                              use_multi_precision=True,
+                              output_dir=output_dir,
+                              amp_level='O1')
 
 
 if __name__ == '__main__':
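Both test files end with the same numerical check: after training, the parameters produced by stage-2 ("os_g") and stage-3 ("p_g_os") sharding must agree within a loose FP16-appropriate tolerance. A minimal stand-alone sketch of that comparison, with stand-in arrays in place of trained parameters:

```python
# Stand-alone sketch of the parameter comparison both tests perform;
# the arrays are stand-ins for the trained stage-2 / stage-3 parameters.
import numpy as np

stage2_params = [np.array([1.0001, 2.0002], dtype=np.float32)]
stage3_params = [np.array([1.0002, 2.0001], dtype=np.float32)]

for i in range(len(stage3_params)):
    np.testing.assert_allclose(stage2_params[i],
                               stage3_params[i],
                               rtol=1e-4,
                               atol=1e-3)
```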