Crayon鑫 / Paddle (forked from PaddlePaddle / Paddle)

Commit 90f44c6f (unverified)
fix_stage2_minimize (#39285)

Authored on Jan 28, 2022 by Baibaifan; committed by GitHub on Jan 28, 2022.
Parent: 0bb3e5f1

Showing 4 changed files with 45 additions and 7 deletions (+45, -7):
python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py  (+5, -1)
python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py  (+6, -0)
python/paddle/fluid/tests/unittests/dygraph_sharding_optimizer_stage2.py  (+11, -2)
python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py  (+23, -4)
python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py

@@ -70,7 +70,7 @@ class ShardingOptimizerStage2(Optimizer):
                  device="gpu",
                  **kw):

-        # super().__init__(optim._learning_rate, params, kw)
+        super().__init__(optim._learning_rate, params, kw)

         # Segmentation information
         self._dtype_rank_params = OrderedDict(

@@ -363,6 +363,10 @@ class ShardingOptimizerStage2(Optimizer):
         # Synchronize all the updated shards in between the ranks
         self._broadcast_params()

+    def minimize(self):
+        raise RuntimeError(
+            "optimizer.minimize() not support now, please use optimizer.step()"
+        )
+
     def _clear_cache(self):
         self.__segment_params.clear()
         self._dtype_rank_params.clear()
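The net effect of this change is that ShardingOptimizerStage2 now rejects optimizer.minimize() outright and only supports the step()-based update path. Below is a minimal sketch of that usage pattern in plain single-process dygraph code; the toy paddle.nn.Linear model and SGD optimizer are assumptions for illustration, not part of this diff.

# A minimal sketch (plain single-process dygraph, not the sharded setup in
# this diff) of the usage this change enforces: drive parameter updates
# with optimizer.step(), never optimizer.minimize().
import paddle

model = paddle.nn.Linear(4, 2)  # hypothetical toy model
opt = paddle.optimizer.SGD(learning_rate=0.1,
                           parameters=model.parameters())

x = paddle.randn([8, 4])
loss = model(x).mean()
loss.backward()

opt.step()        # the supported update path
opt.clear_grad()
# opt.minimize(loss) is what the stage2/stage3 optimizers now reject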
python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py

@@ -506,7 +506,13 @@ class ShardingStage3(nn.Layer):
             else:
                 opt_step()

+        def _opt_minimize(self):
+            raise RuntimeError(
+                "optimizer.minimize() not support now, please use optimizer.step()"
+            )
+
         self._optim.step = MethodType(_opt_step, self._optim)
+        self._optim.minimize = MethodType(_opt_minimize, self._optim)

     def _redefine_opt_clear(self):
         clear_func = self._clear_gradients
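The stage3 wrapper blocks minimize() per instance rather than per class: it binds a raising function onto the wrapped optimizer object with types.MethodType. Below is a self-contained sketch of that binding pattern; Counter is a hypothetical stand-in for the wrapped optimizer (self._optim).

# Self-contained sketch of the types.MethodType pattern used above to
# override a method on one existing instance.
from types import MethodType


class Counter:
    def __init__(self):
        self.n = 0

    def step(self):
        self.n += 1


def _blocked(self):
    raise RuntimeError("minimize() not supported here, use step()")


c = Counter()
# Bind _blocked as a bound method of this one instance only; other
# Counter instances keep their original behavior.
c.minimize = MethodType(_blocked, c)

c.step()
try:
    c.minimize()
except RuntimeError as e:
    print(e)  # -> minimize() not supported here, use step()

Binding per instance is exactly what the wrapper needs: only the optimizer it manages loses minimize(), while the optimizer class itself stays untouched.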
python/paddle/fluid/tests/unittests/dygraph_sharding_optimizer_stage2.py

@@ -124,8 +124,17 @@ def train_mlp():
         avg_loss.backward()

         oss_optimizer.step()

-        # oss_optimizer clear cache
-        oss_optimizer._clear_cache()
+        # oss_optimizer clear cache
+        oss_optimizer._clear_cache()
+
+    # check optimizer.minimize() error
+    try:
+        oss_optimizer.minimize()
+    except:
+        print(
+            "====== Find sharding_stage2_optimizer.minimize() error ======")
+    return


 if __name__ == '__main__':
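The test probes the blocked call with a bare try/except and a print. As a hedged, self-contained alternative sketch: unittest's assertRaises would fail the test automatically if minimize() ever stopped raising. _FakeStage2Optimizer below is a hypothetical stand-in for the real optimizer this test file builds, not code from the diff.

# Hedged alternative to the print-based probe above: assertRaises makes
# the check a hard test failure instead of a log line.
import unittest


class _FakeStage2Optimizer:
    def minimize(self):
        raise RuntimeError(
            "optimizer.minimize() not support now, please use optimizer.step()")


class MinimizeBlockedTest(unittest.TestCase):
    def test_minimize_raises(self):
        opt = _FakeStage2Optimizer()
        with self.assertRaises(RuntimeError):
            opt.minimize()


if __name__ == '__main__':
    unittest.main()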
python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py

@@ -83,7 +83,8 @@ def train_mlp(model,
               accumulate_grad=False,
               batch_size=100,
               opt_group=False,
-              recompute=False):
+              recompute=False,
+              test_minimize=False):
     group = paddle.distributed.new_group([0, 1])
     if opt_group:
         optimizer = optimizer_setting(
@@ -113,6 +114,15 @@ def train_mlp(model,
             accumulate_grads=batch_size == 20,
             sync_comm=recompute)

+    # check optimizer.minimize() error
+    if test_minimize:
+        try:
+            optimizer.minimize()
+        except:
+            print(
+                "====== Find sharding_stage3_optimizer.minimize() error ======"
+            )
+        return
+
     train_reader = paddle.batch(
         reader_decorator(), batch_size=batch_size, drop_last=True)
@@ -160,8 +170,8 @@ def train_mlp(model,
 def test_stage2_stage3():
-    mlp, mlp1, mlp2, mlp3, mlp4, mlp5, mlp6, mlp7, mlp8 = MLP(), MLP(), MLP(
-    ), MLP(), MLP(), MLP(), MLP(), MLP(), MLP()
+    mlp, mlp1, mlp2, mlp3, mlp4, mlp5, mlp6, mlp7, mlp8, mlp9 = MLP(), MLP(
+    ), MLP(), MLP(), MLP(), MLP(), MLP(), MLP(), MLP(), MLP()
     state_dict = mlp.state_dict()
     mlp1.set_state_dict(state_dict)
     mlp2.set_state_dict(state_dict)
@@ -171,6 +181,8 @@ def test_stage2_stage3():
     mlp6.set_state_dict(state_dict)
     mlp7.set_state_dict(state_dict)
     mlp8.set_state_dict(state_dict)
+    mlp9.set_state_dict(state_dict)
+
     # fp32
     stage2_params = train_mlp(
         mlp1, sharding_stage=2, use_pure_fp16=False, opt_group=False)
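The two hunks above extend the test's weight-sharing setup to the new mlp9 replica. Below is a minimal sketch of that seeding pattern; paddle.nn.Linear stands in for the test's MLP class (an assumption for illustration). Every replica loads the same state_dict so all training runs start from identical parameters and their results stay comparable.

# Minimal sketch of the replica-seeding pattern: every model copy loads
# the same state_dict so runs start from identical weights.
import paddle

src = paddle.nn.Linear(4, 4)
state_dict = src.state_dict()

replicas = [paddle.nn.Linear(4, 4) for _ in range(3)]
for m in replicas:
    m.set_state_dict(state_dict)  # identical starting parameters

# with identical weights, identical inputs give identical outputs
x = paddle.ones([2, 4])
outs = [m(x) for m in replicas]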
@@ -229,7 +241,14 @@ def test_stage2_stage3():
     for i in range(len(stage3_params)):
         np.testing.assert_allclose(
             stage3_params[i].numpy(),
             stage3_params_re[i].numpy(),
             rtol=1e-6)
     return

+    # check optimizer.minimize() error
+    train_mlp(
+        mlp9,
+        sharding_stage=3,
+        use_pure_fp16=False,
+        opt_group=False,
+        test_minimize=True)


 if __name__ == '__main__':
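The final hunk wires the check into the test entry point: train_mlp is called once more with test_minimize=True, which short-circuits before any training happens. A condensed, self-contained sketch of that gating pattern follows; BlockedOpt is a hypothetical stand-in for the patched stage3 optimizer.

# Condensed sketch of the test_minimize gate added above: the flag
# short-circuits the training helper so the blocked minimize() path is
# probed without running a training loop.
class BlockedOpt:
    def minimize(self):
        raise RuntimeError(
            "optimizer.minimize() not support now, please use optimizer.step()")


def train(optimizer, test_minimize=False):
    if test_minimize:
        try:
            optimizer.minimize()
        except RuntimeError as e:
            print("====== caught:", e, "======")
        return
    # the normal training loop would run here


train(BlockedOpt(), test_minimize=True)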