Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
562b184c
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
562b184c
编写于
6月 13, 2022
作者:
Z
zhaoyingli
提交者:
GitHub
6月 13, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[AutoParallel] fix fetch list (#43412)
* fix fetch list * fix unittest
上级
c92b3805
变更
2
隐藏空白更改
内联
并排
Showing
2 changed file
with
121 addition
and
110 deletion
+121
-110
python/paddle/distributed/auto_parallel/engine.py
python/paddle/distributed/auto_parallel/engine.py
+108
-104
python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py
.../paddle/fluid/tests/unittests/auto_parallel/engine_api.py
+13
-6
未找到文件。
python/paddle/distributed/auto_parallel/engine.py
浏览文件 @
562b184c
...
...
@@ -17,6 +17,7 @@ import logging
from
collections
import
defaultdict
import
paddle
import
paddle.utils
as
utils
import
paddle.distributed.auto_parallel
as
auto
from
paddle
import
fluid
,
static
...
...
@@ -26,9 +27,9 @@ from paddle.static import InputSpec
from
paddle.fluid
import
core
from
paddle.fluid
import
program_guard
from
paddle.fluid.layers.utils
import
flatten
from
paddle.fluid.executor
import
global_scope
from
paddle.fluid.executor
import
global_scope
,
_to_name_str
from
paddle.fluid.backward
import
append_backward
from
paddle.fluid.framework
import
Operator
,
Variable
from
paddle.fluid.framework
import
Operator
from
paddle.fluid.framework
import
_current_expected_place
as
_get_device
from
paddle.fluid.dygraph.parallel
import
ParallelEnv
from
paddle.distributed
import
fleet
...
...
@@ -137,7 +138,8 @@ class Engine:
metrics
=
[]
serial_main_prog
=
self
.
_orig_main_prog
.
clone
()
serial_startup_prog
=
self
.
_orig_startup_prog
.
clone
()
with
static
.
program_guard
(
serial_main_prog
,
serial_startup_prog
):
with
static
.
program_guard
(
serial_main_prog
,
serial_startup_prog
),
\
utils
.
unique_name
.
guard
():
inputs_spec
=
self
.
inputs_spec
labels_spec
=
self
.
labels_spec
if
self
.
labels_spec
else
[]
inputs
=
[
s
.
_create_feed_layer
()
for
s
in
inputs_spec
]
...
...
@@ -256,7 +258,7 @@ class Engine:
train_data
,
batch_size
=
1
,
epochs
=
1
,
fetch
_list
=
None
,
fetch
es
=
None
,
steps_per_epoch
=
None
,
use_program_cache
=
False
,
return_numpy
=
True
):
...
...
@@ -267,134 +269,131 @@ class Engine:
"train model is not ready, please call `engine.prepare()` first."
train_dataloader
=
self
.
_create_dataloader
(
train_data
,
batch_size
,
epochs
,
steps_per_epoch
)
self
.
_usr_fetch_list
=
fetch_list
outputs
=
[]
usr_fetch
=
self
.
_to_map_fetch
(
fetches
)
fetch_loss
=
self
.
_inner_fetch
(
self
.
fetch_vars
[
"loss"
])
fetch_list
,
fetch_map
=
self
.
_fetch_map
(
fetch_loss
,
usr_fetch
)
for
epoch
in
range
(
epochs
):
for
step
,
data
in
enumerate
(
train_dataloader
):
logs
,
outs
=
self
.
_train_step
(
data
,
use_program_cache
,
return_numpy
)
outputs
.
append
(
outs
)
train_logs
=
{
"train_"
+
name
:
val
for
name
,
val
in
logs
.
items
()
}
train_logs
=
{
"epoch"
:
epoch
}
for
step
,
_
in
enumerate
(
train_dataloader
):
outs
=
self
.
_executor
.
run
(
self
.
main_program
,
fetch_list
=
fetch_list
,
use_program_cache
=
use_program_cache
,
return_numpy
=
return_numpy
)
train_logs
[
"step"
]
=
step
# inner fetches
if
fetch_loss
:
train_logs
[
"train_loss"
]
=
outs
[
0
][
0
]
# user fetches
user_outs
=
outs
[
len
(
fetch_loss
):]
user_fetch_list
=
fetch_list
[
len
(
fetch_loss
):]
for
i
,
out
in
enumerate
(
user_outs
):
train_logs
[
"train_"
+
fetch_map
[
user_fetch_list
[
i
]]]
=
out
[
0
]
self
.
_logger
.
info
(
train_logs
)
return
outputs
def
evaluate
(
self
,
eval_data
,
batch_size
=
1
,
fetch
_list
=
None
,
fetch
es
=
None
,
use_program_cache
=
False
,
return_numpy
=
True
):
self
.
mode
=
'eval'
assert
self
.
mode
in
self
.
_dist_main_progs
,
\
"eval model is not ready, please call `engine.prepare()` first."
eval_dataloader
=
self
.
_create_dataloader
(
eval_data
,
batch_size
)
self
.
_usr_fetch_list
=
fetch_list
for
step
,
data
in
enumerate
(
eval_dataloader
):
eval_logs
=
dict
()
logs
,
outs
=
self
.
_eval_step
(
data
,
use_program_cache
,
return_numpy
)
eval_logs
[
"eval_loss"
]
=
outs
[
0
]
if
len
(
outs
)
>
0
else
[]
for
metric
in
self
.
_metrics
:
results
=
metric
.
accumulate
()
for
i
,
res
in
enumerate
(
to_list
(
results
)):
eval_logs
[
"eval_"
+
metric
.
name
()[
i
]]
=
res
for
name
,
val
in
logs
.
items
():
eval_logs
[
"eval_"
+
name
]
=
val
usr_fetch
=
self
.
_to_map_fetch
(
fetches
)
fetch_loss
=
self
.
_inner_fetch
(
self
.
fetch_vars
[
"loss"
])
fetch_metrics
=
self
.
_inner_fetch
(
self
.
fetch_vars
[
"metrics"
])
inner_fetch
=
dict
(
fetch_loss
,
**
fetch_metrics
)
fetch_list
,
fetch_map
=
self
.
_fetch_map
(
inner_fetch
,
usr_fetch
)
for
step
,
_
in
enumerate
(
eval_dataloader
):
eval_logs
=
{
"step"
:
step
}
outs
=
self
.
_executor
.
run
(
self
.
main_program
,
fetch_list
=
fetch_list
,
use_program_cache
=
use_program_cache
,
return_numpy
=
return_numpy
)
# inner fetches
if
fetch_loss
:
eval_logs
[
"eval_loss"
]
=
outs
[
0
]
# Metric
if
fetch_metrics
:
metric_out
=
outs
[
len
(
fetch_loss
):
len
(
inner_fetch
)]
for
metric
in
self
.
_metrics
:
metric
.
update
(
*
metric_out
)
results
=
metric
.
accumulate
()
for
i
,
res
in
enumerate
(
to_list
(
results
)):
eval_logs
[
"eval_"
+
metric
.
name
()[
i
]]
=
res
# usr fetches
usr_out
=
outs
[
len
(
inner_fetch
):]
usr_fetch_list
=
fetch_list
[
len
(
inner_fetch
):]
for
i
,
out
in
enumerate
(
usr_out
):
eval_logs
[
"eval_"
+
fetch_map
[
usr_fetch_list
[
i
]]]
=
out
# logger
self
.
_logger
.
info
(
eval_logs
)
return
eval_logs
def
predict
(
self
,
test_data
,
batch_size
=
1
,
fetch
_list
=
None
,
fetch
es
=
None
,
use_program_cache
=
False
,
return_numpy
=
True
):
self
.
mode
=
'predict'
assert
self
.
mode
in
self
.
_dist_main_progs
,
\
"predict model is not ready, please call `engine.prepare()` first."
test_dataloader
=
self
.
_create_dataloader
(
test_data
,
batch_size
)
self
.
_usr_fetch_list
=
fetch_list
usr_fetch
=
self
.
_to_map_fetch
(
fetches
)
fetch_outputs
=
self
.
_inner_fetch
(
self
.
fetch_vars
[
"outputs"
])
fetch_list
,
fetch_map
=
self
.
_fetch_map
(
fetch_outputs
,
usr_fetch
)
outputs
=
[]
for
step
,
data
in
enumerate
(
test_dataloader
):
logs
,
outs
=
self
.
_predict_step
(
data
,
use_program_cache
,
return_numpy
)
outputs
.
append
(
outs
)
predict_logs
=
{
"pred_"
+
name
:
val
for
name
,
val
in
logs
.
items
()}
for
step
,
_
in
enumerate
(
test_dataloader
):
predict_logs
=
{
"step"
:
step
}
outs
=
self
.
_executor
.
run
(
self
.
main_program
,
fetch_list
=
fetch_list
,
use_program_cache
=
use_program_cache
,
return_numpy
=
return_numpy
)
outputs
.
append
(
outs
[:
len
(
fetch_outputs
)])
for
i
,
out
in
enumerate
(
outs
):
predict_logs
[
"pred_"
+
fetch_map
[
fetch_list
[
i
]]]
=
out
[
0
]
self
.
_logger
.
info
(
predict_logs
)
return
outputs
def
_train_step
(
self
,
data
,
use_program_cache
=
False
,
return_numpy
=
True
):
logs
=
{}
fetch_vars
=
self
.
_fetch_vars
[
self
.
mode
][
"loss"
]
fetch_list
,
usr_fetch_list
=
self
.
_fetch_list
(
fetch_vars
)
fetch_list
+=
usr_fetch_list
outs
=
self
.
_executor
.
run
(
self
.
main_program
,
fetch_list
=
fetch_list
,
use_program_cache
=
use_program_cache
,
return_numpy
=
return_numpy
)
for
i
,
out
in
enumerate
(
outs
):
logs
[
fetch_list
[
i
]]
=
out
return
logs
,
outs
def
_eval_step
(
self
,
data
,
use_program_cache
=
False
,
return_numpy
=
True
):
logs
=
{}
metrics
=
self
.
_fetch_vars
[
self
.
mode
][
"metrics"
]
losses
=
self
.
_fetch_vars
[
self
.
mode
][
"loss"
]
fetch_loss
,
usr_fetch_list
=
self
.
_fetch_list
(
losses
)
fetch_metrics
,
usr_fetch_list
=
self
.
_fetch_list
(
metrics
)
fetch_list
=
fetch_loss
+
fetch_metrics
outs
=
self
.
_executor
.
run
(
self
.
main_program
,
fetch_list
=
fetch_list
+
usr_fetch_list
,
use_program_cache
=
use_program_cache
,
return_numpy
=
return_numpy
)
usr_out
=
outs
[
len
(
fetch_list
):]
for
i
,
out
in
enumerate
(
usr_out
):
logs
[
usr_fetch_list
[
i
]]
=
out
outs
=
outs
[:
len
(
fetch_list
)]
if
not
outs
[
len
(
fetch_loss
):]:
return
logs
,
outs
[:
len
(
fetch_loss
)]
for
metric
in
self
.
_metrics
:
metric
.
update
(
*
outs
[
len
(
fetch_loss
):])
return
logs
,
outs
[:
len
(
fetch_loss
)]
def
_predict_step
(
self
,
data
,
use_program_cache
=
False
,
return_numpy
=
True
):
logs
=
{}
fetch_vars
=
self
.
_fetch_vars
[
self
.
mode
][
"outputs"
]
fetch_list
,
usr_fetch_list
=
self
.
_fetch_list
(
fetch_vars
)
fetch_list
+=
usr_fetch_list
outs
=
self
.
_executor
.
run
(
self
.
main_program
,
fetch_list
=
fetch_list
,
use_program_cache
=
use_program_cache
,
return_numpy
=
return_numpy
)
for
i
,
out
in
enumerate
(
outs
):
logs
[
fetch_list
[
i
]]
=
out
return
logs
,
outs
def
_fetch_list
(
self
,
fetch_vars
):
fetch_list
=
[]
for
var
in
fetch_vars
:
if
var
.
name
in
self
.
main_program
.
global_block
().
vars
:
fetch_list
.
append
(
var
.
name
)
usr_fetch_list
=
[]
if
self
.
_usr_fetch_list
:
assert
isinstance
(
self
.
_usr_fetch_list
,
list
),
"'fetch_list' type should be list."
for
var
in
self
.
_usr_fetch_list
:
if
isinstance
(
var
,
str
):
if
var
in
self
.
main_program
.
global_block
().
vars
:
usr_fetch_list
.
append
(
var
)
elif
isinstance
(
var
,
Variable
):
if
var
.
name
in
self
.
main_program
.
global_block
().
vars
:
usr_fetch_list
.
append
(
var
.
name
)
return
fetch_list
,
usr_fetch_list
def
_local_var
(
self
,
var
):
var_name
=
_to_name_str
(
var
)
return
var_name
in
self
.
main_program
.
global_block
().
vars
def
_to_map_fetch
(
self
,
fetches
):
if
not
fetches
:
return
{}
if
isinstance
(
fetches
,
dict
):
fetch_var_names
=
list
(
map
(
_to_name_str
,
fetches
.
values
()))
usr_fetches
=
dict
(
zip
(
fetch_var_names
,
list
(
fetches
.
keys
())))
elif
isinstance
(
fetches
,
list
):
fetch_var_names
=
list
(
map
(
_to_name_str
,
fetches
))
usr_fetches
=
dict
(
zip
(
fetch_var_names
,
fetch_var_names
))
return
dict
(
filter
(
lambda
x
:
self
.
_local_var
(
x
[
0
]),
usr_fetches
.
items
()))
def
_inner_fetch
(
self
,
fetch_vars
):
fetch_list
=
list
(
map
(
lambda
x
:
x
.
name
,
list
(
filter
(
self
.
_local_var
,
fetch_vars
))))
inner_fetches
=
dict
(
zip
(
fetch_list
,
fetch_list
))
return
inner_fetches
def
_fetch_map
(
self
,
inner_fetch
,
usr_fetch
):
# replace inner fetch name if usr set for it
for
iname
in
inner_fetch
:
if
iname
in
usr_fetch
:
inner_fetch
[
iname
]
=
usr_fetch
[
iname
]
usr_fetch
.
pop
(
iname
)
fetches
=
dict
(
inner_fetch
,
**
usr_fetch
)
return
list
(
fetches
.
keys
()),
fetches
def
_create_dataloader
(
self
,
dataset
,
...
...
@@ -515,7 +514,8 @@ class Engine:
mode
=
self
.
mode
if
training
:
assert
'train'
in
self
.
_serial_main_progs
,
"training model is not ready, please call `engine.prepare(mode='train')` first."
assert
'train'
in
self
.
_serial_main_progs
,
\
"training model is not ready, please call `engine.prepare()` first."
serial_program
=
self
.
_serial_main_progs
[
"train"
]
dist_main_prog
=
self
.
_dist_main_progs
[
"train"
][
self
.
_cur_rank
]
dist_context
=
self
.
_dist_contexts
[
"train"
]
...
...
@@ -571,3 +571,7 @@ class Engine:
@
property
def
serial_startup_program
(
self
):
return
self
.
_serial_startup_progs
[
self
.
mode
]
@
property
def
fetch_vars
(
self
):
return
self
.
_fetch_vars
[
self
.
mode
]
python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py
浏览文件 @
562b184c
...
...
@@ -96,10 +96,11 @@ class MLPLayer(nn.Layer):
PP_MESH_1
})(
out
)[
0
]
out
=
self
.
dropout
(
out
)
out
=
self
.
linear2
(
out
)
self
.
out
=
out
return
out
def
train
():
def
train
(
fetch
):
mlp
=
MLPLayer
(
hidden_size
=
hidden_size
,
intermediate_size
=
4
*
hidden_size
,
dropout_ratio
=
0.1
,
...
...
@@ -118,7 +119,6 @@ def train():
dist_strategy
.
amp
=
False
dist_strategy
.
pipeline
=
False
dist_strategy
.
recompute
=
False
# init parallel optimizer
dist_strategy
.
semi_auto
=
True
fleet
.
init
(
is_collective
=
True
,
strategy
=
dist_strategy
)
...
...
@@ -129,20 +129,26 @@ def train():
strategy
=
dist_strategy
)
engine
.
prepare
(
optimizer
,
loss
,
metrics
=
paddle
.
metric
.
Accuracy
())
# fetch
if
fetch
:
fetches
=
{
'out'
:
mlp
.
out
}
else
:
fetches
=
None
# train
train_dataset
=
MyDataset
(
batch_num
*
batch_size
)
engine
.
fit
(
train_dataset
,
batch_size
=
batch_size
,
steps_per_epoch
=
batch_num
*
batch_size
,
fetch
_list
=
[
'label'
]
)
fetch
es
=
fetches
)
# eval
eval_dataset
=
MyDataset
(
batch_size
)
engine
.
evaluate
(
eval_dataset
,
batch_size
,
fetch
_list
=
[
'label'
]
)
engine
.
evaluate
(
eval_dataset
,
batch_size
,
fetch
es
=
fetches
)
# predict
test_dataset
=
MyDataset
(
batch_size
)
engine
.
predict
(
test_dataset
,
batch_size
,
fetch
_list
=
[
'label'
]
)
engine
.
predict
(
test_dataset
,
batch_size
,
fetch
es
=
fetches
)
# save
temp_dir
=
tempfile
.
TemporaryDirectory
()
...
...
@@ -152,4 +158,5 @@ def train():
if
__name__
==
"__main__"
:
train
()
train
(
fetch
=
True
)
train
(
fetch
=
False
)
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录