Commit a501b8c8
Authored Mar 25, 2020 by LielinJiang

refine code

Parents: 4bccb6aa, b1862cf7

Showing 5 changed files with 241 additions and 104 deletions (+241 −104)
Changed files:
- callbacks.py  +13 −8
- distributed.py  +99 −1
- mnist.py  +11 −9
- model.py  +111 −81
- tests/test_model.py  +7 −5
callbacks.py (view file @ a501b8c8)

```diff
@@ -18,6 +18,7 @@ import copy
 from progressbar import ProgressBar
 from paddle.fluid.dygraph.parallel import Env


 def config_callbacks(callbacks=None,
                      model=None,
                      batch_size=None,
@@ -26,6 +27,7 @@ def config_callbacks(callbacks=None,
                      log_freq=2,
                      verbose=2,
                      save_freq=1,
+                     save_dir=None,
                      metrics=None,
                      mode='train'):
     cbks = callbacks or []
@@ -34,7 +36,7 @@ def config_callbacks(callbacks=None,
         cbks = cbks + [ProgBarLogger(log_freq, verbose=verbose)]
     if not any(isinstance(k, ModelCheckpoint) for k in cbks):
-        cbks = cbks + [ModelCheckpoint(save_freq)]
+        cbks = cbks + [ModelCheckpoint(save_freq, save_dir)]
     cbk_list = CallbackList(cbks)
     cbk_list.set_model(model)
@@ -209,7 +211,7 @@ class ProgBarLogger(Callback):
     def on_train_batch_end(self, step, logs=None):
         logs = logs or {}
-        self.train_step = step
+        self.train_step += 1
         if self.train_step % self.log_freq == 0 and self.verbose and Env().local_rank == 0:
             # if steps is not None, last step will update in on_epoch_end
@@ -247,21 +249,24 @@ class ProgBarLogger(Callback):
 class ModelCheckpoint(Callback):
-    def __init__(self, save_freq=1, save_file='output'):
+    def __init__(self, save_freq=1, save_dir=None):
         self.save_freq = save_freq
-        self.save_file = save_file
+        self.save_dir = save_dir

     def on_epoch_begin(self, epoch=None, logs=None):
         self.epoch = epoch

+    def _is_save(self):
+        return self.model and self.save_dir and Env().local_rank == 0
+
     def on_epoch_end(self, epoch, logs=None):
-        if self.model and self.epoch % self.save_freq == 0 and Env().local_rank == 0:
-            path = '{}/{}'.format(self.save_file, epoch)
+        if self._is_save() and self.epoch % self.save_freq == 0:
+            path = '{}/{}'.format(self.save_dir, epoch)
             print('save checkpoint at {}'.format(path))
             self.model.save(path)

     def on_train_end(self, logs=None):
-        if self.model and Env().local_rank == 0:
-            path = '{}/final'.format(self.save_file)
+        if self._is_save():
+            path = '{}/final'.format(self.save_dir)
             print('save checkpoint at {}'.format(path))
             self.model.save(path)
```
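For orientation, here is a minimal sketch, not part of the commit, of the checkpoint layout the reworked `ModelCheckpoint` produces: `<save_dir>/<epoch>` every `save_freq` epochs and `<save_dir>/final` when training ends. The `StubModel` class below is purely illustrative; the real callback is handed the hapi `Model` by `config_callbacks`.

```python
# Illustrative only: mirrors the path scheme in ModelCheckpoint above.
# StubModel stands in for the hapi Model that the callback normally holds.
class StubModel(object):
    def save(self, path):
        print('would write checkpoint to {}'.format(path))

save_dir = 'mnist_checkpoint'   # same directory the mnist example passes
save_freq = 1
model = StubModel()

for epoch in range(3):
    # on_epoch_end: one snapshot per save_freq epochs, named after the epoch
    if epoch % save_freq == 0:
        model.save('{}/{}'.format(save_dir, epoch))

# on_train_end: a last snapshot under <save_dir>/final
model.save('{}/final'.format(save_dir))
```

Note that `_is_save()` also checks `Env().local_rank == 0`, so in multi-GPU runs only the first process writes these files.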
distributed.py (view file @ a501b8c8)

```diff
@@ -13,6 +13,7 @@
 # limitations under the License.
 import os
 import sys
+import six
 import time
 import math
 import socket
@@ -21,10 +22,13 @@ import numpy as np
 from paddle import fluid
 from paddle.fluid.layers import collective
-from paddle.fluid.dygraph.parallel import Env
+from paddle.fluid.dygraph.parallel import Env, ParallelStrategy
 from paddle.fluid.io import BatchSampler

+_parallel_context_initialized = False
+

 class DistributedBatchSampler(BatchSampler):
     """Sampler that restricts data loading to a subset of the dataset.
@@ -100,3 +104,97 @@ class DistributedBatchSampler(BatchSampler):

 def _all_gather(x, nranks, ring_id=0, use_calc_stream=True):
     return collective._c_allgather(x, nranks, ring_id=ring_id, use_calc_stream=use_calc_stream)
+
+
+def wait_server_ready(endpoints):
+    assert not isinstance(endpoints, six.string_types)
+    while True:
+        all_ok = True
+        not_ready_endpoints = []
+        for ep in endpoints:
+            ip_port = ep.split(":")
+            with contextlib.closing(
+                    socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
+                sock.settimeout(2)
+                result = sock.connect_ex((ip_port[0], int(ip_port[1])))
+                if result != 0:
+                    all_ok = False
+                    not_ready_endpoints.append(ep)
+        if not all_ok:
+            time.sleep(3)
+        else:
+            break
+
+
+def init_communicator(program, rank, nranks, wait_port, current_endpoint,
+                      endpoints):
+    if nranks < 2:
+        return
+    other_endpoints = endpoints[:]
+    other_endpoints.remove(current_endpoint)
+    if rank == 0 and wait_port:
+        wait_server_ready(other_endpoints)
+    block = program.global_block()
+    nccl_id_var = block.create_var(
+        name=fluid.unique_name.generate('nccl_id'),
+        persistable=True,
+        type=fluid.core.VarDesc.VarType.RAW)
+
+    block.append_op(
+        type='c_gen_nccl_id',
+        inputs={},
+        outputs={'Out': nccl_id_var},
+        attrs={
+            'rank': rank,
+            'endpoint': current_endpoint,
+            'other_endpoints': other_endpoints
+        })
+
+    block.append_op(
+        type='c_comm_init',
+        inputs={'X': nccl_id_var},
+        outputs={},
+        attrs={
+            'nranks': nranks,
+            'rank': rank,
+            'ring_id': 0,
+        })
+
+
+def prepare_distributed_context(place=None):
+    if place is None:
+        place = fluid.CUDAPlace(Env().dev_id) if Env().nranks > 1 \
+            else fluid.CUDAPlace(0)
+
+    strategy = ParallelStrategy()
+    strategy.nranks = Env().nranks
+    strategy.local_rank = Env().local_rank
+    strategy.trainer_endpoints = Env().trainer_endpoints
+    strategy.current_endpoint = Env().current_endpoint
+
+    if strategy.nranks < 2:
+        return
+
+    global _parallel_context_initialized
+
+    if not _parallel_context_initialized and isinstance(place, fluid.CUDAPlace):
+
+        def _init_context():
+            communicator_prog = fluid.Program()
+            init_communicator(communicator_prog, strategy.local_rank,
+                              strategy.nranks, True, strategy.current_endpoint,
+                              strategy.trainer_endpoints)
+            exe = fluid.Executor(place)
+            exe.run(communicator_prog)
+
+        if fluid.in_dygraph_mode():
+            fluid.disable_dygraph()
+            _init_context()
+            fluid.enable_dygraph(place)
+        else:
+            _init_context()
+    else:
+        assert ("Only support CUDAPlace for now.")
+
+    _parallel_context_initialized = True
+    return strategy
\ No newline at end of file
```
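The `Env()` fields that `prepare_distributed_context` copies into the `ParallelStrategy` (nranks, local_rank, endpoints) come from the process environment that Paddle's multi-process launcher sets up. Here is a rough, hedged sketch of where those values come from, assuming the conventional variable names used by `paddle.distributed.launch` rather than anything defined in this commit:

```python
import os

# Hedged sketch: read the trainer topology the way Env() is expected to.
# The environment variable names below are an assumption about the launcher,
# not something introduced by this diff.
nranks = int(os.environ.get('PADDLE_TRAINERS_NUM', '1'))
local_rank = int(os.environ.get('PADDLE_TRAINER_ID', '0'))
current_endpoint = os.environ.get('PADDLE_CURRENT_ENDPOINT', '127.0.0.1:6170')
trainer_endpoints = os.environ.get('PADDLE_TRAINER_ENDPOINTS',
                                   current_endpoint).split(',')

print(nranks, local_rank, current_endpoint, trainer_endpoints)
```

With a single process (`strategy.nranks < 2`) the function returns early, so the single-GPU path is unaffected by the new NCCL setup.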
mnist.py (view file @ a501b8c8)

```diff
@@ -107,24 +107,26 @@ class MNIST(Model):
 def main():
     init_context('dynamic' if FLAGS.dynamic else 'static')

     train_dataset = MnistDataset(mode='train')
     val_dataset = MnistDataset(mode='test')

     inputs = [Input([None, 784], 'float32', name='image')]
     labels = [Input([None, 1], 'int64', name='label')]

     model = MNIST()
     optim = Momentum(
         learning_rate=FLAGS.lr,
         momentum=.9,
         parameter_list=model.parameters())

     model.prepare(optim, CrossEntropy(), Accuracy(topk=(1, 2)), inputs, labels)

     if FLAGS.resume is not None:
         model.load(FLAGS.resume)

-    model.fit(train_dataset, val_dataset, epochs=FLAGS.epoch, batch_size=FLAGS.batch_size)
+    model.fit(train_dataset,
+              val_dataset,
+              epochs=FLAGS.epoch,
+              batch_size=FLAGS.batch_size,
+              save_dir='mnist_checkpoint')


 if __name__ == '__main__':
```
model.py (view file @ a501b8c8)

```diff
@@ -21,7 +21,7 @@ import numpy as np
 import six
 import warnings

-from collections import Iterable, OrderedDict
+from collections import Iterable
 from paddle import fluid
 from paddle.fluid.framework import in_dygraph_mode, Variable
 from paddle.fluid.executor import global_scope
@@ -32,14 +32,12 @@ from paddle.fluid.incubate.fleet.collective import fleet, DistributedStrategy
 from paddle.fluid.incubate.fleet.base import role_maker
 from paddle.fluid.io import DataLoader

-from distributed import DistributedBatchSampler, _all_gather
+from distributed import DistributedBatchSampler, _all_gather, prepare_distributed_context, _parallel_context_initialized
 from metrics import Metric
 from callbacks import config_callbacks

 __all__ = ['Model', 'Loss', 'CrossEntropy', 'Input']

-_parallel_context_inited = False
-

 def to_list(value):
     if value is None:
@@ -142,9 +140,13 @@ class StaticGraphAdapter(object):
         self._progs = {}
         self._compiled_progs = {}

-        self._merge_count = {'eval_total': 0, 'test_total': 0,
-                             'eval_batch': 0, 'test_batch': 0}
+        self._merge_count = {
+            'eval_total': 0,
+            'test_total': 0,
+            'eval_batch': 0,
+            'test_batch': 0
+        }

         self._nranks = Env().nranks
         self._local_rank = Env().local_rank
@@ -251,7 +253,8 @@ class StaticGraphAdapter(object):
                 # When using static learning rate, static-graph would make it
                 # a persistable var named 'unique_name.generate("learning_rate")',
                 # However, dygraph wouldn't save it.
-                if var.name not in state: continue
+                if var.name not in state:
+                    continue
             else:
                 # moment and other accumulators
                 if var.name not in converted_state:
@@ -350,16 +353,19 @@ class StaticGraphAdapter(object):
         for metric, state in zip(self.model._metrics, metric_states):
             # cut off padding size
             if self.mode != 'train' and self.model._test_dataloader is not None \
                     and isinstance(self.model._test_dataloader, DataLoader) \
                     and self._nranks > 1:
                 total_size = len(self.model._test_dataloader.dataset)
                 # TODO: fixme if have better way to get batch size
                 samples = state[0].shape[0]
                 current_count = self._merge_count.get(self.mode + '_total', 0)
                 if current_count + samples >= total_size:
                     state = [s[:total_size - current_count, ...] for s in state]
                     self._merge_count[self.mode + '_total'] = 0
                     self._merge_count[self.mode + '_batch'] = total_size - current_count
                 else:
                     self._merge_count[self.mode + '_total'] += samples
                     self._merge_count[self.mode + '_batch'] = samples
```
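The `_merge_count` bookkeeping in the hunks above exists because, in multi-card evaluation, `_all_gather` returns `nranks * batch_size` rows even for the last, padded batch; the adapter therefore tracks how many real samples it has already merged and trims the gathered tensors back to the dataset size. A small, self-contained sketch of that trim, with made-up numbers and a plain list standing in for a gathered tensor:

```python
# Illustrative numbers: 10 real samples, 4 cards, per-card batch of 2,
# so each gathered step yields 4 * 2 = 8 rows, padded past the dataset end.
total_size = 10
gathered_batches = [list(range(8)), list(range(8))]   # 16 rows gathered in all

current_count = 0
for rows in gathered_batches:
    samples = len(rows)
    if current_count + samples >= total_size:
        rows = rows[:total_size - current_count]        # drop padded rows
        current_count = 0                               # reset for the next pass
    else:
        current_count += samples
    print(len(rows), 'rows kept this step')             # 8 then 2 -> 10 total
```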
model.py (continued):

```diff
@@ -388,17 +394,17 @@ class StaticGraphAdapter(object):
         for op in list(prog.global_block().ops):
             prog.global_block()._remove_op(0)
         if mode == 'train' and self.model._optimizer \
                 and self.model._optimizer._learning_rate_map:
             # HACK workaround learning rate map issue
             lr_var = self.model._optimizer._learning_rate_map[self._orig_prog]
             self.model._optimizer._learning_rate_map[prog] = lr_var

         losses = []
         metrics = []
         with fluid.program_guard(prog, self._startup_prog):
             if isinstance(self.model._inputs, dict):
-                ins = [self.model._inputs[n] \
+                ins = [self.model._inputs[n]
                        for n in extract_args(self.model.forward) if n != 'self']
             else:
                 ins = self.model._inputs
             lbls = self.model._labels if self.model._labels else []
@@ -408,16 +414,17 @@ class StaticGraphAdapter(object):
             outputs = to_list(self.model.forward(*inputs))
             if mode != 'test' and self.model._loss_function:
                 losses = self.model._loss_function(outputs, labels)

             if self._nranks > 1 and mode != 'train':
                 outputs = [_all_gather(o, self._nranks) for o in outputs]
                 if mode != 'test':
                     labels = [_all_gather(l, self._nranks) for l in labels]

             if mode != 'test':
                 for metric in self.model._metrics:
                     metrics.append(to_list(metric.add_metric_op(outputs, labels)))

             if mode == 'train' and self.model._optimizer:
                 self._loss_endpoint = fluid.layers.sum(losses)
@@ -427,16 +434,16 @@ class StaticGraphAdapter(object):
                     dist_strategy = DistributedStrategy()
                     dist_strategy.mode = "collective"
                     dist_strategy.collective_mode = "grad_allreduce"
-                    self.model._optimizer = fleet.distributed_optimizer(self.model._optimizer,
-                        strategy=dist_strategy)
+                    self.model._optimizer = fleet.distributed_optimizer(
+                        self.model._optimizer, strategy=dist_strategy)

                 self.model._optimizer.minimize(self._loss_endpoint)

             if mode != 'train':  # clone again to put it in test mode
                 prog = prog.clone(for_test=True)

         self._input_vars[mode] = inputs
         self._progs[mode] = prog
         self._endpoints[mode] = {
             "output": outputs,
@@ -444,7 +451,6 @@ class StaticGraphAdapter(object):
             "metric": metrics
         }

     def _compile_and_initialize(self, prog, mode):
         compiled_prog = self._compiled_progs.get(mode, None)
         if compiled_prog is not None:
@@ -464,7 +470,8 @@ class StaticGraphAdapter(object):
         if self._executor is None:
             if self._nranks > 1 and device.lower() == 'gpu':
                 gpu_id = int(Env().dev_id)
                 place = fluid.CUDAPlace(gpu_id) if device.lower() == 'gpu' else fluid.CPUPlace()
             else:
                 place = places[0]
             self._executor = fluid.Executor(place)
@@ -473,7 +480,7 @@ class StaticGraphAdapter(object):
             for var_py in self._startup_prog.list_vars():
                 var = fluid.global_scope().find_var(var_py.name)
                 if not var_py.name.startswith('nccl_id') and var and \
                         var.get_tensor()._is_initialized():
                     continue
                 uninitialized.append(var_py)
@@ -484,7 +491,7 @@ class StaticGraphAdapter(object):
         if self._nranks < 2:
             compiled_prog = fluid.CompiledProgram(prog)
         else:
-            compiled_prog = prog #fleet.main_program
+            compiled_prog = prog

         if len(places) > 1:
             loss_name = None
@@ -501,8 +508,12 @@ class DynamicGraphAdapter(object):
         self.model = model
         self._nranks = Env().nranks
         self._local_rank = Env().local_rank
-        self._merge_count = {'eval_total': 0, 'test_total': 0,
-                             'eval_batch': 0, 'test_batch': 0}
+        self._merge_count = {
+            'eval_total': 0,
+            'test_total': 0,
+            'eval_batch': 0,
+            'test_batch': 0
+        }

         if self._nranks > 1:
             stradegy = fluid.dygraph.parallel.ParallelStrategy()
@@ -510,7 +521,8 @@ class DynamicGraphAdapter(object):
             stradegy.local_rank = Env().local_rank
             stradegy.trainer_endpoints = Env().trainer_endpoints
             stradegy.current_endpoint = Env().current_endpoint
             self.ddp_model = fluid.dygraph.parallel.DataParallel(self.model, stradegy)

     @property
     def mode(self):
@@ -546,12 +558,13 @@ class DynamicGraphAdapter(object):
             self.model.clear_gradients()
         metrics = []
         for metric in self.model._metrics:
             metric_outs = metric.add_metric_op(to_list(outputs), to_list(labels))
             m = metric.update(*[to_numpy(m) for m in to_list(metric_outs)])
             metrics.append(m)

         return ([to_numpy(l) for l in losses], metrics) \
             if len(metrics) > 0 else [to_numpy(l) for l in losses]

     def eval(self, inputs, labels=None):
         super(Model, self.model).eval()
@@ -576,15 +589,17 @@ class DynamicGraphAdapter(object):
                 samples = outputs[0].shape[0]
                 current_count = self._merge_count.get(self.mode + '_total', 0)
                 if current_count + samples >= total_size:
                     outputs = [o[:total_size - metric.count[0]] for o in outputs]
                     labels = [l[:total_size - metric.count[0]] for l in labels]
                     self._merge_count[self.mode + '_total'] = 0
                     self._merge_count[self.mode + '_batch'] = total_size - current_count
                 else:
                     self._merge_count[self.mode + '_total'] += samples
                     self._merge_count[self.mode + '_batch'] = samples

                 metric_outs = metric.add_metric_op(to_list(outputs), labels)
                 m = metric.update(*[to_numpy(m) for m in to_list(metric_outs)])
                 metrics.append(m)
@@ -592,7 +607,7 @@ class DynamicGraphAdapter(object):
         # To be consistent with static graph
         # return empty loss if loss_function is None
         return ([to_numpy(l) for l in losses], metrics) \
             if len(metrics) > 0 else [to_numpy(l) for l in losses]

     def test(self, inputs):
         super(Model, self.model).eval()
@@ -689,19 +704,18 @@ class Model(fluid.dygraph.Layer):
         # init multiple gpus context
         self._place = fluid.CUDAPlace(Env().dev_id) \
             if Env().nranks > 1 else fluid.CUDAPlace(0)

-        global _parallel_context_inited
-        if Env().nranks > 1 and not _parallel_context_inited:
+        global _parallel_context_initialized
+        if Env().nranks > 1 and not _parallel_context_initialized:
             if fluid.in_dygraph_mode():
                 fluid.disable_dygraph()
                 fluid.enable_dygraph(self._place)
                 fluid.dygraph.parallel.prepare_context()
             else:
-                fluid.enable_dygraph(self._place)
-                fluid.dygraph.parallel.prepare_context()
-                fluid.disable_dygraph()
-            _parallel_context_inited = True
+                prepare_distributed_context(self._place)
+            _parallel_context_initialized = True

         # init backend
         if fluid.in_dygraph_mode():
@@ -850,13 +864,14 @@ class Model(fluid.dygraph.Layer):
         metrics = metrics or []
         for metric in to_list(metrics):
             assert isinstance(metric, Metric), \
                 "{} is not sub class of Metric".format(metric.__class__.__name__)
         self._metrics = to_list(metrics)

         self._inputs = inputs
         self._labels = labels
         self._device = device
         if device is None:
             self._device = 'GPU' if fluid.is_compiled_with_cuda() else 'CPU'
         self._device_ids = device_ids
```
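As an aside, the dynamic-graph adapter's `train` and `eval` methods shown above return `[losses]` when no metrics are configured and `([losses], metrics)` otherwise. A hedged helper, not part of the commit, that a caller could use to normalize the two shapes:

```python
# Illustrative helper: normalize the adapter return value described above.
def split_result(result):
    if isinstance(result, tuple):
        losses, metrics = result
    else:
        losses, metrics = result, []
    return losses, metrics

print(split_result([0.73]))             # ([0.73], [])
print(split_result(([0.73], [0.91])))   # ([0.73], [0.91])
```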
model.py (continued):

```diff
@@ -873,6 +888,7 @@ class Model(fluid.dygraph.Layer):
             epochs=1,
             eval_freq=1,
             log_freq=10,
+            save_dir=None,
             save_freq=1,
             verbose=2,
             drop_last=False,
@@ -882,25 +898,32 @@ class Model(fluid.dygraph.Layer):
         """
         FIXME: add more comments and usage
         Args:
-            train_loader (DataLoader): an iterable data loader is used for train.
-            eval_loader (DataLoader): an iterable data loader is used for
+            train_dataset (Dataset): An instance of paddle.fluid.io.Dataset.
+            eval_dataset (Dataset): An instance of paddle.fluid.io.Dataset.
+            train_loader (DataLoader): An iterable data loader is used for train.
+            eval_loader (DataLoader): An iterable data loader is used for
                 evaluation at the end of epoch. If None, will not do evaluation.
-            epochs (int): number of epochs to train the model.
-            eval_freq (int): evaluation frequency in epoch.
-            log_freq (int): frequency to print log during training.
-            save_freq (int): frequency to save checkpoint during training.
-            verbose (int): verbosity mode, should be 0, 1, or 2.
+            epochs (int): Integer number. The number of epochs to train the model.
+            eval_freq (int): The frequency, in number of epochs, an evalutation
+                is performed.
+            log_freq (int): The frequency, in number of steps, the training logs
+                is printed.
+            save_dir(str|None): The directory to save checkpoint during training.
+                If None, will not save checkpoint.
+            save_freq (int): The frequency, in number of epochs, to save checkpoint.
+            verbose (int): The verbosity mode, should be 0, 1, or 2.
                 0 = silent, 1 = progress bar, 2 = one line per epoch.
-            callbacks (Callback|None): list of `Callback` instances to apply
-                during training.
+            callbacks (Callback|None): A list of `Callback` instances to apply
+                during training. If None, `ProgBarLogger` and `ModelCheckpoint`
+                are automatically inserted.
         """
         assert train_dataset is not None or train_loader is not None, \
             "train_dataset or train_loader must be given"
         assert (train_loader is not None and train_dataset is None) or \
             (train_loader is None and train_dataset is not None), \
             "train_dataset should not be set when train_loader is given"

         if fluid.in_dygraph_mode():
             feed_list = None
@@ -908,42 +931,48 @@ class Model(fluid.dygraph.Layer):
             feed_list = [x.forward() for x in self._inputs + self._labels]

         if train_loader is None:
-            train_sampler = DistributedBatchSampler(train_dataset,
-                                                    batch_size=batch_size,
-                                                    shuffle=shuffle,
-                                                    drop_last=drop_last)
-            train_loader = DataLoader(train_dataset,
-                                      batch_sampler=train_sampler,
-                                      places=self._place,
-                                      feed_list=feed_list,
-                                      num_workers=num_workers,
-                                      return_list=True)
+            train_sampler = DistributedBatchSampler(
+                train_dataset,
+                batch_size=batch_size,
+                shuffle=shuffle,
+                drop_last=drop_last)
+            train_loader = DataLoader(
+                train_dataset,
+                batch_sampler=train_sampler,
+                places=self._place,
+                feed_list=feed_list,
+                num_workers=num_workers,
+                return_list=True)

         if eval_loader is None and eval_dataset is not None:
-            eval_sampler = DistributedBatchSampler(eval_dataset,
-                                                   batch_size=batch_size)
-            eval_loader = DataLoader(eval_dataset,
-                                     batch_sampler=eval_sampler,
-                                     places=self._place,
-                                     feed_list=feed_list,
-                                     num_workers=num_workers,
-                                     return_list=True)
+            eval_sampler = DistributedBatchSampler(
+                eval_dataset, batch_size=batch_size)
+            eval_loader = DataLoader(
+                eval_dataset,
+                batch_sampler=eval_sampler,
+                places=self._place,
+                feed_list=feed_list,
+                num_workers=num_workers,
+                return_list=True)

         do_eval = eval_loader is not None
         self._test_dataloader = eval_loader
         metrics_name = self._metrics_name()
+        steps = len(train_loader) if hasattr(train_loader, '__len__') else None
         cbks = config_callbacks(
             callbacks,
             model=self,
             epochs=epochs,
-            steps=None,
+            steps=steps,
             log_freq=log_freq,
             save_freq=save_freq,
+            save_dir=save_dir,
             verbose=verbose,
             metrics=self._metrics_name(), )

         def _run_one_epoch(data_loader, callbacks, mode):
-            size = data_loader.size if hasattr(data_loader, 'size') else None
+            size = len(data_loader) if hasattr(data_loader, '__len__') else None
             logs = {
                 'steps': size,
                 'metrics_name': metrics_name,
@@ -969,7 +998,7 @@ class Model(fluid.dygraph.Layer):
                 for metric in self._metrics:
                     res = metric.accumulate()
                     metrics.extend(to_list(res))
                 assert len(metrics_name) == len(metrics)
                 for k, v in zip(metrics_name, metrics):
                     logs[k] = v
@@ -978,7 +1007,8 @@ class Model(fluid.dygraph.Layer):
                 if mode == 'train' or self._adapter._merge_count.get(mode + '_batch', 0) <= 0:
                     logs['batch_size'] = batch_size * Env().nranks
                 else:
                     logs['batch_size'] = self._adapter._merge_count[mode + '_batch']

                 cbks.on_batch_end(mode, step, logs)
             self._reset_metrics()
@@ -1000,7 +1030,7 @@ class Model(fluid.dygraph.Layer):
                 loader = eval_loader
                 if not isinstance(eval_loader, Iterable):
                     loader = eval_loader()
-                logs = _run_one_epoch(eval_loader(), cbks, 'eval')
+                logs = _run_one_epoch(eval_loader, cbks, 'eval')

                 cbks.on_end('eval', logs)

         cbks.on_end('train', logs)
```
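Putting the new `fit()` arguments together, here is a hedged usage sketch based on the updated docstring and the mnist example in this commit; `model`, `train_dataset`, and `val_dataset` are assumed to be prepared exactly as in mnist.py's `main()` rather than spelled out again, and the numeric values are illustrative.

```python
# Illustrative call only: mirrors mnist.py after this commit.
model.fit(train_dataset,
          val_dataset,
          epochs=10,                     # illustrative value
          batch_size=128,                # illustrative value
          save_dir='mnist_checkpoint',   # ModelCheckpoint target added here
          log_freq=10,                   # steps between progress-bar updates
          save_freq=1)                   # epochs between checkpoints
```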
tests/test_model.py (view file @ a501b8c8)

```diff
@@ -151,16 +151,18 @@ class TestModel(unittest.TestCase):
         train_dataset = MnistDataset(mode='train')
         val_dataset = MnistDataset(mode='test')

         model = MNIST() if not is_mlp else MLP()
         optim = fluid.optimizer.Momentum(
             learning_rate=0.01,
             momentum=.9,
             parameter_list=model.parameters())
         loss = CrossEntropy() if not is_mlp else MyCrossEntropy()
         model.prepare(optim, loss, Accuracy(), inputs, labels)
         cbk = ProgBarLogger(50)
         model.fit(train_dataset,
                   val_dataset,
                   epochs=2,
                   batch_size=batch_size,
                   callbacks=cbk)

     def test_fit_static(self):
         self.fit(False)
```
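To exercise the updated test module, one option (an assumption about the local environment, not part of the commit) is to let unittest discover it from the repository root; the multi-card code paths additionally need a GPU build of Paddle.

```python
# Hedged sketch: discover and run tests/test_model.py from the repo root.
import unittest

suite = unittest.defaultTestLoader.discover('tests', pattern='test_model.py')
unittest.TextTestRunner(verbosity=2).run(suite)
```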