PaddlePaddle / DeepSpeech
Commit 68bcc469, authored Jun 22, 2021 by Haoxin Ma
save best and test on tiny/s0
Parent: 6487ca60
Showing 3 changed files with 230 additions and 125 deletions (+230 −125)
deepspeech/training/trainer.py          +10  −4
deepspeech/utils/checkpoint.py          +216 −120
examples/tiny/s0/conf/deepspeech2.yaml  +4   −1
deepspeech/training/trainer.py
@@ -18,7 +18,7 @@ import paddle
 from paddle import distributed as dist
 from tensorboardX import SummaryWriter
 
-from deepspeech.utils import checkpoint
+from deepspeech.utils.checkpoint import KBestCheckpoint
 from deepspeech.utils import mp_tools
 from deepspeech.utils.log import Log

@@ -139,9 +139,12 @@ class Trainer():
             "epoch": self.epoch,
             "lr": self.optimizer.get_lr()
         })
-        checkpoint.save_parameters(self.checkpoint_dir, self.iteration
-                                   if tag is None else tag, self.model,
-                                   self.optimizer, infos)
+        self.checkpoint.add_checkpoint(self.checkpoint_dir, self.iteration
+                                       if tag is None else tag, self.model,
+                                       self.optimizer, infos)
+        # checkpoint.save_parameters(self.checkpoint_dir, self.iteration
+        #                            if tag is None else tag, self.model,
+        #                            self.optimizer, infos)
 
     def resume_or_scratch(self):
         """Resume from latest checkpoint at checkpoints in the output

@@ -151,7 +154,7 @@ class Trainer():
            resume training.
         """
         scratch = None
-        infos = checkpoint.load_parameters(
+        infos = self.checkpoint.load_parameters(
             self.model,
             self.optimizer,
             checkpoint_dir=self.checkpoint_dir,

@@ -180,7 +183,7 @@ class Trainer():
         from_scratch = self.resume_or_scratch()
         if from_scratch:
             # save init model, i.e. 0 epoch
-            self.save(tag='init')
+            self.save(tag='init', infos=None)
         self.lr_scheduler.step(self.iteration)
 
         if self.parallel:

@@ -263,6 +266,9 @@ class Trainer():
 
         self.checkpoint_dir = checkpoint_dir
 
+        self.checkpoint = KBestCheckpoint(
+            max_size=self.config.training.max_epoch,
+            last_size=self.config.training.last_epoch)
 
     @mp_tools.rank_zero_only
     def destory(self):
         """Close visualizer to avoid hanging after training"""
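
In short, the trainer now constructs a KBestCheckpoint manager from two new config keys and routes save() through it instead of calling the module-level checkpoint.save_parameters directly. A minimal sketch of that wiring, assuming the deepspeech package from this repo is importable; the SimpleNamespace config is a hypothetical stand-in for the real yacs config object:

    from types import SimpleNamespace

    from deepspeech.utils.checkpoint import KBestCheckpoint

    # Hypothetical stand-in for self.config.training in the real Trainer.
    training = SimpleNamespace(max_epoch=3, last_epoch=2)

    # Setup: one manager per run, sized from the config (see the yaml
    # change below for where the two values come from).
    checkpoint = KBestCheckpoint(max_size=training.max_epoch,
                                 last_size=training.last_epoch)

    # save(): the retention policy is delegated to the manager, e.g.
    #   self.checkpoint.add_checkpoint(self.checkpoint_dir,
    #                                  self.iteration if tag is None else tag,
    #                                  self.model, self.optimizer, infos)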
deepspeech/utils/checkpoint.py
@@ -23,130 +23,226 @@ from paddle.optimizer import Optimizer
 from deepspeech.utils import mp_tools
 from deepspeech.utils.log import Log
+import glob
 
 logger = Log(__name__).getlog()
 
 __all__ = ["load_parameters", "save_parameters"]
 
 
+class KBestCheckpoint(object):
+    def __init__(self,
+                 max_size: int=5,
+                 last_size: int=1):
+        self.best_records: Mapping[Path, float] = {}
+        self.last_records = []
+        self.max_size = max_size
+        self.last_size = last_size
+        self._save_all = (max_size == -1)
+
+    def should_save_best(self, metric: float) -> bool:
+        if not self.best_full():
+            return True
+
+        # already full
+        worst_record_path = max(self.best_records, key=self.best_records.get)
+        worst_metric = self.best_records[worst_record_path]
+        return metric < worst_metric
+
+    def best_full(self):
+        return (not self._save_all) and len(self.best_records) == self.max_size
+
+    def last_full(self):
+        return len(self.last_records) == self.last_size
+
+    def add_checkpoint(self, checkpoint_dir, tag_or_iteration,
+                       model, optimizer, infos):
+        if ("val_loss" not in infos.keys()):
+            self.save_parameters(checkpoint_dir, tag_or_iteration,
+                                 model, optimizer, infos)
+            return
+
+        # save best
+        if self.should_save_best(infos["val_loss"]):
+            self.save_checkpoint_and_update(infos["val_loss"], checkpoint_dir,
+                                            tag_or_iteration, model,
+                                            optimizer, infos)
+
+        # save last
+        self.save_last_checkpoint_and_update(checkpoint_dir, tag_or_iteration,
+                                             model, optimizer, infos)
+
+        if isinstance(tag_or_iteration, int):
+            self._save_record(checkpoint_dir, tag_or_iteration)
+
+    def save_checkpoint_and_update(self, metric, checkpoint_dir,
+                                   tag_or_iteration, model, optimizer, infos):
+        # remove the worst
+        if self.best_full():
+            worst_record_path = max(self.best_records,
+                                    key=self.best_records.get)
+            self.best_records.pop(worst_record_path)
+            if (worst_record_path not in self.last_records):
+                print('----to remove (best)----')
+                print(worst_record_path)
+                self.del_checkpoint(checkpoint_dir, worst_record_path)
+
+        # add the new one
+        self.save_parameters(checkpoint_dir, tag_or_iteration, model,
+                             optimizer, infos)
+        self.best_records[tag_or_iteration] = metric
+
+    def save_last_checkpoint_and_update(self, checkpoint_dir,
+                                        tag_or_iteration, model, optimizer,
+                                        infos):
+        # remove the old
+        if self.last_full():
+            to_del_fn = self.last_records.pop(0)
+            if (to_del_fn not in self.best_records.keys()):
+                print('----to remove (last)----')
+                print(to_del_fn)
+                self.del_checkpoint(checkpoint_dir, to_del_fn)
+        self.last_records.append(tag_or_iteration)
+
+        self.save_parameters(checkpoint_dir, tag_or_iteration, model,
+                             optimizer, infos)
+        # with open(os.path.join(checkpoint_dir, "checkpoint"), "w") as handle:
+        #     for iteration in self.best_records
+        #         handle.write("model_checkpoint_path:{}\n".format(iteration))
+
+    def del_checkpoint(self, checkpoint_dir, tag_or_iteration):
+        checkpoint_path = os.path.join(checkpoint_dir,
+                                       "{}".format(tag_or_iteration))
+        for filename in glob.glob(checkpoint_path + ".*"):
+            os.remove(filename)
+            print("delete file: " + filename)
+
+    def _load_latest_checkpoint(self, checkpoint_dir: str) -> int:
+        """Get the iteration number corresponding to the latest saved checkpoint.
+        Args:
+            checkpoint_dir (str): the directory where checkpoint is saved.
+        Returns:
+            int: the latest iteration number. -1 for no checkpoint to load.
+        """
+        checkpoint_record = os.path.join(checkpoint_dir, "checkpoint_last")
+        if not os.path.isfile(checkpoint_record):
+            return -1
+
+        # Fetch the latest checkpoint index.
+        with open(checkpoint_record, "rt") as handle:
+            latest_checkpoint = handle.readlines()[-1].strip()
+            iteration = int(latest_checkpoint.split(":")[-1])
+        return iteration
+
+    def _save_record(self, checkpoint_dir: str, iteration: int):
+        """Save the iteration number of the latest model to be checkpoint record.
+        Args:
+            checkpoint_dir (str): the directory where checkpoint is saved.
+            iteration (int): the latest iteration number.
+        Returns:
+            None
+        """
+        checkpoint_record_last = os.path.join(checkpoint_dir, "checkpoint_last")
+        checkpoint_record_best = os.path.join(checkpoint_dir, "checkpoint_best")
+        # Update the latest checkpoint index.
+        # with open(checkpoint_record, "a+") as handle:
+        #     handle.write("model_checkpoint_path:{}\n".format(iteration))
+        with open(checkpoint_record_best, "w") as handle:
+            for i in self.best_records.keys():
+                handle.write("model_checkpoint_path:{}\n".format(i))
+        with open(checkpoint_record_last, "w") as handle:
+            for i in self.last_records:
+                handle.write("model_checkpoint_path:{}\n".format(i))
+
+    def load_parameters(self,
+                        model,
+                        optimizer=None,
+                        checkpoint_dir=None,
+                        checkpoint_path=None):
+        """Load a specific model checkpoint from disk.
+        Args:
+            model (Layer): model to load parameters.
+            optimizer (Optimizer, optional): optimizer to load states if needed.
+                Defaults to None.
+            checkpoint_dir (str, optional): the directory where checkpoint is saved.
+            checkpoint_path (str, optional): if specified, load the checkpoint
+                stored in the checkpoint_path(prefix) and the argument 'checkpoint_dir' will
+                be ignored. Defaults to None.
+        Returns:
+            configs (dict): epoch or step, lr and other meta info should be saved.
+        """
+        configs = {}
+
+        if checkpoint_path is not None:
+            tag = os.path.basename(checkpoint_path).split(":")[-1]
+        elif checkpoint_dir is not None:
+            iteration = self._load_latest_checkpoint(checkpoint_dir)
+            if iteration == -1:
+                return configs
+            checkpoint_path = os.path.join(checkpoint_dir,
+                                           "{}".format(iteration))
+        else:
+            raise ValueError(
+                "At least one of 'checkpoint_dir' and 'checkpoint_path' should be specified!"
+            )
+
+        rank = dist.get_rank()
+
+        params_path = checkpoint_path + ".pdparams"
+        model_dict = paddle.load(params_path)
+        model.set_state_dict(model_dict)
+        logger.info("Rank {}: loaded model from {}".format(rank, params_path))
+
+        optimizer_path = checkpoint_path + ".pdopt"
+        if optimizer and os.path.isfile(optimizer_path):
+            optimizer_dict = paddle.load(optimizer_path)
+            optimizer.set_state_dict(optimizer_dict)
+            logger.info("Rank {}: loaded optimizer state from {}".format(
+                rank, optimizer_path))
+
+        info_path = re.sub('.pdparams$', '.json', params_path)
+        if os.path.exists(info_path):
+            with open(info_path, 'r') as fin:
+                configs = json.load(fin)
+        return configs
+
+    @mp_tools.rank_zero_only
+    def save_parameters(self,
+                        checkpoint_dir: str,
+                        tag_or_iteration: Union[int, str],
+                        model: paddle.nn.Layer,
+                        optimizer: Optimizer=None,
+                        infos: dict=None):
+        """Checkpoint the latest trained model parameters.
+        Args:
+            checkpoint_dir (str): the directory where checkpoint is saved.
+            tag_or_iteration (int or str): the latest iteration(step or epoch) number.
+            model (Layer): model to be checkpointed.
+            optimizer (Optimizer, optional): optimizer to be checkpointed.
+                Defaults to None.
+            infos (dict or None): any info you want to save.
+        Returns:
+            None
+        """
+        checkpoint_path = os.path.join(checkpoint_dir,
+                                       "{}".format(tag_or_iteration))
+
+        model_dict = model.state_dict()
+        params_path = checkpoint_path + ".pdparams"
+        paddle.save(model_dict, params_path)
+        logger.info("Saved model to {}".format(params_path))
+
+        if optimizer:
+            opt_dict = optimizer.state_dict()
+            optimizer_path = checkpoint_path + ".pdopt"
+            paddle.save(opt_dict, optimizer_path)
+            logger.info("Saved optimzier state to {}".format(optimizer_path))
+
+        infos = {} if infos is None else infos
+        info_path = re.sub('.pdparams$', '.json', params_path)
+        with open(info_path, 'w') as fout:
+            data = json.dumps(infos)
+            fout.write(data)
-def _load_latest_checkpoint(checkpoint_dir: str) -> int:
-    """Get the iteration number corresponding to the latest saved checkpoint.
-    Args:
-        checkpoint_dir (str): the directory where checkpoint is saved.
-    Returns:
-        int: the latest iteration number. -1 for no checkpoint to load.
-    """
-    checkpoint_record = os.path.join(checkpoint_dir, "checkpoint")
-    if not os.path.isfile(checkpoint_record):
-        return -1
-
-    # Fetch the latest checkpoint index.
-    with open(checkpoint_record, "rt") as handle:
-        latest_checkpoint = handle.readlines()[-1].strip()
-        iteration = int(latest_checkpoint.split(":")[-1])
-    return iteration
-
-def _save_record(checkpoint_dir: str, iteration: int):
-    """Save the iteration number of the latest model to be checkpoint record.
-    Args:
-        checkpoint_dir (str): the directory where checkpoint is saved.
-        iteration (int): the latest iteration number.
-    Returns:
-        None
-    """
-    checkpoint_record = os.path.join(checkpoint_dir, "checkpoint")
-    # Update the latest checkpoint index.
-    with open(checkpoint_record, "a+") as handle:
-        handle.write("model_checkpoint_path:{}\n".format(iteration))
-
-def load_parameters(model,
-                    optimizer=None,
-                    checkpoint_dir=None,
-                    checkpoint_path=None):
-    """Load a specific model checkpoint from disk.
-    Args:
-        model (Layer): model to load parameters.
-        optimizer (Optimizer, optional): optimizer to load states if needed.
-            Defaults to None.
-        checkpoint_dir (str, optional): the directory where checkpoint is saved.
-        checkpoint_path (str, optional): if specified, load the checkpoint
-            stored in the checkpoint_path(prefix) and the argument 'checkpoint_dir' will
-            be ignored. Defaults to None.
-    Returns:
-        configs (dict): epoch or step, lr and other meta info should be saved.
-    """
-    configs = {}
-
-    if checkpoint_path is not None:
-        tag = os.path.basename(checkpoint_path).split(":")[-1]
-    elif checkpoint_dir is not None:
-        iteration = _load_latest_checkpoint(checkpoint_dir)
-        if iteration == -1:
-            return configs
-        checkpoint_path = os.path.join(checkpoint_dir, "{}".format(iteration))
-    else:
-        raise ValueError(
-            "At least one of 'checkpoint_dir' and 'checkpoint_path' should be specified!"
-        )
-
-    rank = dist.get_rank()
-
-    params_path = checkpoint_path + ".pdparams"
-    model_dict = paddle.load(params_path)
-    model.set_state_dict(model_dict)
-    logger.info("Rank {}: loaded model from {}".format(rank, params_path))
-
-    optimizer_path = checkpoint_path + ".pdopt"
-    if optimizer and os.path.isfile(optimizer_path):
-        optimizer_dict = paddle.load(optimizer_path)
-        optimizer.set_state_dict(optimizer_dict)
-        logger.info("Rank {}: loaded optimizer state from {}".format(
-            rank, optimizer_path))
-
-    info_path = re.sub('.pdparams$', '.json', params_path)
-    if os.path.exists(info_path):
-        with open(info_path, 'r') as fin:
-            configs = json.load(fin)
-    return configs
-
-@mp_tools.rank_zero_only
-def save_parameters(checkpoint_dir: str,
-                    tag_or_iteration: Union[int, str],
-                    model: paddle.nn.Layer,
-                    optimizer: Optimizer=None,
-                    infos: dict=None):
-    """Checkpoint the latest trained model parameters.
-    Args:
-        checkpoint_dir (str): the directory where checkpoint is saved.
-        tag_or_iteration (int or str): the latest iteration(step or epoch) number.
-        model (Layer): model to be checkpointed.
-        optimizer (Optimizer, optional): optimizer to be checkpointed.
-            Defaults to None.
-        infos (dict or None): any info you want to save.
-    Returns:
-        None
-    """
-    checkpoint_path = os.path.join(checkpoint_dir,
-                                   "{}".format(tag_or_iteration))
-
-    model_dict = model.state_dict()
-    params_path = checkpoint_path + ".pdparams"
-    paddle.save(model_dict, params_path)
-    logger.info("Saved model to {}".format(params_path))
-
-    if optimizer:
-        opt_dict = optimizer.state_dict()
-        optimizer_path = checkpoint_path + ".pdopt"
-        paddle.save(opt_dict, optimizer_path)
-        logger.info("Saved optimzier state to {}".format(optimizer_path))
-
-    infos = {} if infos is None else infos
-    info_path = re.sub('.pdparams$', '.json', params_path)
-    with open(info_path, 'w') as fout:
-        data = json.dumps(infos)
-        fout.write(data)
-
-    if isinstance(tag_or_iteration, int):
-        _save_record(checkpoint_dir, tag_or_iteration)
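
A minimal usage sketch of the new manager, assuming paddlepaddle and this repo's deepspeech package are importable; the directory, model, and loss values are hypothetical stand-ins. It drives add_checkpoint the way Trainer.save now does, with the validation metric under the "val_loss" key:

    import os

    import paddle

    from deepspeech.utils.checkpoint import KBestCheckpoint

    os.makedirs("exp/checkpoints", exist_ok=True)  # hypothetical directory
    model = paddle.nn.Linear(4, 2)                 # stand-in model
    optimizer = paddle.optimizer.Adam(parameters=model.parameters())

    # Keep the 3 best snapshots by val_loss plus the 2 most recent ones.
    ckpt = KBestCheckpoint(max_size=3, last_size=2)
    for epoch, val_loss in enumerate([1.2, 0.9, 1.1, 0.8, 1.0]):
        ckpt.add_checkpoint("exp/checkpoints", epoch, model, optimizer,
                            {"val_loss": val_loss, "epoch": epoch})

    # exp/checkpoints/checkpoint_best and exp/checkpoints/checkpoint_last now
    # list the retained tags; evicted snapshots were removed via del_checkpoint.

Note that snapshots saved without a "val_loss" entry in infos (such as the tag='init' save in trainer.py) bypass the best/last bookkeeping entirely: add_checkpoint falls back to a plain save_parameters call, so they are never evicted.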
examples/tiny/s0/conf/deepspeech2.yaml
@@ -43,12 +43,15 @@ model:
   share_rnn_weights: True
 
 training:
-  n_epoch: 24
+  n_epoch: 6
   lr: 1e-5
   lr_decay: 1.0
   weight_decay: 1e-06
   global_grad_clip: 5.0
   log_interval: 1
+  max_epoch: 3
+  last_epoch: 2
+
 decoding:
   batch_size: 128
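
These two keys feed straight into the manager built in trainer.py above: max_epoch bounds the best-by-val_loss set and last_epoch bounds the most-recent set. Since the two sets may overlap, a hedged reading of the eviction logic is that at most max_epoch + last_epoch distinct snapshot tags survive a run:

    # Values copied from this config; the bound is inferred from
    # KBestCheckpoint's eviction logic above, not stated by the yaml itself.
    n_epoch, max_epoch, last_epoch = 6, 3, 2
    print(min(n_epoch, max_epoch + last_epoch))  # at most 5 of 6 epoch tags remain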