Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
ecb5d4f8
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 2 年 前同步成功
通知
210
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
ecb5d4f8
编写于
9月 22, 2021
作者:
H
Hui Zhang
提交者:
GitHub
9月 22, 2021
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #847 from PaddlePaddle/resume_train
resume train with epoch and iteration increase
上级
ad761b55
bab29b94
变更
4
隐藏空白更改
内联
并排
Showing
4 changed file
with
37 addition
and
42 deletion
+37
-42
deepspeech/exps/u2/model.py
deepspeech/exps/u2/model.py
+3
-11
deepspeech/exps/u2_kaldi/model.py
deepspeech/exps/u2_kaldi/model.py
+1
-5
deepspeech/exps/u2_st/model.py
deepspeech/exps/u2_st/model.py
+1
-8
deepspeech/training/trainer.py
deepspeech/training/trainer.py
+32
-18
未找到文件。
deepspeech/exps/u2/model.py
浏览文件 @
ecb5d4f8
...
@@ -183,15 +183,7 @@ class U2Trainer(Trainer):
...
@@ -183,15 +183,7 @@ class U2Trainer(Trainer):
# script_model_path = str(self.checkpoint_dir / 'init')
# script_model_path = str(self.checkpoint_dir / 'init')
# paddle.jit.save(script_model, script_model_path)
# paddle.jit.save(script_model, script_model_path)
from_scratch
=
self
.
resume_or_scratch
()
self
.
before_train
()
if
from_scratch
:
# save init model, i.e. 0 epoch
self
.
save
(
tag
=
'init'
,
infos
=
None
)
# lr will resotre from optimizer ckpt
# self.lr_scheduler.step(self.iteration)
if
self
.
parallel
and
hasattr
(
self
.
train_loader
,
'batch_sampler'
):
self
.
train_loader
.
batch_sampler
.
set_epoch
(
self
.
epoch
)
logger
.
info
(
f
"Train Total Examples:
{
len
(
self
.
train_loader
.
dataset
)
}
"
)
logger
.
info
(
f
"Train Total Examples:
{
len
(
self
.
train_loader
.
dataset
)
}
"
)
while
self
.
epoch
<
self
.
config
.
training
.
n_epoch
:
while
self
.
epoch
<
self
.
config
.
training
.
n_epoch
:
...
@@ -207,8 +199,8 @@ class U2Trainer(Trainer):
...
@@ -207,8 +199,8 @@ class U2Trainer(Trainer):
report
(
"Rank"
,
dist
.
get_rank
())
report
(
"Rank"
,
dist
.
get_rank
())
report
(
"epoch"
,
self
.
epoch
)
report
(
"epoch"
,
self
.
epoch
)
report
(
'step'
,
self
.
iteration
)
report
(
'step'
,
self
.
iteration
)
report
(
'
step/total'
,
report
(
'
iter'
,
batch_index
+
1
)
(
batch_index
+
1
)
/
len
(
self
.
train_loader
))
report
(
'total'
,
len
(
self
.
train_loader
))
report
(
"lr"
,
self
.
lr_scheduler
())
report
(
"lr"
,
self
.
lr_scheduler
())
self
.
train_batch
(
batch_index
,
batch
,
msg
)
self
.
train_batch
(
batch_index
,
batch
,
msg
)
self
.
after_train_batch
()
self
.
after_train_batch
()
...
...
deepspeech/exps/u2_kaldi/model.py
浏览文件 @
ecb5d4f8
...
@@ -184,11 +184,7 @@ class U2Trainer(Trainer):
...
@@ -184,11 +184,7 @@ class U2Trainer(Trainer):
# script_model_path = str(self.checkpoint_dir / 'init')
# script_model_path = str(self.checkpoint_dir / 'init')
# paddle.jit.save(script_model, script_model_path)
# paddle.jit.save(script_model, script_model_path)
from_scratch
=
self
.
resume_or_scratch
()
self
.
before_train
()
if
from_scratch
:
# save init model, i.e. 0 epoch
self
.
save
(
tag
=
'init'
)
self
.
lr_scheduler
.
step
(
self
.
iteration
)
logger
.
info
(
f
"Train Total Examples:
{
len
(
self
.
train_loader
.
dataset
)
}
"
)
logger
.
info
(
f
"Train Total Examples:
{
len
(
self
.
train_loader
.
dataset
)
}
"
)
while
self
.
epoch
<
self
.
config
.
training
.
n_epoch
:
while
self
.
epoch
<
self
.
config
.
training
.
n_epoch
:
...
...
deepspeech/exps/u2_st/model.py
浏览文件 @
ecb5d4f8
...
@@ -198,14 +198,7 @@ class U2STTrainer(Trainer):
...
@@ -198,14 +198,7 @@ class U2STTrainer(Trainer):
# script_model_path = str(self.checkpoint_dir / 'init')
# script_model_path = str(self.checkpoint_dir / 'init')
# paddle.jit.save(script_model, script_model_path)
# paddle.jit.save(script_model, script_model_path)
from_scratch
=
self
.
resume_or_scratch
()
self
.
before_train
()
if
from_scratch
:
# save init model, i.e. 0 epoch
self
.
save
(
tag
=
'init'
)
self
.
lr_scheduler
.
step
(
self
.
iteration
)
if
self
.
parallel
:
self
.
train_loader
.
batch_sampler
.
set_epoch
(
self
.
epoch
)
logger
.
info
(
f
"Train Total Examples:
{
len
(
self
.
train_loader
.
dataset
)
}
"
)
logger
.
info
(
f
"Train Total Examples:
{
len
(
self
.
train_loader
.
dataset
)
}
"
)
while
self
.
epoch
<
self
.
config
.
training
.
n_epoch
:
while
self
.
epoch
<
self
.
config
.
training
.
n_epoch
:
...
...
deepspeech/training/trainer.py
浏览文件 @
ecb5d4f8
...
@@ -179,25 +179,47 @@ class Trainer():
...
@@ -179,25 +179,47 @@ class Trainer():
checkpoint_dir
=
self
.
checkpoint_dir
,
checkpoint_dir
=
self
.
checkpoint_dir
,
checkpoint_path
=
self
.
args
.
checkpoint_path
)
checkpoint_path
=
self
.
args
.
checkpoint_path
)
if
infos
:
if
infos
:
# restore from ckpt
# just restore ckpt
# lr will resotre from optimizer ckpt
self
.
iteration
=
infos
[
"step"
]
self
.
iteration
=
infos
[
"step"
]
self
.
epoch
=
infos
[
"epoch"
]
self
.
epoch
=
infos
[
"epoch"
]
scratch
=
False
scratch
=
False
logger
.
info
(
f
"Restore ckpt: epoch
{
self
.
epoch
}
, step
{
self
.
iteration
}
!"
)
else
:
else
:
self
.
iteration
=
0
self
.
iteration
=
0
self
.
epoch
=
0
self
.
epoch
=
0
scratch
=
True
scratch
=
True
logger
.
info
(
"Restore/Init checkpoint
!"
)
logger
.
info
(
"Init from scratch
!"
)
return
scratch
return
scratch
def
maybe_batch_sampler_step
(
self
):
""" batch_sampler seed by epoch """
if
hasattr
(
self
.
train_loader
,
"batch_sampler"
):
batch_sampler
=
self
.
train_loader
.
batch_sampler
if
isinstance
(
batch_sampler
,
paddle
.
io
.
DistributedBatchSampler
):
batch_sampler
.
set_epoch
(
self
.
epoch
)
def
before_train
(
self
):
from_scratch
=
self
.
resume_or_scratch
()
if
from_scratch
:
# scratch: save init model, i.e. 0 epoch
self
.
save
(
tag
=
'init'
,
infos
=
None
)
else
:
# resume: train next_epoch and next_iteration
self
.
epoch
+=
1
self
.
iteration
+=
1
logger
.
info
(
f
"Resume train: epoch
{
self
.
epoch
}
, step
{
self
.
iteration
}
!"
)
self
.
maybe_batch_sampler_step
()
def
new_epoch
(
self
):
def
new_epoch
(
self
):
"""Reset the train loader seed and increment `epoch`.
"""Reset the train loader seed and increment `epoch`.
"""
"""
# `iteration` increased by train step
self
.
epoch
+=
1
self
.
epoch
+=
1
if
self
.
parallel
and
hasattr
(
self
.
train_loader
,
"batch_sampler"
):
self
.
maybe_batch_sampler_step
()
batch_sampler
=
self
.
train_loader
.
batch_sampler
if
isinstance
(
batch_sampler
,
paddle
.
io
.
DistributedBatchSampler
):
batch_sampler
.
set_epoch
(
self
.
epoch
)
def
after_train_batch
(
self
):
def
after_train_batch
(
self
):
if
self
.
args
.
benchmark_max_step
and
self
.
iteration
>
self
.
args
.
benchmark_max_step
:
if
self
.
args
.
benchmark_max_step
and
self
.
iteration
>
self
.
args
.
benchmark_max_step
:
...
@@ -209,15 +231,7 @@ class Trainer():
...
@@ -209,15 +231,7 @@ class Trainer():
def
train
(
self
):
def
train
(
self
):
"""The training process control by epoch."""
"""The training process control by epoch."""
from_scratch
=
self
.
resume_or_scratch
()
self
.
before_train
()
if
from_scratch
:
# save init model, i.e. 0 epoch
self
.
save
(
tag
=
'init'
,
infos
=
None
)
# lr will resotre from optimizer ckpt
# self.lr_scheduler.step(self.epoch)
if
self
.
parallel
and
hasattr
(
self
.
train_loader
,
"batch_sampler"
):
self
.
train_loader
.
batch_sampler
.
set_epoch
(
self
.
epoch
)
logger
.
info
(
f
"Train Total Examples:
{
len
(
self
.
train_loader
.
dataset
)
}
"
)
logger
.
info
(
f
"Train Total Examples:
{
len
(
self
.
train_loader
.
dataset
)
}
"
)
while
self
.
epoch
<
self
.
config
.
training
.
n_epoch
:
while
self
.
epoch
<
self
.
config
.
training
.
n_epoch
:
...
@@ -233,8 +247,8 @@ class Trainer():
...
@@ -233,8 +247,8 @@ class Trainer():
report
(
"Rank"
,
dist
.
get_rank
())
report
(
"Rank"
,
dist
.
get_rank
())
report
(
"epoch"
,
self
.
epoch
)
report
(
"epoch"
,
self
.
epoch
)
report
(
'step'
,
self
.
iteration
)
report
(
'step'
,
self
.
iteration
)
report
(
'
step/total'
,
report
(
'
iter'
,
batch_index
+
1
)
(
batch_index
+
1
)
/
len
(
self
.
train_loader
))
report
(
'total'
,
len
(
self
.
train_loader
))
report
(
"lr"
,
self
.
lr_scheduler
())
report
(
"lr"
,
self
.
lr_scheduler
())
self
.
train_batch
(
batch_index
,
batch
,
msg
)
self
.
train_batch
(
batch_index
,
batch
,
msg
)
self
.
after_train_batch
()
self
.
after_train_batch
()
...
@@ -275,6 +289,7 @@ class Trainer():
...
@@ -275,6 +289,7 @@ class Trainer():
'epoch'
,
{
'cv_loss'
:
cv_loss
,
'epoch'
,
{
'cv_loss'
:
cv_loss
,
'lr'
:
self
.
lr_scheduler
()},
self
.
epoch
)
'lr'
:
self
.
lr_scheduler
()},
self
.
epoch
)
# after epoch
self
.
save
(
tag
=
self
.
epoch
,
infos
=
{
'val_loss'
:
cv_loss
})
self
.
save
(
tag
=
self
.
epoch
,
infos
=
{
'val_loss'
:
cv_loss
})
# step lr every epoch
# step lr every epoch
self
.
lr_scheduler
.
step
()
self
.
lr_scheduler
.
step
()
...
@@ -288,7 +303,6 @@ class Trainer():
...
@@ -288,7 +303,6 @@ class Trainer():
try
:
try
:
self
.
train
()
self
.
train
()
except
KeyboardInterrupt
:
except
KeyboardInterrupt
:
self
.
save
()
exit
(
-
1
)
exit
(
-
1
)
finally
:
finally
:
self
.
destory
()
self
.
destory
()
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录