PaddlePaddle / DeepSpeech

Commit e411e0bd
Authored Oct 05, 2021 by Hui Zhang

    tiny/s0/s1 can run all

Parent: d40f2092

Showing 40 changed files with 1047 additions and 468 deletions (+1047 -468)
deepspeech/exps/deepspeech2/bin/deploy/runtime.py         +16   -9
deepspeech/exps/deepspeech2/bin/deploy/server.py          +20   -9
deepspeech/exps/deepspeech2/model.py                      +328  -104
deepspeech/exps/u2/bin/alignment.py                       +3    -0
deepspeech/exps/u2/bin/export.py                          +3    -0
deepspeech/exps/u2/bin/test.py                            +3    -0
deepspeech/exps/u2/bin/train.py                           +2    -0
deepspeech/exps/u2/model.py                               +45   -45
deepspeech/frontend/featurizer/text_featurizer.py         +57   -47
deepspeech/frontend/utility.py                            +116  -31
deepspeech/io/collator.py                                 +9    -3
deepspeech/io/dataset.py                                  +2    -1
deepspeech/models/ds2/conv.py                             +7    -7
deepspeech/models/ds2/deepspeech2.py                      +7    -21
deepspeech/models/ds2/rnn.py                              +6    -6
deepspeech/models/ds2_online/deepspeech2.py               +41   -55
deepspeech/utils/log.py                                   +3    -3
examples/dataset/mini_librispeech/.gitignore              +1    -0
examples/dataset/mini_librispeech/mini_librispeech.py     +21   -0
examples/librispeech/s1/local/align.sh                    +32   -0
examples/librispeech/s1/local/data.sh                     +1    -1
examples/librispeech/s1/local/download_lm_en.sh           +1    -1
examples/librispeech/s1/local/export.sh                   +1    -7
examples/librispeech/s1/local/test.sh                     +67   -39
examples/librispeech/s1/local/train.sh                    +13   -2
examples/tiny/s0/conf/deepspeech2.yaml                    +3    -0
examples/tiny/s0/conf/deepspeech2_online.yaml             +72   -0
examples/tiny/s0/local/download_lm_en.sh                  +6    -1
examples/tiny/s0/local/export.sh                          +6    -11
examples/tiny/s0/local/test.sh                            +7    -10
examples/tiny/s0/local/train.sh                           +26   -12
examples/tiny/s0/path.sh                                  +1    -1
examples/tiny/s0/run.sh                                   +6    -5
examples/tiny/s1/conf/transformer.yaml                    +2    -0
examples/tiny/s1/local/align.sh                           +32   -0
examples/tiny/s1/local/data.sh                            +1    -1
examples/tiny/s1/local/export.sh                          +1    -7
examples/tiny/s1/local/test.sh                            +42   -15
examples/tiny/s1/local/train.sh                           +28   -11
examples/tiny/s1/run.sh                                   +9    -3
deepspeech/exps/deepspeech2/bin/deploy/runtime.py

@@ -18,8 +18,10 @@ import numpy as np
 import paddle
 from paddle.inference import Config
 from paddle.inference import create_predictor
+from paddle.io import DataLoader

 from deepspeech.exps.deepspeech2.config import get_cfg_defaults
+from deepspeech.io.collator import SpeechCollator
 from deepspeech.io.dataset import ManifestDataset
 from deepspeech.models.ds2 import DeepSpeech2Model
 from deepspeech.training.cli import default_argument_parser

@@ -78,26 +80,31 @@ def inference(config, args):

 def start_server(config, args):
     """Start the ASR server"""
     config.defrost()
-    config.data.manfiest = config.data.test_manifest
-    config.data.augmentation_config = ""
-    config.data.keep_transcription_text = True
+    config.data.manifest = config.data.test_manifest
     dataset = ManifestDataset.from_config(config)
-    model = DeepSpeech2Model.from_pretrained(dataset, config,
+
+    config.collator.augmentation_config = ""
+    config.collator.keep_transcription_text = True
+    config.collator.batch_size = 1
+    config.collator.num_workers = 0
+    collate_fn = SpeechCollator.from_config(config)
+    test_loader = DataLoader(dataset, collate_fn=collate_fn, num_workers=0)
+
+    model = DeepSpeech2Model.from_pretrained(test_loader, config,
                                              args.checkpoint_path)
     model.eval()

     # prepare ASR inference handler
     def file_to_transcript(filename):
-        feature = dataset.process_utterance(filename, "")
-        audio = np.array([feature[0]]).astype('float32')  #[1, D, T]
-        audio_len = feature[0].shape[1]
+        feature = test_loader.collate_fn.process_utterance(filename, "")
+        audio = np.array([feature[0]]).astype('float32')  #[1, T, D]
+        audio_len = feature[0].shape[0]
         audio_len = np.array([audio_len]).astype('int64')  # [1]

         result_transcript = model.decode(
             paddle.to_tensor(audio),
             paddle.to_tensor(audio_len),
-            vocab_list=dataset.vocab_list,
+            vocab_list=test_loader.collate_fn.vocab_list,
             decoding_method=config.decoding.decoding_method,
             lang_model_path=config.decoding.lang_model_path,
             beam_alpha=config.decoding.alpha,

@@ -138,7 +145,7 @@ if __name__ == "__main__":
     add_arg('host_ip',          str,
             'localhost',
             "Server's IP address.")
-    add_arg('host_port',        int,    8086,   "Server's IP port.")
+    add_arg('host_port',        int,    8089,   "Server's IP port.")
     add_arg('speech_save_dir',  str,
             'demo_cache',
             "Directory to save demo audios.")
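Note: this same refactor recurs through the commit: per-utterance preprocessing and the vocabulary move from ManifestDataset onto the collator, and from_pretrained now takes the DataLoader rather than the dataset. A minimal sketch of the new test-time wiring, using only names that appear in this diff (an illustration, not the exact file contents):

    from paddle.io import DataLoader

    from deepspeech.io.collator import SpeechCollator
    from deepspeech.io.dataset import ManifestDataset


    def build_test_loader(config):
        config.defrost()
        config.data.manifest = config.data.test_manifest  # point data at the test set
        dataset = ManifestDataset.from_config(config)

        # Preprocessing options now live under config.collator, not config.data.
        config.collator.augmentation_config = ""        # no augmentation at test time
        config.collator.keep_transcription_text = True  # raw strings, not token ids
        config.collator.batch_size = 1
        config.collator.num_workers = 0
        collate_fn = SpeechCollator.from_config(config)

        # The loader is what from_pretrained now receives; the model reads the
        # vocabulary via loader.collate_fn.vocab_list.
        return DataLoader(dataset, collate_fn=collate_fn, num_workers=0)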
deepspeech/exps/deepspeech2/bin/deploy/server.py

@@ -16,8 +16,10 @@ import functools
 import numpy as np
 import paddle
+from paddle.io import DataLoader

 from deepspeech.exps.deepspeech2.config import get_cfg_defaults
+from deepspeech.io.collator import SpeechCollator
 from deepspeech.io.dataset import ManifestDataset
 from deepspeech.models.ds2 import DeepSpeech2Model
 from deepspeech.training.cli import default_argument_parser

@@ -31,26 +33,35 @@ from deepspeech.utils.utility import print_arguments

 def start_server(config, args):
     """Start the ASR server"""
     config.defrost()
-    config.data.manfiest = config.data.test_manifest
-    config.data.augmentation_config = ""
-    config.data.keep_transcription_text = True
+    config.data.manifest = config.data.test_manifest
     dataset = ManifestDataset.from_config(config)
-    model = DeepSpeech2Model.from_pretrained(dataset, config,
+
+    config.collator.augmentation_config = ""
+    config.collator.keep_transcription_text = True
+    config.collator.batch_size = 1
+    config.collator.num_workers = 0
+    collate_fn = SpeechCollator.from_config(config)
+    test_loader = DataLoader(dataset, collate_fn=collate_fn, num_workers=0)
+
+    model = DeepSpeech2Model.from_pretrained(test_loader, config,
                                              args.checkpoint_path)
     model.eval()

     # prepare ASR inference handler
     def file_to_transcript(filename):
-        feature = dataset.process_utterance(filename, "")
-        audio = np.array([feature[0]]).astype('float32')  #[1, D, T]
-        audio_len = feature[0].shape[1]
+        feature = test_loader.collate_fn.process_utterance(filename, "")
+        audio = np.array([feature[0]]).astype('float32')  #[1, T, D]
+        # audio = audio.swapaxes(1,2)
+        print('---file_to_transcript feature----')
+        print(audio.shape)
+        audio_len = feature[0].shape[0]
+        print(audio_len)
         audio_len = np.array([audio_len]).astype('int64')  # [1]

         result_transcript = model.decode(
             paddle.to_tensor(audio),
             paddle.to_tensor(audio_len),
-            vocab_list=dataset.vocab_list,
+            vocab_list=test_loader.collate_fn.vocab_list,
             decoding_method=config.decoding.decoding_method,
             lang_model_path=config.decoding.lang_model_path,
             beam_alpha=config.decoding.alpha,

@@ -91,7 +102,7 @@ if __name__ == "__main__":
     add_arg('host_ip',          str,
             'localhost',
             "Server's IP address.")
-    add_arg('host_port',        int,    8086,   "Server's IP port.")
+    add_arg('host_port',        int,    8088,   "Server's IP port.")
     add_arg('speech_save_dir',  str,
             'demo_cache',
             "Directory to save demo audios.")
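Note: both deploy scripts also flip the feature layout comment from #[1, D, T] to #[1, T, D] and take the length from shape[0] instead of shape[1], i.e. features are now time-major. A self-contained illustration (the 200x161 utterance is hypothetical):

    import numpy as np

    feat = np.random.rand(200, 161).astype('float32')  # hypothetical [T, D] feature

    audio = np.array([feat])                                # [1, T, D] (was [1, D, T])
    audio_len = np.array([feat.shape[0]]).astype('int64')   # frames come from axis 0 now
    assert audio.shape == (1, 200, 161) and audio_len[0] == 200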
deepspeech/exps/deepspeech2/model.py

@@ -12,26 +12,37 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Contains DeepSpeech2 model."""
+import os
 import time
 from collections import defaultdict
+from contextlib import nullcontext
 from pathlib import Path
+from typing import Optional

+import jsonlines
 import numpy as np
 import paddle
 from paddle import distributed as dist
+from paddle import inference
 from paddle.io import DataLoader
+from yacs.config import CfgNode

+from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer
 from deepspeech.io.collator import SpeechCollator
 from deepspeech.io.dataset import ManifestDataset
 from deepspeech.io.sampler import SortagradBatchSampler
 from deepspeech.io.sampler import SortagradDistributedBatchSampler
 from deepspeech.models.ds2 import DeepSpeech2InferModel
 from deepspeech.models.ds2 import DeepSpeech2Model
+from deepspeech.models.ds2_online import DeepSpeech2InferModelOnline
+from deepspeech.models.ds2_online import DeepSpeech2ModelOnline
 from deepspeech.training.gradclip import ClipGradByGlobalNormWithLog
+from deepspeech.training.reporter import report
 from deepspeech.training.trainer import Trainer
 from deepspeech.utils import error_rate
 from deepspeech.utils import layer_tools
 from deepspeech.utils import mp_tools
+from deepspeech.utils.log import Autolog
 from deepspeech.utils.log import Log
 from deepspeech.utils.utility import UpdateConfig

@@ -42,9 +53,9 @@ class DeepSpeech2Trainer(Trainer):
     def __init__(self, config, args):
         super().__init__(config, args)

-    def train_batch(self, batch_index, batch_data, msg):
+    def train_batch(self, batch_index, batch, msg):
         start = time.time()
-        loss = self.model(*batch_data)
+        loss = self.model(*batch)
         loss.backward()
         layer_tools.print_grads(self.model, print_func=None)
         self.optimizer.step()

@@ -176,7 +187,7 @@ class DeepSpeech2Trainer(Trainer):
             sortagrad=config.data.sortagrad,
             shuffle_method=config.data.shuffle_method)

-        collate_fn = SpeechCollator(keep_transcription_text=False)
+        collate_fn = SpeechCollator(keep_transcription_text=False, return_utts=False)
         self.train_loader = DataLoader(
             train_dataset,
             batch_sampler=batch_sampler,

@@ -190,10 +201,55 @@ class DeepSpeech2Trainer(Trainer):
             collate_fn=collate_fn)
         logger.info("Setup train/valid Dataloader!")

+        config.data.manifest = config.data.test_manifest
+        config.data.keep_transcription_text = True
+        config.data.augmentation_config = ""
+        # filter test examples, will cause less examples, but no mismatch with training
+        # and can use large batch size , save training time, so filter test egs now.
+        # config.data.min_input_len = 0.0  # second
+        # config.data.max_input_len = float('inf')  # second
+        # config.data.min_output_len = 0.0  # tokens
+        # config.data.max_output_len = float('inf')  # tokens
+        # config.data.min_output_input_ratio = 0.00
+        # config.data.max_output_input_ratio = float('inf')
+        test_dataset = ManifestDataset.from_config(config)
+        # return text ord id
+        self.test_loader = DataLoader(
+            test_dataset,
+            batch_size=config.decoding.batch_size,
+            shuffle=False,
+            drop_last=False,
+            collate_fn=SpeechCollator(
+                keep_transcription_text=True, return_utts=True))
+        logger.info("Setup test Dataloader!")
+

 class DeepSpeech2Tester(DeepSpeech2Trainer):
+    @classmethod
+    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
+        # testing config
+        default = CfgNode(
+            dict(
+                alpha=2.5,  # Coef of LM for beam search.
+                beta=0.3,  # Coef of WC for beam search.
+                cutoff_prob=1.0,  # Cutoff probability for pruning.
+                cutoff_top_n=40,  # Cutoff number for pruning.
+                lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm',  # Filepath for language model.
+                decoding_method='ctc_beam_search',  # Decoding method. Options: ctc_beam_search, ctc_greedy
+                error_rate_type='wer',  # Error rate type for evaluation. Options `wer`, 'cer'
+                num_proc_bsearch=8,  # # of CPUs for beam search.
+                beam_size=500,  # Beam search width.
+                batch_size=128,  # decoding batch size
+            ))
+
+        if config is not None:
+            config.merge_from_other_cfg(default)
+        return default
+
     def __init__(self, config, args):
         super().__init__(config, args)
+        self._text_featurizer = TextFeaturizer(
+            unit_type=config.data.unit_type, vocab_filepath=None)

     def ordid2token(self, texts, texts_len):
         """ ord() id to chr() chr """

@@ -204,15 +260,10 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
             trans.append(''.join([chr(i) for i in ids]))
         return trans

-    def compute_metrics(self, audio, audio_len, texts, texts_len):
-        cfg = self.config.decoding
-        errors_sum, len_refs, num_ins = 0.0, 0, 0
-        errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors
-        error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer
-
-        vocab_list = self.test_loader.dataset.vocab_list
-
-        target_transcripts = self.ordid2token(texts, texts_len)
+    def compute_result_transcripts(self, audio, audio_len, vocab_list, cfg):
+        self.autolog.times.start()
+        self.autolog.times.stamp()
         result_transcripts = self.model.decode(
             audio,
             audio_len,

@@ -225,14 +276,48 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
             cutoff_prob=cfg.cutoff_prob,
             cutoff_top_n=cfg.cutoff_top_n,
             num_processes=cfg.num_proc_bsearch)
+        #replace the <space> with ' '
+        result_transcripts = [
+            self._text_featurizer.detokenize(sentence)
+            for sentence in result_transcripts
+        ]
+        self.autolog.times.stamp()
+        self.autolog.times.stamp()
+        self.autolog.times.end()
+        return result_transcripts
+
+    def compute_metrics(self, utts, audio, audio_len, texts, texts_len,
+                        fout=None):
+        cfg = self.config.decoding
+        errors_sum, len_refs, num_ins = 0.0, 0, 0
+        errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors
+        error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer
+
+        #vocab_list = self.test_loader.collate_fn.vocab_list
+        vocab_list = self.test_loader.dataset.vocab_list
+
+        target_transcripts = self.ordid2token(texts, texts_len)
+        result_transcripts = self.compute_result_transcripts(audio, audio_len,
+                                                             vocab_list, cfg)

-        for target, result in zip(target_transcripts, result_transcripts):
+        for utt, target, result in zip(utts, target_transcripts,
+                                       result_transcripts):
             errors, len_ref = errors_func(target, result)
             errors_sum += errors
             len_refs += len_ref
             num_ins += 1
-            logger.info("\nTarget Transcription: %s\nOutput Transcription: %s" %
-                        (target, result))
+            if fout:
+                fout.write({"utt": utt, "ref": target, "hyp": result})
+            logger.info(f"Utt: {utt}")
+            logger.info(f"Ref: {target}")
+            logger.info(f"Hyp: {result}")
             logger.info("Current error rate [%s] = %f" %
                         (cfg.error_rate_type, error_rate_func(target, result)))

@@ -247,19 +332,25 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
     @paddle.no_grad()
     def test(self):
         logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}")
+        self.autolog = Autolog(
+            batch_size=self.config.decoding.batch_size,
+            model_name="deepspeech2",
+            model_precision="fp32").getlog()
         self.model.eval()
         cfg = self.config
         error_rate_type = None
         errors_sum, len_refs, num_ins = 0.0, 0, 0
-        for i, batch in enumerate(self.test_loader):
-            metrics = self.compute_metrics(*batch)
-            errors_sum += metrics['errors_sum']
-            len_refs += metrics['len_refs']
-            num_ins += metrics['num_ins']
-            error_rate_type = metrics['error_rate_type']
-            logger.info("Error rate [%s] (%d/?) = %f" %
-                        (error_rate_type, num_ins, errors_sum / len_refs))
+        with jsonlines.open(self.args.result_file, 'w') as fout:
+            for i, batch in enumerate(self.test_loader):
+                audio, audio_len, texts, texts_len, utts = batch
+                metrics = self.compute_metrics(utts, audio, audio_len, texts,
+                                               texts_len, fout)
+                errors_sum += metrics['errors_sum']
+                len_refs += metrics['len_refs']
+                num_ins += metrics['num_ins']
+                error_rate_type = metrics['error_rate_type']
+                logger.info("Error rate [%s] (%d/?) = %f" %
+                            (error_rate_type, num_ins, errors_sum / len_refs))

         # logging
         msg = "Test: "

@@ -268,101 +359,234 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
         msg += "Final error rate [%s] (%d/%d) = %f" % (
             error_rate_type, num_ins, num_ins, errors_sum / len_refs)
         logger.info(msg)
+        self.autolog.report()

-    def run_test(self):
-        self.resume_or_scratch()
-        try:
-            self.test()
-        except KeyboardInterrupt:
-            exit(-1)
-
     def export(self):
-        infer_model = DeepSpeech2InferModel.from_pretrained(
-            self.test_loader.dataset, self.config,
-            self.args.checkpoint_path)
+        if self.args.model_type == 'offline':
+            infer_model = DeepSpeech2InferModel.from_pretrained(
+                self.test_loader, self.config, self.args.checkpoint_path)
+        elif self.args.model_type == 'online':
+            infer_model = DeepSpeech2InferModelOnline.from_pretrained(
+                self.test_loader, self.config, self.args.checkpoint_path)
+        else:
+            raise Exception("wrong model type")
         infer_model.eval()
+        #feat_dim = self.test_loader.collate_fn.feature_size
         feat_dim = self.test_loader.dataset.feature_size
-        static_model = paddle.jit.to_static(
-            infer_model,
-            input_spec=[
-                paddle.static.InputSpec(
-                    shape=[None, None, feat_dim], dtype='float32'),  # audio, [B,T,D]
-                paddle.static.InputSpec(shape=[None], dtype='int64'),  # audio_length, [B]
-            ])
+        static_model = infer_model.export()
         logger.info(f"Export code: {static_model.forward.code}")
         paddle.jit.save(static_model, self.args.export_path)

-    def run_export(self):
-        try:
-            self.export()
-        except KeyboardInterrupt:
-            exit(-1)
-
-    def setup(self):
-        """Setup the experiment.
-        """
-        paddle.set_device(self.args.device)
-        self.setup_output_dir()
-        self.setup_checkpointer()
-        self.setup_dataloader()
-        self.setup_model()
-        self.iteration = 0
-        self.epoch = 0
-
-    def setup_model(self):
-        config = self.config
-        model = DeepSpeech2Model(
-            feat_size=self.test_loader.dataset.feature_size,
-            dict_size=self.test_loader.dataset.vocab_size,
-            num_conv_layers=config.model.num_conv_layers,
-            num_rnn_layers=config.model.num_rnn_layers,
-            rnn_size=config.model.rnn_layer_size,
-            use_gru=config.model.use_gru,
-            share_rnn_weights=config.model.share_rnn_weights)
-        self.model = model
-        logger.info("Setup model!")
-
-    def setup_dataloader(self):
-        config = self.config.clone()
-        config.defrost()
-        # return raw text
-        config.data.manifest = config.data.test_manifest
-        config.data.keep_transcription_text = True
-        config.data.augmentation_config = ""
-        # filter test examples, will cause less examples, but no mismatch with training
-        # and can use large batch size , save training time, so filter test egs now.
-        # config.data.min_input_len = 0.0  # second
-        # config.data.max_input_len = float('inf')  # second
-        # config.data.min_output_len = 0.0  # tokens
-        # config.data.max_output_len = float('inf')  # tokens
-        # config.data.min_output_input_ratio = 0.00
-        # config.data.max_output_input_ratio = float('inf')
-        test_dataset = ManifestDataset.from_config(config)
-        # return text ord id
-        self.test_loader = DataLoader(
-            test_dataset,
-            batch_size=config.decoding.batch_size,
-            shuffle=False,
-            drop_last=False,
-            collate_fn=SpeechCollator(keep_transcription_text=True))
-        logger.info("Setup test Dataloader!")
-
-    def setup_output_dir(self):
-        """Create a directory used for output.
-        """
-        # output dir
-        if self.args.output:
-            output_dir = Path(self.args.output).expanduser()
-            output_dir.mkdir(parents=True, exist_ok=True)
-        else:
-            output_dir = Path(
-                self.args.checkpoint_path).expanduser().parent.parent
-            output_dir.mkdir(parents=True, exist_ok=True)
-        self.output_dir = output_dir
+
+class DeepSpeech2ExportTester(DeepSpeech2Tester):
+    def __init__(self, config, args):
+        super().__init__(config, args)
+
+    def compute_result_transcripts(self, audio, audio_len, vocab_list, cfg):
+        if self.args.model_type == "online":
+            output_probs, output_lens = self.static_forward_online(audio,
+                                                                   audio_len)
+        elif self.args.model_type == "offline":
+            output_probs, output_lens = self.static_forward_offline(audio,
+                                                                    audio_len)
+        else:
+            raise Exception("wrong model type")
+
+        self.predictor.clear_intermediate_tensor()
+        self.predictor.try_shrink_memory()
+
+        self.model.decoder.init_decode(cfg.alpha, cfg.beta, cfg.lang_model_path,
+                                       vocab_list, cfg.decoding_method)
+
+        result_transcripts = self.model.decoder.decode_probs(
+            output_probs, output_lens, vocab_list, cfg.decoding_method,
+            cfg.lang_model_path, cfg.alpha, cfg.beta, cfg.beam_size,
+            cfg.cutoff_prob, cfg.cutoff_top_n, cfg.num_proc_bsearch)
+        #replace the <space> with ' '
+        result_transcripts = [
+            self._text_featurizer.detokenize(sentence)
+            for sentence in result_transcripts
+        ]
+
+        return result_transcripts
+
+    def static_forward_online(self, audio, audio_len,
+                              decoder_chunk_size: int=1):
+        """
+        Parameters
+        ----------
+        audio (Tensor): shape[B, T, D]
+        audio_len (Tensor): shape[B]
+        decoder_chunk_size(int)
+        Returns
+        -------
+        output_probs(numpy.array): shape[B, T, vocab_size]
+        output_lens(numpy.array): shape[B]
+        """
+        output_probs_list = []
+        output_lens_list = []
+        subsampling_rate = self.model.encoder.conv.subsampling_rate
+        receptive_field_length = self.model.encoder.conv.receptive_field_length
+        chunk_stride = subsampling_rate * decoder_chunk_size
+        chunk_size = (decoder_chunk_size - 1) * subsampling_rate + receptive_field_length
+
+        x_batch = audio.numpy()
+        batch_size, Tmax, x_dim = x_batch.shape
+        x_len_batch = audio_len.numpy().astype(np.int64)
+        if (Tmax - chunk_size) % chunk_stride != 0:
+            padding_len_batch = chunk_stride - (Tmax - chunk_size) % chunk_stride  # The length of padding for the batch
+        else:
+            padding_len_batch = 0
+
+        x_list = np.split(x_batch, batch_size, axis=0)
+        x_len_list = np.split(x_len_batch, batch_size, axis=0)
+        for x, x_len in zip(x_list, x_len_list):
+            self.autolog.times.start()
+            self.autolog.times.stamp()
+            x_len = x_len[0]
+            assert (chunk_size <= x_len)
+            if (x_len - chunk_size) % chunk_stride != 0:
+                padding_len_x = chunk_stride - (x_len - chunk_size) % chunk_stride
+            else:
+                padding_len_x = 0
+            padding = np.zeros(
+                (x.shape[0], padding_len_x, x.shape[2]), dtype=x.dtype)
+            padded_x = np.concatenate([x, padding], axis=1)
+            num_chunk = (x_len + padding_len_x - chunk_size) / chunk_stride + 1
+            num_chunk = int(num_chunk)
+
+            chunk_state_h_box = np.zeros(
+                (self.config.model.num_rnn_layers, 1,
+                 self.config.model.rnn_layer_size), dtype=x.dtype)
+            chunk_state_c_box = np.zeros(
+                (self.config.model.num_rnn_layers, 1,
+                 self.config.model.rnn_layer_size), dtype=x.dtype)
+
+            input_names = self.predictor.get_input_names()
+            audio_handle = self.predictor.get_input_handle(input_names[0])
+            audio_len_handle = self.predictor.get_input_handle(input_names[1])
+            h_box_handle = self.predictor.get_input_handle(input_names[2])
+            c_box_handle = self.predictor.get_input_handle(input_names[3])
+
+            probs_chunk_list = []
+            probs_chunk_lens_list = []
+            for i in range(0, num_chunk):
+                start = i * chunk_stride
+                end = start + chunk_size
+                x_chunk = padded_x[:, start:end, :]
+                if x_len < i * chunk_stride:
+                    x_chunk_lens = 0
+                else:
+                    x_chunk_lens = min(x_len - i * chunk_stride, chunk_size)
+                if (x_chunk_lens < receptive_field_length):  #means the number of input frames in the chunk is not enough for predicting one prob
+                    break
+                x_chunk_lens = np.array([x_chunk_lens])
+
+                audio_handle.reshape(x_chunk.shape)
+                audio_handle.copy_from_cpu(x_chunk)
+                audio_len_handle.reshape(x_chunk_lens.shape)
+                audio_len_handle.copy_from_cpu(x_chunk_lens)
+                h_box_handle.reshape(chunk_state_h_box.shape)
+                h_box_handle.copy_from_cpu(chunk_state_h_box)
+                c_box_handle.reshape(chunk_state_c_box.shape)
+                c_box_handle.copy_from_cpu(chunk_state_c_box)
+
+                output_names = self.predictor.get_output_names()
+                output_handle = self.predictor.get_output_handle(output_names[0])
+                output_lens_handle = self.predictor.get_output_handle(output_names[1])
+                output_state_h_handle = self.predictor.get_output_handle(output_names[2])
+                output_state_c_handle = self.predictor.get_output_handle(output_names[3])
+                self.predictor.run()
+
+                output_chunk_probs = output_handle.copy_to_cpu()
+                output_chunk_lens = output_lens_handle.copy_to_cpu()
+                chunk_state_h_box = output_state_h_handle.copy_to_cpu()
+                chunk_state_c_box = output_state_c_handle.copy_to_cpu()
+                probs_chunk_list.append(output_chunk_probs)
+                probs_chunk_lens_list.append(output_chunk_lens)
+
+            output_probs = np.concatenate(probs_chunk_list, axis=1)
+            output_lens = np.sum(probs_chunk_lens_list, axis=0)
+            vocab_size = output_probs.shape[2]
+            output_probs_padding_len = Tmax + padding_len_batch - output_probs.shape[1]
+            output_probs_padding = np.zeros(
+                (1, output_probs_padding_len, vocab_size),
+                dtype=output_probs.dtype)  # The prob padding for a piece of utterance
+            output_probs = np.concatenate(
+                [output_probs, output_probs_padding], axis=1)
+            output_probs_list.append(output_probs)
+            output_lens_list.append(output_lens)
+            self.autolog.times.stamp()
+            self.autolog.times.stamp()
+            self.autolog.times.end()
+        output_probs = np.concatenate(output_probs_list, axis=0)
+        output_lens = np.concatenate(output_lens_list, axis=0)
+        return output_probs, output_lens
+
+    def static_forward_offline(self, audio, audio_len):
+        """
+        Parameters
+        ----------
+        audio (Tensor): shape[B, T, D]
+        audio_len (Tensor): shape[B]
+        Returns
+        -------
+        output_probs(numpy.array): shape[B, T, vocab_size]
+        output_lens(numpy.array): shape[B]
+        """
+        x = audio.numpy()
+        x_len = audio_len.numpy().astype(np.int64)
+
+        input_names = self.predictor.get_input_names()
+        audio_handle = self.predictor.get_input_handle(input_names[0])
+        audio_len_handle = self.predictor.get_input_handle(input_names[1])
+
+        audio_handle.reshape(x.shape)
+        audio_handle.copy_from_cpu(x)
+        audio_len_handle.reshape(x_len.shape)
+        audio_len_handle.copy_from_cpu(x_len)
+
+        self.autolog.times.start()
+        self.autolog.times.stamp()
+        self.predictor.run()
+        self.autolog.times.stamp()
+        self.autolog.times.stamp()
+        self.autolog.times.end()
+
+        output_names = self.predictor.get_output_names()
+        output_handle = self.predictor.get_output_handle(output_names[0])
+        output_lens_handle = self.predictor.get_output_handle(output_names[1])
+        output_probs = output_handle.copy_to_cpu()
+        output_lens = output_lens_handle.copy_to_cpu()
+        return output_probs, output_lens
+
+    def setup_model(self):
+        super().setup_model()
+        infer_config = inference.Config(
+            self.args.export_path + ".pdmodel",
+            self.args.export_path + ".pdiparams")
+        if (os.environ['CUDA_VISIBLE_DEVICES'].strip() != ''):
+            infer_config.enable_use_gpu(100, 0)
+            infer_config.enable_memory_optim()
+        infer_predictor = inference.create_predictor(infer_config)
+        self.predictor = infer_predictor
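Note: the chunk geometry in static_forward_online is the core of the new streaming path: each decoder step consumes chunk_size input frames and advances by chunk_stride, both derived from the conv front end. A standalone sketch of that arithmetic; the subsampling_rate and receptive_field_length values are hypothetical stand-ins for what the diff reads off model.encoder.conv:

    def chunk_geometry(T, decoder_chunk_size=1, subsampling_rate=4,
                       receptive_field_length=7):
        # mirrors the arithmetic in static_forward_online
        chunk_stride = subsampling_rate * decoder_chunk_size
        chunk_size = (decoder_chunk_size - 1) * subsampling_rate + receptive_field_length
        # pad so that (T - chunk_size) is a whole number of strides
        rem = (T - chunk_size) % chunk_stride
        padding = chunk_stride - rem if rem != 0 else 0
        num_chunk = (T + padding - chunk_size) // chunk_stride + 1
        return chunk_size, chunk_stride, padding, num_chunk

    print(chunk_geometry(T=100))  # -> (7, 4, 3, 25): 25 chunks of 7 frames, stride 4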
deepspeech/exps/u2/bin/alignment.py

@@ -30,6 +30,9 @@ def main(config, args):

 if __name__ == "__main__":
     parser = default_argument_parser()
+    # save asr result to
+    parser.add_argument(
+        "--result_file", type=str, help="path of save the asr result")
     args = parser.parse_args()
     print_arguments(args, globals())
deepspeech/exps/u2/bin/export.py

@@ -30,6 +30,9 @@ def main(config, args):

 if __name__ == "__main__":
     parser = default_argument_parser()
+    # save jit model to
+    parser.add_argument(
+        "--export_path", type=str, help="path of the jit model to save")
     args = parser.parse_args()
     print_arguments(args, globals())
deepspeech/exps/u2/bin/test.py

@@ -34,6 +34,9 @@ def main(config, args):

 if __name__ == "__main__":
     parser = default_argument_parser()
+    # save asr result to
+    parser.add_argument(
+        "--result_file", type=str, help="path of save the asr result")
     args = parser.parse_args()
     print_arguments(args, globals())
deepspeech/exps/u2/bin/train.py

@@ -22,6 +22,8 @@ from deepspeech.exps.u2.model import U2Trainer as Trainer
 from deepspeech.training.cli import default_argument_parser
 from deepspeech.utils.utility import print_arguments

+# from deepspeech.exps.u2.trainer import U2Trainer as Trainer
+

 def main_sp(config, args):
     exp = Trainer(config, args)
deepspeech/exps/u2/model.py

@@ -73,11 +73,11 @@ class U2Trainer(Trainer):
     def __init__(self, config, args):
         super().__init__(config, args)

-    def train_batch(self, batch_index, batch_data, msg):
+    def train_batch(self, batch_index, batch, msg):
         train_conf = self.config.training
         start = time.time()

-        loss, attention_loss, ctc_loss = self.model(*batch_data)
+        loss, attention_loss, ctc_loss = self.model(*batch)
         # loss div by `batch_size * accum_grad`
         loss /= train_conf.accum_grad
         loss.backward()

@@ -219,7 +219,7 @@ class U2Trainer(Trainer):
             config.data.augmentation_config = ""
             dev_dataset = ManifestDataset.from_config(config)

-            collate_fn = SpeechCollator(keep_transcription_text=False)
+            collate_fn = SpeechCollator(keep_transcription_text=False, return_utts=False)
             if self.parallel:
                 batch_sampler = SortagradDistributedBatchSampler(
                     train_dataset,

@@ -269,7 +269,7 @@ class U2Trainer(Trainer):
             batch_size=config.decoding.batch_size,
             shuffle=False,
             drop_last=False,
-            collate_fn=SpeechCollator(keep_transcription_text=True))
+            collate_fn=SpeechCollator(keep_transcription_text=True, return_utts=True))
         logger.info("Setup train/valid/test Dataloader!")

     def setup_model(self):

@@ -345,7 +345,7 @@ class U2Tester(U2Trainer):
         decoding_chunk_size=-1,  # decoding chunk size. Defaults to -1.
         # <0: for decoding, use full chunk.
         # >0: for decoding, use fixed chunk size as set.
         # 0: used for training, it's prohibited here.
         num_decoding_left_chunks=-1,  # number of left chunks for decoding. Defaults to -1.
         simulate_streaming=False,  # simulate streaming inference. Defaults to False.
     ))

@@ -428,7 +428,7 @@ class U2Tester(U2Trainer):
         num_time = 0.0
         with open(self.args.result_file, 'w') as fout:
             for i, batch in enumerate(self.test_loader):
-                metrics = self.compute_metrics(*batch, fout=fout)
+                metrics = self.compute_metrics(*batch[:-1], fout=fout)
                 num_frames += metrics['num_frames']
                 num_time += metrics["decode_time"]
                 errors_sum += metrics['errors_sum']

@@ -476,12 +476,12 @@ class U2Tester(U2Trainer):
             })
             f.write(data + '\n')

-    def run_test(self):
-        self.resume_or_scratch()
-        try:
-            self.test()
-        except KeyboardInterrupt:
-            sys.exit(-1)
+    # def run_test(self):
+    #     self.resume_or_scratch()
+    #     try:
+    #         self.test()
+    #     except KeyboardInterrupt:
+    #         sys.exit(-1)

     def load_inferspec(self):
         """infer model and input spec.

@@ -512,36 +512,36 @@ class U2Tester(U2Trainer):
         logger.info(f"Export code: {static_model.forward.code}")
         paddle.jit.save(static_model, self.args.export_path)

-    def run_export(self):
-        try:
-            self.export()
-        except KeyboardInterrupt:
-            sys.exit(-1)
+    # def run_export(self):
+    #     try:
+    #         self.export()
+    #     except KeyboardInterrupt:
+    #         sys.exit(-1)

-    def setup(self):
-        """Setup the experiment.
-        """
-        paddle.set_device(self.args.device)
-        self.setup_output_dir()
-        self.setup_checkpointer()
-        self.setup_dataloader()
-        self.setup_model()
-        self.iteration = 0
-        self.epoch = 0
+    # def setup(self):
+    #     """Setup the experiment.
+    #     """
+    #     paddle.set_device(self.args.device)
+    #     self.setup_output_dir()
+    #     self.setup_checkpointer()
+    #     self.setup_dataloader()
+    #     self.setup_model()
+    #     self.iteration = 0
+    #     self.epoch = 0

-    def setup_output_dir(self):
-        """Create a directory used for output.
-        """
-        # output dir
-        if self.args.output:
-            output_dir = Path(self.args.output).expanduser()
-            output_dir.mkdir(parents=True, exist_ok=True)
-        else:
-            output_dir = Path(
-                self.args.checkpoint_path).expanduser().parent.parent
-            output_dir.mkdir(parents=True, exist_ok=True)
-        self.output_dir = output_dir
+    # def setup_output_dir(self):
+    #     """Create a directory used for output.
+    #     """
+    #     # output dir
+    #     if self.args.output:
+    #         output_dir = Path(self.args.output).expanduser()
+    #         output_dir.mkdir(parents=True, exist_ok=True)
+    #     else:
+    #         output_dir = Path(
+    #             self.args.checkpoint_path).expanduser().parent.parent
+    #         output_dir.mkdir(parents=True, exist_ok=True)
+    #     self.output_dir = output_dir
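Note: the *batch[:-1] slice above exists because, with return_utts=True, the test collator appends utterance ids as a fifth element that U2's compute_metrics signature does not take. A small illustration of the unpacking (shapes hypothetical):

    import numpy as np

    batch = (np.zeros((2, 10, 80), 'float32'),   # padded_audios [B, T, D]
             np.array([10, 8]),                  # audio_lens    [B]
             np.zeros((2, 5), np.int64),         # padded_texts  [B, U]
             np.array([5, 4]),                   # text_lens     [B]
             ["utt-001", "utt-002"])             # utts          [B]

    model_inputs = batch[:-1]  # what compute_metrics(*batch[:-1], ...) receives
    utts = batch[-1]           # the ids, used elsewhere for per-utterance logging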
deepspeech/frontend/featurizer/text_featurizer.py

@@ -14,12 +14,27 @@
 """Contains the text featurizer class."""
 import sentencepiece as spm

-from deepspeech.frontend.utility import EOS
-from deepspeech.frontend.utility import UNK
+from ..utility import BLANK
+from ..utility import EOS
+from ..utility import load_dict
+from ..utility import MASKCTC
+from ..utility import SOS
+from ..utility import SPACE
+from ..utility import UNK
+from deepspeech.utils.log import Log

+logger = Log(__name__).getlog()

-class TextFeaturizer(object):
-    def __init__(self, unit_type, vocab_filepath, spm_model_prefix=None):
+__all__ = ["TextFeaturizer"]
+
+
+class TextFeaturizer():
+    def __init__(self, unit_type, vocab_filepath, spm_model_prefix=None,
+                 maskctc=False):
         """Text featurizer, for processing or extracting features from text.

         Currently, it supports char/word/sentence-piece level tokenizing and conversion into

@@ -34,11 +49,12 @@ class TextFeaturizer(object):
         assert unit_type in ('char', 'spm', 'word')
         self.unit_type = unit_type
         self.unk = UNK
+        self.maskctc = maskctc
+
         if vocab_filepath:
-            self._vocab_dict, self._id2token, self._vocab_list = self._load_vocabulary_from_file(
-                vocab_filepath)
-            self.unk_id = self._vocab_list.index(self.unk)
-            self.eos_id = self._vocab_list.index(EOS)
+            self.vocab_dict, self._id2token, self.vocab_list, self.unk_id, self.eos_id = self._load_vocabulary_from_file(
+                vocab_filepath, maskctc)
+            self.vocab_size = len(self.vocab_list)

         if unit_type == 'spm':
             spm_model = spm_model_prefix + '.model'

@@ -47,7 +63,7 @@ class TextFeaturizer(object):
     def tokenize(self, text, replace_space=True):
         if self.unit_type == 'char':
-            tokens = self.char_tokenize(text)
+            tokens = self.char_tokenize(text, replace_space)
         elif self.unit_type == 'word':
             tokens = self.word_tokenize(text)
         else:  # spm

@@ -75,8 +91,8 @@ class TextFeaturizer(object):
         tokens = self.tokenize(text)
         ids = []
         for token in tokens:
-            token = token if token in self._vocab_dict else self.unk
-            ids.append(self._vocab_dict[token])
+            token = token if token in self.vocab_dict else self.unk
+            ids.append(self.vocab_dict[token])
         return ids

     def defeaturize(self, idxs):

@@ -87,7 +103,7 @@ class TextFeaturizer(object):
             idxs (List[int]): List of token indices.
         Returns:
-            str: Text to process.
+            str: Text.
         """
         tokens = []
         for idx in idxs:

@@ -97,43 +113,22 @@ class TextFeaturizer(object):
         text = self.detokenize(tokens)
         return text

-    @property
-    def vocab_size(self):
-        """Return the vocabulary size.
-
-        :return: Vocabulary size.
-        :rtype: int
-        """
-        return len(self._vocab_list)
-
-    @property
-    def vocab_list(self):
-        """Return the vocabulary in list.
-        Returns:
-            List[str]: tokens.
-        """
-        return self._vocab_list
-
-    @property
-    def vocab_dict(self):
-        """Return the vocabulary in dict.
-        Returns:
-            Dict[str, int]: token str -> int
-        """
-        return self._vocab_dict
-
-    def char_tokenize(self, text):
+    def char_tokenize(self, text, replace_space=True):
         """Character tokenizer.
         Args:
             text (str): text string.
+            replace_space (bool): False only used by build_vocab.py.
         Returns:
             List[str]: tokens.
         """
-        return list(text.strip())
+        text = text.strip()
+        if replace_space:
+            text_list = [SPACE if item == " " else item for item in list(text)]
+        else:
+            text_list = list(text)
+        return text_list

     def char_detokenize(self, tokens):
         """Character detokenizer.

@@ -144,6 +139,7 @@ class TextFeaturizer(object):
         Returns:
             str: text string.
         """
+        tokens = tokens.replace(SPACE, " ")
         return "".join(tokens)

     def word_tokenize(self, text):

@@ -206,14 +202,28 @@ class TextFeaturizer(object):
         return decode(tokens)

-    def _load_vocabulary_from_file(self, vocab_filepath):
+    def _load_vocabulary_from_file(self, vocab_filepath: str, maskctc: bool):
         """Load vocabulary from file."""
-        vocab_lines = []
-        with open(vocab_filepath, 'r', encoding='utf-8') as file:
-            vocab_lines.extend(file.readlines())
-        vocab_list = [line[:-1] for line in vocab_lines]
+        vocab_list = load_dict(vocab_filepath, maskctc)
+        assert vocab_list is not None
+        logger.info(f"Vocab: {vocab_list}")
+
         id2token = dict(
             [(idx, token) for (idx, token) in enumerate(vocab_list)])
         token2id = dict(
             [(token, idx) for (idx, token) in enumerate(vocab_list)])
-        return token2id, id2token, vocab_list
+
+        blank_id = vocab_list.index(BLANK) if BLANK in vocab_list else -1
+        maskctc_id = vocab_list.index(MASKCTC) if MASKCTC in vocab_list else -1
+        unk_id = vocab_list.index(UNK) if UNK in vocab_list else -1
+        eos_id = vocab_list.index(EOS) if EOS in vocab_list else -1
+        sos_id = vocab_list.index(SOS) if SOS in vocab_list else -1
+        space_id = vocab_list.index(SPACE) if SPACE in vocab_list else -1
+
+        logger.info(f"UNK id: {unk_id}")
+        logger.info(f"EOS id: {eos_id}")
+        logger.info(f"SOS id: {sos_id}")
+        logger.info(f"SPACE id: {space_id}")
+        logger.info(f"BLANK id: {blank_id}")
+        logger.info(f"MASKCTC id: {maskctc_id}")
+
+        return token2id, id2token, vocab_list, unk_id, eos_id
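Note: char_tokenize now maps literal spaces to the <space> token and char_detokenize undoes it. A self-contained round-trip equivalent of the two methods in this diff (SPACE as redefined in deepspeech/frontend/utility.py below):

    SPACE = "<space>"

    def char_tokenize(text, replace_space=True):
        text = text.strip()
        return [SPACE if ch == " " else ch for ch in text] if replace_space else list(text)

    def char_detokenize(tokens):
        return "".join(tokens).replace(SPACE, " ")

    toks = char_tokenize("hi there")
    assert toks == ["h", "i", SPACE, "t", "h", "e", "r", "e"]
    assert char_detokenize(toks) == "hi there"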
deepspeech/frontend/utility.py

@@ -12,10 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Contains data helper functions."""
-import codecs
 import json
 import math
+import tarfile
+from collections import namedtuple
+from typing import List
+from typing import Optional
+from typing import Text

+import jsonlines
 import numpy as np

 from deepspeech.utils.log import Log

@@ -23,17 +28,41 @@ from deepspeech.utils.log import Log
 logger = Log(__name__).getlog()

 __all__ = [
-    "load_cmvn", "read_manifest", "rms_to_db", "rms_to_dbfs", "max_dbfs",
-    "mean_dbfs", "gain_db_to_ratio", "normalize_audio", "SOS", "EOS", "UNK",
-    "BLANK"
+    "load_dict", "load_cmvn", "read_manifest", "rms_to_db", "rms_to_dbfs",
+    "max_dbfs", "mean_dbfs", "gain_db_to_ratio", "normalize_audio", "SOS",
+    "EOS", "UNK", "BLANK", "MASKCTC", "SPACE"
 ]

 IGNORE_ID = -1
-SOS = "<sos/eos>"
+# `sos` and `eos` using same token
+SOS = "<eos>"
 EOS = SOS
 UNK = "<unk>"
 SPACE = " "
 BLANK = "<blank>"
+MASKCTC = "<mask>"
+SPACE = "<space>"
+
+
+def load_dict(dict_path: Optional[Text], maskctc=False) -> Optional[List[Text]]:
+    if dict_path is None:
+        return None
+
+    with open(dict_path, "r") as f:
+        dictionary = f.readlines()
+    # first token is `<blank>`
+    # multi line: `<blank> 0\n`
+    # one line: `<blank>`
+    # space is relpace with <space>
+    char_list = [entry[:-1].split(" ")[0] for entry in dictionary]
+    if BLANK not in char_list:
+        char_list.insert(0, BLANK)
+    if EOS not in char_list:
+        char_list.append(EOS)
+    # for non-autoregressive maskctc model
+    if maskctc and MASKCTC not in char_list:
+        char_list.append(MASKCTC)
+    return char_list

@@ -48,12 +77,20 @@ def read_manifest(
     Args:
         manifest_path ([type]): Manifest file to load and parse.
-        max_input_len ([type], optional): maximum output seq length, in seconds for raw wav, in frame numbers for feature data. Defaults to float('inf').
-        min_input_len (float, optional): minimum input seq length, in seconds for raw wav, in frame numbers for feature data. Defaults to 0.0.
-        max_output_len (float, optional): maximum input seq length, in modeling units. Defaults to 500.0.
-        min_output_len (float, optional): minimum input seq length, in modeling units. Defaults to 0.0.
-        max_output_input_ratio (float, optional): maximum output seq length/output seq length ratio. Defaults to 10.0.
-        min_output_input_ratio (float, optional): minimum output seq length/output seq length ratio. Defaults to 0.05.
+        max_input_len ([type], optional): maximum output seq length,
+            in seconds for raw wav, in frame numbers for feature data.
+            Defaults to float('inf').
+        min_input_len (float, optional): minimum input seq length,
+            in seconds for raw wav, in frame numbers for feature data.
+            Defaults to 0.0.
+        max_output_len (float, optional): maximum input seq length,
+            in modeling units. Defaults to 500.0.
+        min_output_len (float, optional): minimum input seq length,
+            in modeling units. Defaults to 0.0.
+        max_output_input_ratio (float, optional):
+            maximum output seq length/output seq length ratio. Defaults to 10.0.
+        min_output_input_ratio (float, optional):
+            minimum output seq length/output seq length ratio. Defaults to 0.05.

     Raises:
         IOError: If failed to parse the manifest.

@@ -63,29 +100,70 @@ def read_manifest(
     """
     manifest = []
-    for json_line in codecs.open(manifest_path, 'r', 'utf-8'):
-        try:
-            json_data = json.loads(json_line)
-        except Exception as e:
-            raise IOError("Error reading manifest: %s" % str(e))
-        feat_len = json_data["feat_shape"][0] if 'feat_shape' in json_data else 1.0
-        token_len = json_data["token_shape"][0] if 'token_shape' in json_data else 1.0
-        conditions = [
-            feat_len >= min_input_len,
-            feat_len <= max_input_len,
-            token_len >= min_output_len,
-            token_len <= max_output_len,
-            token_len / feat_len >= min_output_input_ratio,
-            token_len / feat_len <= max_output_input_ratio,
-        ]
-        if all(conditions):
-            manifest.append(json_data)
+    with jsonlines.open(manifest_path, 'r') as reader:
+        for json_data in reader:
+            feat_len = json_data["feat_shape"][0] if 'feat_shape' in json_data else 1.0
+            token_len = json_data["token_shape"][0] if 'token_shape' in json_data else 1.0
+            conditions = [
+                feat_len >= min_input_len,
+                feat_len <= max_input_len,
+                token_len >= min_output_len,
+                token_len <= max_output_len,
+                token_len / feat_len >= min_output_input_ratio,
+                token_len / feat_len <= max_output_input_ratio,
+            ]
+            if all(conditions):
+                manifest.append(json_data)
     return manifest
+
+
+# Tar File read
+TarLocalData = namedtuple('TarLocalData', ['tar2info', 'tar2object'])
+
+
+def parse_tar(file):
+    """Parse a tar file to get a tarfile object
+    and a map containing tarinfoes
+    """
+    result = {}
+    f = tarfile.open(file)
+    for tarinfo in f.getmembers():
+        result[tarinfo.name] = tarinfo
+    return f, result
+
+
+def subfile_from_tar(file, local_data=None):
+    """Get subfile object from tar.
+
+    tar:tarpath#filename
+
+    It will return a subfile object from tar file
+    and cached tar file info for next reading request.
+    """
+    tarpath, filename = file.split(':', 1)[1].split('#', 1)
+
+    if local_data is None:
+        local_data = TarLocalData(tar2info={}, tar2object={})
+
+    assert isinstance(local_data, TarLocalData)
+
+    if 'tar2info' not in local_data.__dict__:
+        local_data.tar2info = {}
+    if 'tar2object' not in local_data.__dict__:
+        local_data.tar2object = {}
+
+    if tarpath not in local_data.tar2info:
+        fobj, infos = parse_tar(tarpath)
+        local_data.tar2info[tarpath] = infos
+        local_data.tar2object[tarpath] = fobj
+    else:
+        fobj = local_data.tar2object[tarpath]
+        infos = local_data.tar2info[tarpath]
+    return fobj.extractfile(infos[filename])

@@ -255,6 +333,13 @@ def load_cmvn(cmvn_file: str, filetype: str):
         cmvn = _load_json_cmvn(cmvn_file)
     elif filetype == "kaldi":
         cmvn = _load_kaldi_cmvn(cmvn_file)
+    elif filetype == "npz":
+        eps = 1e-14
+        npzfile = np.load(cmvn_file)
+        mean = np.squeeze(npzfile["mean"])
+        std = np.squeeze(npzfile["std"])
+        istd = 1 / (std + eps)
+        cmvn = [mean, istd]
     else:
         raise ValueError(f"cmvn file type no support: {filetype}")
     return cmvn[0], cmvn[1]
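Note: load_dict is what normalizes a vocab file into the char list the featurizer consumes: first column per line, <blank> forced to index 0, <eos> appended, and <mask> appended only when maskctc is set. A sketch of its behavior on a hypothetical two-column dict file:

    import tempfile

    from deepspeech.frontend.utility import BLANK, EOS, load_dict

    # hypothetical "<token> <id>" file, as described by the comment in load_dict
    with tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False) as f:
        f.write("a 0\nb 1\n<space> 2\n")
        path = f.name

    chars = load_dict(path)
    assert chars[0] == BLANK and chars[-1] == EOS
    print(chars)  # ['<blank>', 'a', 'b', '<space>', '<eos>']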
deepspeech/io/collator.py
View file @ e411e0bd

@@ -23,7 +23,7 @@ logger = Log(__name__).getlog()
 
 class SpeechCollator():
-    def __init__(self, keep_transcription_text=True):
+    def __init__(self, keep_transcription_text=True, return_utts=False):
         """
         Padding audio features with zeros to make them have the same shape (or
         a user-defined shape) within one batch.
@@ -31,6 +31,7 @@ class SpeechCollator():
             if ``keep_transcription_text`` is False, text is token ids else is raw string.
         """
         self._keep_transcription_text = keep_transcription_text
+        self.return_utts = return_utts
 
     def __call__(self, batch):
         """batch examples
@@ -51,7 +52,9 @@ class SpeechCollator():
         audio_lens = []
         texts = []
         text_lens = []
-        for audio, text in batch:
+        utts = []
+        for utt, audio, text in batch:
+            utts.append(utt)
             # audio
             audios.append(audio.T)  # [T, D]
             audio_lens.append(audio.shape[1])
@@ -75,4 +78,7 @@ class SpeechCollator():
         padded_texts = pad_sequence(
             texts, padding_value=IGNORE_ID).astype(np.int64)
         text_lens = np.array(text_lens).astype(np.int64)
-        return padded_audios, audio_lens, padded_texts, text_lens
+        if self.return_utts:
+            return padded_audios, audio_lens, padded_texts, text_lens, utts
+        else:
+            return padded_audios, audio_lens, padded_texts, text_lens
\ No newline at end of file
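The collator's core job is padding variable-length examples to one shape per batch. A compact numpy sketch of that step; pad_sequence here is a stand-in for the repo's helper, and IGNORE_ID = -1 is an assumed padding value:

import numpy as np

IGNORE_ID = -1  # assumed padding value for token ids

def pad_sequence(seqs, padding_value=0):
    # Pad 1-D integer sequences to the longest length in the batch.
    max_len = max(len(s) for s in seqs)
    out = np.full((len(seqs), max_len), padding_value, dtype=np.int64)
    for i, s in enumerate(seqs):
        out[i, :len(s)] = s
    return out

texts = [np.array([3, 7, 9]), np.array([5, 2])]
padded = pad_sequence(texts, padding_value=IGNORE_ID)
# [[ 3  7  9]
#  [ 5  2 -1]]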
deepspeech/io/dataset.py
View file @ e411e0bd

@@ -347,4 +347,5 @@ class ManifestDataset(Dataset):
 
     def __getitem__(self, idx):
         instance = self._manifest[idx]
-        return self.process_utterance(instance["feat"], instance["text"])
+        feat, text = self.process_utterance(instance["feat"], instance["text"])
+        return instance["utt"], feat, text
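After this change each dataset item is a (utt, feat, text) triple, matching the unpacking in the updated SpeechCollator. A toy sketch of the contract (classes and shapes below are illustrative, not the repo's):

import numpy as np

class ToyDataset:
    def __init__(self, manifest):
        self._manifest = manifest

    def __getitem__(self, idx):
        instance = self._manifest[idx]
        feat = np.zeros((161, instance["frames"]))  # stand-in features [D, T]
        text = np.array(instance["tokens"])
        return instance["utt"], feat, text

ds = ToyDataset([{"utt": "utt_0001", "frames": 10, "tokens": [1, 2]}])
batch = [ds[0]]
for utt, audio, text in batch:  # same unpacking as SpeechCollator.__call__
    print(utt, audio.T.shape, text.shape)  # utt_0001 (10, 161) (2,)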
deepspeech/models/ds2/conv.py
View file @ e411e0bd

@@ -26,9 +26,9 @@ __all__ = ['ConvStack', "conv_output_size"]
 
 def conv_output_size(I, F, P, S):
     # https://stanford.edu/~shervine/teaching/cs-230/cheatsheet-convolutional-neural-networks#hyperparameters
     # Output size after Conv:
     #   By noting I the length of the input volume size,
     #   F the length of the filter,
     #   P the amount of zero padding,
     #   S the stride,
     #   then the output size O of the feature map along that dimension is given by:
     #       O = (I - F + Pstart + Pend) // S + 1
@@ -45,7 +45,7 @@ def conv_output_size(I, F, P, S):
 # https://fomoro.com/research/article/receptive-field-calculator
 # https://stanford.edu/~shervine/teaching/cs-230/cheatsheet-convolutional-neural-networks#hyperparameters
 # https://distill.pub/2019/computing-receptive-fields/
 # Rl-1 = Sl * Rl + (Kl - Sl)
 
 
 class ConvBn(nn.Layer):
@@ -58,8 +58,8 @@ class ConvBn(nn.Layer):
     :type num_channels_in: int
     :param num_channels_out: Number of output channels.
     :type num_channels_out: int
     :param stride: The x dimension of the stride. Or input a tuple for two
         image dimensions.
     :type stride: int|tuple|list
     :param padding: The x dimension of the padding. Or input a tuple for two
         image dimensions.
@@ -114,7 +114,7 @@ class ConvBn(nn.Layer):
         masks = make_non_pad_mask(x_len)  #[B, T]
         masks = masks.unsqueeze(1).unsqueeze(1)  # [B, 1, 1, T]
         # TODO(Hui Zhang): not support bool multiply
-        masks = masks.type_as(x)
+        masks = masks.astype(x.dtype)
         x = x.multiply(masks)
         return x, x_len
...
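The output-size formula in the comment is easy to sanity-check numerically. A tiny worked example with symmetric padding and made-up kernel/stride values:

def conv_output_size(I, F, P, S):
    # O = (I - F + Pstart + Pend) // S + 1, with the same padding P on both ends
    return (I - F + 2 * P) // S + 1

# A 41-wide filter with stride 2 and padding 20; the 200-frame input
# length is chosen purely for illustration.
print(conv_output_size(200, 41, 20, 2))  # 100: the axis is roughly halved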
deepspeech/models/ds2/deepspeech2.py
View file @ e411e0bd

@@ -219,15 +219,17 @@ class DeepSpeech2Model(nn.Layer):
             The model built from pretrained result.
         """
         model = cls(
-            feat_size=dataloader.collate_fn.feature_size,
-            dict_size=dataloader.collate_fn.vocab_size,
+            #feat_size=dataloader.collate_fn.feature_size,
+            feat_size=dataloader.dataset.feature_size,
+            #dict_size=dataloader.collate_fn.vocab_size,
+            dict_size=dataloader.dataset.vocab_size,
             num_conv_layers=config.model.num_conv_layers,
             num_rnn_layers=config.model.num_rnn_layers,
             rnn_size=config.model.rnn_layer_size,
             use_gru=config.model.use_gru,
             share_rnn_weights=config.model.share_rnn_weights,
             blank_id=config.model.blank_id,
-            ctc_grad_norm_type=config.ctc_grad_norm_type, )
+            ctc_grad_norm_type=config.model.ctc_grad_norm_type, )
         infos = Checkpoint().load_parameters(
             model, checkpoint_path=checkpoint_path)
         logger.info(f"checkpoint info: {infos}")
@@ -260,24 +262,8 @@ class DeepSpeech2Model(nn.Layer):
 
 class DeepSpeech2InferModel(DeepSpeech2Model):
-    def __init__(self,
-                 feat_size,
-                 dict_size,
-                 num_conv_layers=2,
-                 num_rnn_layers=3,
-                 rnn_size=1024,
-                 use_gru=False,
-                 share_rnn_weights=True,
-                 blank_id=0):
-        super().__init__(
-            feat_size=feat_size,
-            dict_size=dict_size,
-            num_conv_layers=num_conv_layers,
-            num_rnn_layers=num_rnn_layers,
-            rnn_size=rnn_size,
-            use_gru=use_gru,
-            share_rnn_weights=share_rnn_weights,
-            blank_id=blank_id)
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
 
     def forward(self, audio, audio_len):
         """export model function
...
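The infer-model change is a common refactor: rather than re-declare every constructor parameter in the subclass, forward them wholesale so new base-class parameters (here, ctc_grad_norm_type) need no subclass edits. A generic sketch of the pattern:

class Base:
    def __init__(self, feat_size, dict_size, blank_id=0):
        self.feat_size = feat_size
        self.dict_size = dict_size
        self.blank_id = blank_id

class Infer(Base):
    def __init__(self, *args, **kwargs):
        # Transparent forwarding: any Base parameter, present or future,
        # passes through without this class having to list it.
        super().__init__(*args, **kwargs)

m = Infer(161, 29, blank_id=0)
print(m.feat_size, m.dict_size)  # 161 29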
deepspeech/models/ds2/rnn.py
View file @ e411e0bd

@@ -29,13 +29,13 @@ __all__ = ['RNNStack']
 
 class RNNCell(nn.RNNCellBase):
     r"""
     Elman RNN (SimpleRNN) cell. Given the inputs and previous states, it
     computes the outputs and updates states.
     The formula used is as follows:
     .. math::
         h_{t} & = act(x_{t} + b_{ih} + W_{hh}h_{t-1} + b_{hh})
         y_{t} & = h_{t}
     where :math:`act` is for :attr:`activation`.
     """
@@ -92,7 +92,7 @@ class RNNCell(nn.RNNCellBase):
 
 class GRUCell(nn.RNNCellBase):
     r"""
     Gated Recurrent Unit (GRU) RNN cell. Given the inputs and previous states,
     it computes the outputs and updates states.
     The formula for GRU used is as follows:
     .. math::
@@ -101,8 +101,8 @@ class GRUCell(nn.RNNCellBase):
         \widetilde{h}_{t} & = \tanh(W_{ic}x_{t} + b_{ic} + r_{t} * (W_{hc}h_{t-1} + b_{hc}))
         h_{t} & = z_{t} * h_{t-1} + (1 - z_{t}) * \widetilde{h}_{t}
         y_{t} & = h_{t}
     where :math:`\sigma` is the sigmoid function, and * is the elementwise
     multiplication operator.
     """
@@ -309,6 +309,6 @@ class RNNStack(nn.Layer):
         masks = make_non_pad_mask(x_len)  #[B, T]
         masks = masks.unsqueeze(-1)  # [B, T, 1]
         # TODO(Hui Zhang): not support bool multiply
-        masks = masks.type_as(x)
+        masks = masks.astype(x.dtype)
         x = x.multiply(masks)
         return x, x_len
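The masking idiom at the end of RNNStack.forward zeroes padded time steps; because bool tensors cannot be multiplied directly (the TODO above), the mask is first cast to the data's dtype. A numpy sketch of the same logic:

import numpy as np

def make_non_pad_mask(lengths, max_len):
    # True for valid frames, False for padding.  Shape [B, T].
    return np.arange(max_len)[None, :] < np.array(lengths)[:, None]

x = np.ones((2, 4, 3), dtype=np.float32)      # [B, T, D]
masks = make_non_pad_mask([4, 2], max_len=4)  # [B, T]
masks = masks[:, :, None].astype(x.dtype)     # [B, T, 1], the cast step
x = x * masks  # the padded tail of the second example becomes 0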
deepspeech/models/ds2_online/deepspeech2.py
View file @ e411e0bd

@@ -255,22 +255,24 @@ class DeepSpeech2ModelOnline(nn.Layer):
             fc_layers_size_list=[512, 256],
             use_gru=True,  #Use gru if set True. Use simple rnn if set False.
             blank_id=0,  # index of blank in vocab.txt
-        ))
+            ctc_grad_norm_type='instance', ))
         if config is not None:
             config.merge_from_other_cfg(default)
         return default
 
     def __init__(self,
                  feat_size,
                  dict_size,
                  num_conv_layers=2,
                  num_rnn_layers=4,
                  rnn_size=1024,
                  rnn_direction='forward',
                  num_fc_layers=2,
                  fc_layers_size_list=[512, 256],
                  use_gru=False,
-                 blank_id=0):
+                 blank_id=0,
+                 ctc_grad_norm_type='instance', ):
         super().__init__()
         self.encoder = CRNNEncoder(
             feat_size=feat_size,
@@ -290,7 +292,7 @@ class DeepSpeech2ModelOnline(nn.Layer):
             dropout_rate=0.0,
             reduction=True,  # sum
             batch_average=True,  # sum / batch_size
-            grad_norm_type='instance')
+            grad_norm_type=ctc_grad_norm_type)
 
     def forward(self, audio, audio_len, text, text_len):
         """Compute Model loss
@@ -348,16 +350,18 @@ class DeepSpeech2ModelOnline(nn.Layer):
             DeepSpeech2ModelOnline
             The model built from pretrained result.
         """
         model = cls(
             feat_size=dataloader.collate_fn.feature_size,
             dict_size=dataloader.collate_fn.vocab_size,
             num_conv_layers=config.model.num_conv_layers,
             num_rnn_layers=config.model.num_rnn_layers,
             rnn_size=config.model.rnn_layer_size,
             rnn_direction=config.model.rnn_direction,
             num_fc_layers=config.model.num_fc_layers,
             fc_layers_size_list=config.model.fc_layers_size_list,
             use_gru=config.model.use_gru,
-            blank_id=config.model.blank_id)
+            blank_id=config.model.blank_id,
+            ctc_grad_norm_type=config.model.ctc_grad_norm_type, )
         infos = Checkpoint().load_parameters(
             model, checkpoint_path=checkpoint_path)
         logger.info(f"checkpoint info: {infos}")
@@ -376,42 +380,24 @@ class DeepSpeech2ModelOnline(nn.Layer):
             DeepSpeech2ModelOnline
             The model built from config.
         """
         model = cls(
             feat_size=config.feat_size,
             dict_size=config.dict_size,
             num_conv_layers=config.num_conv_layers,
             num_rnn_layers=config.num_rnn_layers,
             rnn_size=config.rnn_layer_size,
             rnn_direction=config.rnn_direction,
             num_fc_layers=config.num_fc_layers,
             fc_layers_size_list=config.fc_layers_size_list,
             use_gru=config.use_gru,
-            blank_id=config.blank_id)
+            blank_id=config.blank_id,
+            ctc_grad_norm_type=config.ctc_grad_norm_type, )
         return model
 
 
 class DeepSpeech2InferModelOnline(DeepSpeech2ModelOnline):
-    def __init__(self,
-                 feat_size,
-                 dict_size,
-                 num_conv_layers=2,
-                 num_rnn_layers=4,
-                 rnn_size=1024,
-                 rnn_direction='forward',
-                 num_fc_layers=2,
-                 fc_layers_size_list=[512, 256],
-                 use_gru=False,
-                 blank_id=0):
-        super().__init__(
-            feat_size=feat_size,
-            dict_size=dict_size,
-            num_conv_layers=num_conv_layers,
-            num_rnn_layers=num_rnn_layers,
-            rnn_size=rnn_size,
-            rnn_direction=rnn_direction,
-            num_fc_layers=num_fc_layers,
-            fc_layers_size_list=fc_layers_size_list,
-            use_gru=use_gru,
-            blank_id=blank_id)
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
 
     def forward(self, audio_chunk, audio_chunk_lens, chunk_state_h_box,
                 chunk_state_c_box):
...
deepspeech/utils/log.py
View file @ e411e0bd

@@ -120,14 +120,15 @@ class Autolog:
                  model_precision="fp32"):
         import auto_log
         pid = os.getpid()
-        if (os.environ['CUDA_VISIBLE_DEVICES'].strip() != ''):
+        if os.environ.get('CUDA_VISIBLE_DEVICES', None):
             gpu_id = int(os.environ['CUDA_VISIBLE_DEVICES'].split(',')[0])
             infer_config = inference.Config()
             infer_config.enable_use_gpu(100, gpu_id)
         else:
             gpu_id = None
             infer_config = inference.Config()
-        autolog = auto_log.AutoLogger(
+        self.autolog = auto_log.AutoLogger(
             model_name=model_name,
             model_precision=model_precision,
             batch_size=batch_size,
@@ -139,7 +140,6 @@ class Autolog:
             gpu_ids=gpu_id,
             time_keys=['preprocess_time', 'inference_time', 'postprocess_time'],
             warmup=0)
-        self.autolog = autolog
 
     def getlog(self):
         return self.autolog
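The guard change matters because indexing os.environ raises KeyError when CUDA_VISIBLE_DEVICES is unset, whereas .get() returns None; None and the empty string are both falsy, so either case falls through to the CPU branch. In isolation:

import os

devices = os.environ.get('CUDA_VISIBLE_DEVICES', None)
# Unset and "" both leave gpu_id as None; "0,1" selects GPU 0.
gpu_id = int(devices.split(',')[0]) if devices else None
print(gpu_id)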
examples/dataset/mini_librispeech/.gitignore
View file @ e411e0bd

@@ -2,3 +2,4 @@ dev-clean/
 manifest.dev-clean
 manifest.train-clean
 train-clean/
+*.meta
examples/dataset/mini_librispeech/mini_librispeech.py
View file @ e411e0bd

@@ -58,6 +58,10 @@ def create_manifest(data_dir, manifest_path):
     """
     print("Creating manifest %s ..." % manifest_path)
     json_lines = []
+    total_sec = 0.0
+    total_text = 0.0
+    total_num = 0
+
     for subfolder, _, filelist in sorted(os.walk(data_dir)):
         text_filelist = [
             filename for filename in filelist if filename.endswith('trans.txt')
@@ -80,10 +84,27 @@ def create_manifest(data_dir, manifest_path):
                     'text': text
                 }))
+                total_sec += duration
+                total_text += len(text)
+                total_num += 1
+
     with codecs.open(manifest_path, 'w', 'utf-8') as out_file:
         for line in json_lines:
             out_file.write(line + '\n')
 
+    subset = os.path.splitext(manifest_path)[1][1:]
+    manifest_dir = os.path.dirname(manifest_path)
+    data_dir_name = os.path.split(data_dir)[-1]
+    meta_path = os.path.join(manifest_dir, data_dir_name) + '.meta'
+    with open(meta_path, 'w') as f:
+        print(f"{subset}:", file=f)
+        print(f"{total_num} utts", file=f)
+        print(f"{total_sec / (60 * 60)} h", file=f)
+        print(f"{total_text} text", file=f)
+        print(f"{total_text / total_sec} text/sec", file=f)
+        print(f"{total_sec / total_num} sec/utt", file=f)
+
 
 def prepare_dataset(url, md5sum, target_dir, manifest_path):
     """Download, unpack and create summary manifest file.
...
examples/librispeech/s1/local/align.sh
0 → 100755
View file @ e411e0bd

#!/bin/bash

if [ $# != 2 ];then
    echo "usage: ${0} config_path ckpt_path_prefix"
    exit -1
fi

ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."

config_path=$1
ckpt_prefix=$2

batch_size=1
output_dir=${ckpt_prefix}
mkdir -p ${output_dir}

# align dump in `result_file`
# .tier, .TextGrid dump in `dir of result_file`
python3 -u ${BIN_DIR}/alignment.py \
--nproc ${ngpu} \
--config ${config_path} \
--result_file ${output_dir}/${type}.align \
--checkpoint_path ${ckpt_prefix} \
--opts decoding.batch_size ${batch_size}

if [ $? -ne 0 ]; then
    echo "Failed in ctc alignment!"
    exit 1
fi

exit 0
examples/librispeech/s1/local/data.sh
View file @ e411e0bd

-#!/usr/bin/env bash
+#!/bin/bash
 
 stage=-1
 stop_stage=100
...
examples/librispeech/s1/local/download_lm_en.sh
View file @ e411e0bd

-#!/usr/bin/env bash
+#!/bin/bash
 
 . ${MAIN_ROOT}/utils/utility.sh
...
examples/librispeech/s1/local/export.sh
View file @ e411e0bd

-#!/usr/bin/env bash
+#!/bin/bash
 
 if [ $# != 3 ];then
     echo "usage: $0 config_path ckpt_prefix jit_model_path"
@@ -12,13 +12,7 @@ config_path=$1
 ckpt_path_prefix=$2
 jit_model_export_path=$3
 
-device=gpu
-if [ ngpu == 0 ];then
-    device=cpu
-fi
-
 python3 -u ${BIN_DIR}/export.py \
---device ${device} \
 --nproc ${ngpu} \
 --config ${config_path} \
 --checkpoint_path ${ckpt_path_prefix} \
...
examples/librispeech/s1/local/test.sh
View file @ e411e0bd

-#!/usr/bin/env bash
+#!/bin/bash
+
+set -e
+expdir=exp
+datadir=data
+
+nj=32
+lmtag=
 
-if [ $# != 2 ];then
-    echo "usage: ${0} config_path ckpt_path_prefix"
+recog_set="test-clean test-other dev-clean dev-other"
+recog_set="test-clean"
+
+# bpemode (unigram or bpe)
+nbpe=5000
+bpemode=unigram
+bpeprefix="data/bpe_${bpemode}_${nbpe}"
+bpemodel=${bpeprefix}.model
+
+if [ $# != 3 ];then
+    echo "usage: ${0} config_path dict_path ckpt_path_prefix"
     exit -1
 fi
 
 ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 
-device=gpu
-if [ ngpu == 0 ];then
-    device=cpu
-fi
 config_path=$1
-ckpt_prefix=$2
+dict=$2
+ckpt_prefix=$3
+
+chunk_mode=false
+if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then
+    chunk_mode=true
+fi
+echo "chunk mode ${chunk_mode}"
 
 # download language model
 #bash local/download_lm_en.sh
@@ -21,39 +42,46 @@ ckpt_prefix=$2
 #   exit 1
 #fi
 
-for type in attention ctc_greedy_search; do
-    echo "decoding ${type}"
-    batch_size=64
-    python3 -u ${BIN_DIR}/test.py \
-    --device ${device} \
-    --nproc 1 \
-    --config ${config_path} \
-    --result_file ${ckpt_prefix}.${type}.rsl \
-    --checkpoint_path ${ckpt_prefix} \
-    --opts decoding.decoding_method ${type} decoding.batch_size ${batch_size}
-    if [ $? -ne 0 ]; then
-        echo "Failed in evaluation!"
-        exit 1
-    fi
-done
+pids=() # initialize pids
 
-for type in ctc_prefix_beam_search attention_rescoring; do
-    echo "decoding ${type}"
-    batch_size=1
-    python3 -u ${BIN_DIR}/test.py \
-    --device ${device} \
-    --nproc 1 \
-    --config ${config_path} \
-    --result_file ${ckpt_prefix}.${type}.rsl \
-    --checkpoint_path ${ckpt_prefix} \
-    --opts decoding.decoding_method ${type} decoding.batch_size ${batch_size}
-    if [ $? -ne 0 ]; then
-        echo "Failed in evaluation!"
-        exit 1
-    fi
-done
+for dmethd in attention ctc_greedy_search ctc_prefix_beam_search attention_rescoring; do
+(
+    for rtask in ${recog_set}; do
+    (
+        decode_dir=decode_${rtask}_${dmethd}_$(basename ${config_path%.*})_${lmtag}
+        feat_recog_dir=${datadir}
+        mkdir -p ${expdir}/${decode_dir}
+        mkdir -p ${feat_recog_dir}
+
+        # split data
+        split_json.sh ${feat_recog_dir}/manifest.${rtask} ${nj}
+
+        #### use CPU for decoding
+        ngpu=0
+
+        # set batchsize 0 to disable batch decoding
+        batch_size=1
+
+        ${decode_cmd} JOB=1:${nj} ${expdir}/${decode_dir}/log/decode.JOB.log \
+            python3 -u ${BIN_DIR}/test.py \
+            --nproc ${ngpu} \
+            --config ${config_path} \
+            --result_file ${expdir}/${decode_dir}/data.JOB.json \
+            --checkpoint_path ${ckpt_prefix} \
+            --opts decoding.decoding_method ${dmethd} \
+            --opts decoding.batch_size ${batch_size} \
+            --opts data.test_manifest ${feat_recog_dir}/split${nj}/JOB/manifest.${rtask}
+
+        score_sclite.sh --bpe ${nbpe} --bpemodel ${bpemodel}.model --wer true ${expdir}/${decode_dir} ${dict}
+    ) &
+    pids+=($!) # store background pids
+    done
+) &
+pids+=($!) # store background pids
+done
+
+i=0; for pid in "${pids[@]}"; do wait ${pid} || ((++i)); done
+[ ${i} -gt 0 ] && echo "$0: ${i} background jobs are failed." && false
+echo "Finished"
 
 exit 0
examples/librispeech/s1/local/train.sh
View file @ e411e0bd

@@ -13,6 +13,16 @@ ckpt_name=$2
 
 mkdir -p exp
 
+# seed may break model convergence
+seed=0
+if [ ${seed} != 0 ]; then
+    #export FLAGS_cudnn_deterministic=True
+    echo "None"
+fi
+
+# export FLAGS_cudnn_exhaustive_search=true
+# export FLAGS_conv_workspace_size_limit=4000
+
 # seed may break model convergence
 seed=0
 if [ ${seed} != 0 ]; then
@@ -22,10 +32,11 @@ fi
 python3 -u ${BIN_DIR}/train.py \
 --nproc ${ngpu} \
 --config ${config_path} \
---output exp/${ckpt_name} \
---seed ${seed}
+--output exp/${ckpt_name}
+# --seed ${seed}
 
 if [ ${seed} != 0 ]; then
+    #unset FLAGS_cudnn_deterministic
     echo "None"
 fi
...
examples/tiny/s0/conf/deepspeech2.yaml
View file @ e411e0bd

@@ -4,6 +4,7 @@ data:
   dev_manifest: data/manifest.tiny
   test_manifest: data/manifest.tiny
   mean_std_filepath: data/mean_std.json
+  unit_type: char
   vocab_filepath: data/vocab.txt
   augmentation_config: conf/augmentation.json
   batch_size: 4
@@ -35,6 +36,8 @@ model:
   rnn_layer_size: 2048
   use_gru: False
   share_rnn_weights: True
+  blank_id: 0
+  ctc_grad_norm_type: instance
 
 training:
   n_epoch: 20
...
examples/tiny/s0/conf/deepspeech2_online.yaml
0 → 100644
View file @ e411e0bd

# https://yaml.org/type/float.html
data:
  train_manifest: data/manifest.tiny
  dev_manifest: data/manifest.tiny
  test_manifest: data/manifest.tiny
  min_input_len: 0.0
  max_input_len: 30.0
  min_output_len: 0.0
  max_output_len: 400.0
  min_output_input_ratio: 0.05
  max_output_input_ratio: 10.0

collator:
  mean_std_filepath: data/mean_std.json
  unit_type: char
  vocab_filepath: data/vocab.txt
  augmentation_config: conf/augmentation.json
  random_seed: 0
  spm_model_prefix:
  spectrum_type: linear
  feat_dim:
  delta_delta: False
  stride_ms: 10.0
  window_ms: 20.0
  n_fft: None
  max_freq: None
  target_sample_rate: 16000
  use_dB_normalization: True
  target_dB: -20
  dither: 1.0
  keep_transcription_text: False
  sortagrad: True
  shuffle_method: batch_shuffle
  num_workers: 0
  batch_size: 4

model:
  num_conv_layers: 2
  num_rnn_layers: 4
  rnn_layer_size: 2048
  rnn_direction: forward
  num_fc_layers: 2
  fc_layers_size_list: 512, 256
  use_gru: True
  blank_id: 0
  ctc_grad_norm_type: instance

training:
  n_epoch: 10
  accum_grad: 1
  lr: 1e-5
  lr_decay: 1.0
  weight_decay: 1e-06
  global_grad_clip: 5.0
  log_interval: 1
  checkpoint:
    kbest_n: 3
    latest_n: 2

decoding:
  batch_size: 128
  error_rate_type: wer
  decoding_method: ctc_beam_search
  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
  alpha: 2.5
  beta: 0.3
  beam_size: 500
  cutoff_prob: 1.0
  cutoff_top_n: 40
  num_proc_bsearch: 8
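For reference, the new file is plain YAML split into data, collator, model, training, and decoding sections. A hedged sketch of inspecting it standalone with PyYAML (the project loads configs through its own wrapper class; this is just a quick check, and the relative path assumes the example directory):

import yaml

with open("conf/deepspeech2_online.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["model"]["ctc_grad_norm_type"])  # instance
print(cfg["collator"]["batch_size"])       # 4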
examples/tiny/s0/local/download_lm_en.sh
View file @ e411e0bd

-#!/usr/bin/env bash
+#!/bin/bash
 
 . ${MAIN_ROOT}/utils/utility.sh
@@ -9,6 +9,11 @@ URL=https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm
 MD5="099a601759d467cd0a8523ff939819c5"
 TARGET=${DIR}/common_crawl_00.prune01111.trie.klm
 
+if [ -e $TARGET ];then
+    echo "$TARGET exists."
+    exit 0
+fi
+
 echo "Download language model ..."
 download $URL $MD5 $TARGET
 if [ $? -ne 0 ]; then
...
examples/tiny/s0/local/export.sh
View file @ e411e0bd

-#!/usr/bin/env bash
+#!/bin/bash
 
-if [ $# != 3 ];then
-    echo "usage: $0 config_path ckpt_prefix jit_model_path"
+if [ $# != 4 ];then
+    echo "usage: $0 config_path ckpt_prefix jit_model_path model_type"
     exit -1
 fi
@@ -11,19 +11,14 @@ echo "using $ngpu gpus..."
 config_path=$1
 ckpt_path_prefix=$2
 jit_model_export_path=$3
+model_type=$4
 
-device=gpu
-if [ ngpu == 0 ];then
-    device=cpu
-fi
-
 python3 -u ${BIN_DIR}/export.py \
---device ${device} \
 --nproc ${ngpu} \
 --config ${config_path} \
 --checkpoint_path ${ckpt_path_prefix} \
---export_path ${jit_model_export_path}
+--export_path ${jit_model_export_path} \
+--model_type ${model_type}
 
 if [ $? -ne 0 ]; then
     echo "Failed in export!"
...
examples/tiny/s0/local/test.sh
View file @ e411e0bd

-#!/usr/bin/env bash
+#!/bin/bash
 
-if [ $# != 2 ];then
-    echo "usage: ${0} config_path ckpt_path_prefix"
+if [ $# != 3 ];then
+    echo "usage: ${0} config_path ckpt_path_prefix model_type"
     exit -1
 fi
 
 ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 
-device=gpu
-if [ ngpu == 0 ];then
-    device=cpu
-fi
 config_path=$1
 ckpt_prefix=$2
+model_type=$3
 
 # download language model
 bash local/download_lm_en.sh
@@ -22,11 +19,11 @@ if [ $? -ne 0 ]; then
 fi
 
 python3 -u ${BIN_DIR}/test.py \
---device ${device} \
---nproc 1 \
+--nproc ${ngpu} \
 --config ${config_path} \
 --result_file ${ckpt_prefix}.rsl \
---checkpoint_path ${ckpt_prefix}
+--checkpoint_path ${ckpt_prefix} \
+--model_type ${model_type}
 
 if [ $? -ne 0 ]; then
     echo "Failed in evaluation!"
...
examples/tiny/s0/local/train.sh
View file @ e411e0bd

-#!/usr/bin/env bash
+#!/bin/bash
 
-if [ $# != 2 ];then
-    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
-    exit -1
-fi
+profiler_options=
+
+# seed may break model convergence
+seed=0
+
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
 
 ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 
-config_path=$1
-ckpt_name=$2
+if [ ${seed} != 0 ]; then
+    export FLAGS_cudnn_deterministic=True
+    echo "using seed $seed & FLAGS_cudnn_deterministic=True ..."
+fi
 
-device=gpu
-if [ ngpu == 0 ];then
-    device=cpu
+if [ $# != 3 ];then
+    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name model_type"
+    exit -1
 fi
 
+config_path=$1
+ckpt_name=$2
+model_type=$3
+
 mkdir -p exp
 
 python3 -u ${BIN_DIR}/train.py \
---device ${device} \
 --nproc ${ngpu} \
 --config ${config_path} \
---output exp/${ckpt_name}
+--output exp/${ckpt_name} \
+--model_type ${model_type} \
+--profiler-options "${profiler_options}" \
+--seed ${seed}
+
+if [ ${seed} != 0 ]; then
+    unset FLAGS_cudnn_deterministic
+fi
 
 if [ $? -ne 0 ]; then
     echo "Failed in training!"
...
examples/tiny/s0/path.sh
View file @ e411e0bd

-export MAIN_ROOT=${PWD}/../../../
+export MAIN_ROOT=`realpath ${PWD}/../../../`
 
 export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
 export LC_ALL=C
...
examples/tiny/s0/run.sh
View file @ e411e0bd

@@ -7,11 +7,12 @@ stage=0
 stop_stage=100
 conf_path=conf/deepspeech2.yaml
 avg_num=1
+model_type=offline
 
 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
 
 avg_ckpt=avg_${avg_num}
 ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
+###ckpt = deepspeech2
 echo "checkpoint name ${ckpt}"
 
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
@@ -21,20 +22,20 @@ fi
 
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # train model, all `ckpt` under `exp` dir
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${model_type}
 fi
 
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     # avg n best model
-    ./local/avg.sh exp/${ckpt}/checkpoints ${avg_num}
+    avg.sh best exp/${ckpt}/checkpoints ${avg_num}
 fi
 
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     # test ckpt avg_n
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} || exit -1
 fi
 
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
     # export ckpt avg_n
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type}
 fi
examples/tiny/s1/conf/transformer.yaml
View file @ e411e0bd

@@ -65,6 +65,8 @@ model:
     # hybrid CTC/attention
     model_conf:
       ctc_weight: 0.3
+      ctc_dropoutrate: 0.0
+      ctc_grad_norm_type: instance
       lsm_weight: 0.1     # label smoothing option
       length_normalized_loss: false
...
0 → 100755
浏览文件 @
e411e0bd
#!/bin/bash
if
[
$#
!=
2
]
;
then
echo
"usage:
${
0
}
config_path ckpt_path_prefix"
exit
-1
fi
ngpu
=
$(
echo
$CUDA_VISIBLE_DEVICES
|
awk
-F
","
'{print NF}'
)
echo
"using
$ngpu
gpus..."
config_path
=
$1
ckpt_prefix
=
$2
batch_size
=
1
output_dir
=
${
ckpt_prefix
}
mkdir
-p
${
output_dir
}
# align dump in `result_file`
# .tier, .TextGrid dump in `dir of result_file`
python3
-u
${
BIN_DIR
}
/alignment.py
\
--nproc
${
ngpu
}
\
--config
${
config_path
}
\
--result_file
${
output_dir
}
/
${
type
}
.align
\
--checkpoint_path
${
ckpt_prefix
}
\
--opts
decoding.batch_size
${
batch_size
}
if
[
$?
-ne
0
]
;
then
echo
"Failed in ctc alignment!"
exit
1
fi
exit
0
examples/tiny/s1/local/data.sh
View file @ e411e0bd

-#!/usr/bin/env bash
+#!/bin/bash
 
 stage=-1
 stop_stage=100
...
examples/tiny/s1/local/export.sh
View file @ e411e0bd

-#!/usr/bin/env bash
+#!/bin/bash
 
 if [ $# != 3 ];then
     echo "usage: $0 config_path ckpt_prefix jit_model_path"
@@ -12,13 +12,7 @@ config_path=$1
 ckpt_path_prefix=$2
 jit_model_export_path=$3
 
-device=gpu
-if [ ngpu == 0 ];then
-    device=cpu
-fi
-
 python3 -u ${BIN_DIR}/export.py \
---device ${device} \
 --nproc ${ngpu} \
 --config ${config_path} \
 --checkpoint_path ${ckpt_path_prefix} \
...
examples/tiny/s1/local/test.sh
View file @ e411e0bd

-#!/usr/bin/env bash
+#!/bin/bash
 
 if [ $# != 2 ];then
     echo "usage: ${0} config_path ckpt_path_prefix"
@@ -8,30 +8,57 @@ fi
 ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 
-device=gpu
-if [ ngpu == 0 ];then
-    device=cpu
-fi
 config_path=$1
 ckpt_prefix=$2
 
+chunk_mode=false
+if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then
+    chunk_mode=true
+fi
+
 # download language model
 #bash local/download_lm_en.sh
 #if [ $? -ne 0 ]; then
 #   exit 1
 #fi
 
-python3 -u ${BIN_DIR}/test.py \
---device ${device} \
---nproc 1 \
---config ${config_path} \
---result_file ${ckpt_prefix}.rsl \
---checkpoint_path ${ckpt_prefix}
-if [ $? -ne 0 ]; then
-    echo "Failed in evaluation!"
-    exit 1
-fi
+for type in attention ctc_greedy_search; do
+    echo "decoding ${type}"
+    if [ ${chunk_mode} == true ];then
+        # stream decoding only support batchsize=1
+        batch_size=1
+    else
+        batch_size=64
+    fi
+    python3 -u ${BIN_DIR}/test.py \
+    --nproc ${ngpu} \
+    --config ${config_path} \
+    --result_file ${ckpt_prefix}.${type}.rsl \
+    --checkpoint_path ${ckpt_prefix} \
+    --opts decoding.decoding_method ${type} \
+    --opts decoding.batch_size ${batch_size}
+    if [ $? -ne 0 ]; then
+        echo "Failed in evaluation!"
+        exit 1
+    fi
+done
+
+for type in ctc_prefix_beam_search attention_rescoring; do
+    echo "decoding ${type}"
+    batch_size=1
+    python3 -u ${BIN_DIR}/test.py \
+    --nproc ${ngpu} \
+    --config ${config_path} \
+    --result_file ${ckpt_prefix}.${type}.rsl \
+    --checkpoint_path ${ckpt_prefix} \
+    --opts decoding.decoding_method ${type} \
+    --opts decoding.batch_size ${batch_size}
+    if [ $? -ne 0 ]; then
+        echo "Failed in evaluation!"
+        exit 1
+    fi
+done
 
 exit 0
examples/tiny/s1/local/train.sh
View file @ e411e0bd

-#! /usr/bin/env bash
+#!/bin/bash
+
+profiler_options=
+benchmark_batch_size=0
+benchmark_max_step=0
+
+# seed may break model convergence
+seed=0
+
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
+
+ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+echo "using $ngpu gpus..."
+
+if [ ${seed} != 0 ]; then
+    export FLAGS_cudnn_deterministic=True
+    echo "using seed $seed & FLAGS_cudnn_deterministic=True ..."
+fi
 
 if [ $# != 2 ];then
     echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
     exit -1
 fi
 
-ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
-echo "using $ngpu gpus..."
-
 config_path=$1
 ckpt_name=$2
 
-device=gpu
-if [ ngpu == 0 ];then
-    device=cpu
-fi
-
 mkdir -p exp
 
 python3 -u ${BIN_DIR}/train.py \
---device ${device} \
+--seed ${seed} \
 --nproc ${ngpu} \
 --config ${config_path} \
---output exp/${ckpt_name}
+--output exp/${ckpt_name} \
+--profiler-options "${profiler_options}" \
+--benchmark-batch-size ${benchmark_batch_size} \
+--benchmark-max-step ${benchmark_max_step}
+
+if [ ${seed} != 0 ]; then
+    unset FLAGS_cudnn_deterministic
+fi
 
 if [ $? -ne 0 ]; then
     echo "Failed in training!"
...
examples/tiny/s1/run.sh
View file @ e411e0bd

@@ -20,20 +20,26 @@ fi
 
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # train model, all `ckpt` under `exp` dir
-    CUDA_VISIBLE_DEVICES=4,5,6,7 ./local/train.sh ${conf_path} ${ckpt}
+    ./local/train.sh ${conf_path} ${ckpt}
 fi
 
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     # avg n best model
-    ./local/avg.sh exp/${ckpt}/checkpoints ${avg_num}
+    avg.sh best exp/${ckpt}/checkpoints ${avg_num}
 fi
 
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     # test ckpt avg_n
-    CUDA_VISIBLE_DEVICES=7 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+    CUDA_VISIBLE_DEVICES= ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi
 
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    # ctc alignment of test data
+    CUDA_VISIBLE_DEVICES= ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+fi
+
+if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
     # export ckpt avg_n
     CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
 fi