Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
7ec623f7
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
1 年多 前同步成功
通知
207
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
7ec623f7
编写于
6月 29, 2021
作者:
H
Hui Zhang
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'develop' into align
上级
90788b11
5b851f1e
变更
25
隐藏空白更改
内联
并排
Showing
25 changed file
with
398 addition
and
112 deletion
+398
-112
deepspeech/exps/deepspeech2/bin/deploy/runtime.py
deepspeech/exps/deepspeech2/bin/deploy/runtime.py
+16
-9
deepspeech/exps/deepspeech2/bin/deploy/server.py
deepspeech/exps/deepspeech2/bin/deploy/server.py
+20
-9
deepspeech/exps/deepspeech2/bin/tune.py
deepspeech/exps/deepspeech2/bin/tune.py
+1
-1
deepspeech/exps/deepspeech2/model.py
deepspeech/exps/deepspeech2/model.py
+1
-1
deepspeech/exps/u2/model.py
deepspeech/exps/u2/model.py
+4
-5
deepspeech/io/collator.py
deepspeech/io/collator.py
+5
-4
deepspeech/models/deepspeech2.py
deepspeech/models/deepspeech2.py
+4
-4
deepspeech/models/u2.py
deepspeech/models/u2.py
+4
-4
deepspeech/utils/socket_server.py
deepspeech/utils/socket_server.py
+2
-2
examples/aishell/s0/README.md
examples/aishell/s0/README.md
+7
-7
examples/aishell/s0/conf/deepspeech2.yaml
examples/aishell/s0/conf/deepspeech2.yaml
+1
-2
examples/aishell/s0/run.sh
examples/aishell/s0/run.sh
+2
-2
examples/aishell/s1/README.md
examples/aishell/s1/README.md
+16
-15
examples/dataset/aishell/aishell.py
examples/dataset/aishell/aishell.py
+3
-1
examples/dataset/thchs30/.gitignore
examples/dataset/thchs30/.gitignore
+5
-0
examples/dataset/thchs30/thchs30.py
examples/dataset/thchs30/thchs30.py
+169
-0
examples/librispeech/s0/README.md
examples/librispeech/s0/README.md
+5
-5
examples/librispeech/s0/conf/deepspeech2.yaml
examples/librispeech/s0/conf/deepspeech2.yaml
+9
-4
examples/librispeech/s1/README.md
examples/librispeech/s1/README.md
+11
-11
examples/librispeech/s1/conf/chunk_confermer.yaml
examples/librispeech/s1/conf/chunk_confermer.yaml
+9
-7
examples/librispeech/s1/conf/chunk_transformer.yaml
examples/librispeech/s1/conf/chunk_transformer.yaml
+9
-7
examples/librispeech/s1/conf/conformer.yaml
examples/librispeech/s1/conf/conformer.yaml
+8
-6
examples/librispeech/s1/conf/transformer.yaml
examples/librispeech/s1/conf/transformer.yaml
+8
-6
speechnn/CMakeLists.txt
speechnn/CMakeLists.txt
+77
-0
speechnn/core/decoder/CMakeLists.txt
speechnn/core/decoder/CMakeLists.txt
+2
-0
未找到文件。
deepspeech/exps/deepspeech2/bin/deploy/runtime.py
浏览文件 @
7ec623f7
...
...
@@ -18,8 +18,10 @@ import numpy as np
import
paddle
from
paddle.inference
import
Config
from
paddle.inference
import
create_predictor
from
paddle.io
import
DataLoader
from
deepspeech.exps.deepspeech2.config
import
get_cfg_defaults
from
deepspeech.io.collator
import
SpeechCollator
from
deepspeech.io.dataset
import
ManifestDataset
from
deepspeech.models.deepspeech2
import
DeepSpeech2Model
from
deepspeech.training.cli
import
default_argument_parser
...
...
@@ -78,26 +80,31 @@ def inference(config, args):
def
start_server
(
config
,
args
):
"""Start the ASR server"""
config
.
defrost
()
config
.
data
.
manfiest
=
config
.
data
.
test_manifest
config
.
data
.
augmentation_config
=
""
config
.
data
.
keep_transcription_text
=
True
config
.
data
.
manifest
=
config
.
data
.
test_manifest
dataset
=
ManifestDataset
.
from_config
(
config
)
model
=
DeepSpeech2Model
.
from_pretrained
(
dataset
,
config
,
config
.
collator
.
augmentation_config
=
""
config
.
collator
.
keep_transcription_text
=
True
config
.
collator
.
batch_size
=
1
config
.
collator
.
num_workers
=
0
collate_fn
=
SpeechCollator
.
from_config
(
config
)
test_loader
=
DataLoader
(
dataset
,
collate_fn
=
collate_fn
,
num_workers
=
0
)
model
=
DeepSpeech2Model
.
from_pretrained
(
test_loader
,
config
,
args
.
checkpoint_path
)
model
.
eval
()
# prepare ASR inference handler
def
file_to_transcript
(
filename
):
feature
=
dataset
.
process_utterance
(
filename
,
""
)
audio
=
np
.
array
([
feature
[
0
]]).
astype
(
'float32'
)
#[1,
D, T
]
audio_len
=
feature
[
0
].
shape
[
1
]
feature
=
test_loader
.
collate_fn
.
process_utterance
(
filename
,
""
)
audio
=
np
.
array
([
feature
[
0
]]).
astype
(
'float32'
)
#[1,
T, D
]
audio_len
=
feature
[
0
].
shape
[
0
]
audio_len
=
np
.
array
([
audio_len
]).
astype
(
'int64'
)
# [1]
result_transcript
=
model
.
decode
(
paddle
.
to_tensor
(
audio
),
paddle
.
to_tensor
(
audio_len
),
vocab_list
=
dataset
.
vocab_list
,
vocab_list
=
test_loader
.
collate_fn
.
vocab_list
,
decoding_method
=
config
.
decoding
.
decoding_method
,
lang_model_path
=
config
.
decoding
.
lang_model_path
,
beam_alpha
=
config
.
decoding
.
alpha
,
...
...
@@ -138,7 +145,7 @@ if __name__ == "__main__":
add_arg
(
'host_ip'
,
str
,
'localhost'
,
"Server's IP address."
)
add_arg
(
'host_port'
,
int
,
808
6
,
"Server's IP port."
)
add_arg
(
'host_port'
,
int
,
808
9
,
"Server's IP port."
)
add_arg
(
'speech_save_dir'
,
str
,
'demo_cache'
,
"Directory to save demo audios."
)
...
...
deepspeech/exps/deepspeech2/bin/deploy/server.py
浏览文件 @
7ec623f7
...
...
@@ -16,8 +16,10 @@ import functools
import
numpy
as
np
import
paddle
from
paddle.io
import
DataLoader
from
deepspeech.exps.deepspeech2.config
import
get_cfg_defaults
from
deepspeech.io.collator
import
SpeechCollator
from
deepspeech.io.dataset
import
ManifestDataset
from
deepspeech.models.deepspeech2
import
DeepSpeech2Model
from
deepspeech.training.cli
import
default_argument_parser
...
...
@@ -31,26 +33,35 @@ from deepspeech.utils.utility import print_arguments
def
start_server
(
config
,
args
):
"""Start the ASR server"""
config
.
defrost
()
config
.
data
.
manfiest
=
config
.
data
.
test_manifest
config
.
data
.
augmentation_config
=
""
config
.
data
.
keep_transcription_text
=
True
config
.
data
.
manifest
=
config
.
data
.
test_manifest
dataset
=
ManifestDataset
.
from_config
(
config
)
model
=
DeepSpeech2Model
.
from_pretrained
(
dataset
,
config
,
config
.
collator
.
augmentation_config
=
""
config
.
collator
.
keep_transcription_text
=
True
config
.
collator
.
batch_size
=
1
config
.
collator
.
num_workers
=
0
collate_fn
=
SpeechCollator
.
from_config
(
config
)
test_loader
=
DataLoader
(
dataset
,
collate_fn
=
collate_fn
,
num_workers
=
0
)
model
=
DeepSpeech2Model
.
from_pretrained
(
test_loader
,
config
,
args
.
checkpoint_path
)
model
.
eval
()
# prepare ASR inference handler
def
file_to_transcript
(
filename
):
feature
=
dataset
.
process_utterance
(
filename
,
""
)
audio
=
np
.
array
([
feature
[
0
]]).
astype
(
'float32'
)
#[1, D, T]
audio_len
=
feature
[
0
].
shape
[
1
]
feature
=
test_loader
.
collate_fn
.
process_utterance
(
filename
,
""
)
audio
=
np
.
array
([
feature
[
0
]]).
astype
(
'float32'
)
#[1, T, D]
# audio = audio.swapaxes(1,2)
print
(
'---file_to_transcript feature----'
)
print
(
audio
.
shape
)
audio_len
=
feature
[
0
].
shape
[
0
]
print
(
audio_len
)
audio_len
=
np
.
array
([
audio_len
]).
astype
(
'int64'
)
# [1]
result_transcript
=
model
.
decode
(
paddle
.
to_tensor
(
audio
),
paddle
.
to_tensor
(
audio_len
),
vocab_list
=
dataset
.
vocab_list
,
vocab_list
=
test_loader
.
collate_fn
.
vocab_list
,
decoding_method
=
config
.
decoding
.
decoding_method
,
lang_model_path
=
config
.
decoding
.
lang_model_path
,
beam_alpha
=
config
.
decoding
.
alpha
,
...
...
@@ -91,7 +102,7 @@ if __name__ == "__main__":
add_arg
(
'host_ip'
,
str
,
'localhost'
,
"Server's IP address."
)
add_arg
(
'host_port'
,
int
,
808
6
,
"Server's IP port."
)
add_arg
(
'host_port'
,
int
,
808
8
,
"Server's IP port."
)
add_arg
(
'speech_save_dir'
,
str
,
'demo_cache'
,
"Directory to save demo audios."
)
...
...
deepspeech/exps/deepspeech2/bin/tune.py
浏览文件 @
7ec623f7
...
...
@@ -47,7 +47,7 @@ def tune(config, args):
drop_last
=
False
,
collate_fn
=
SpeechCollator
(
keep_transcription_text
=
True
))
model
=
DeepSpeech2Model
.
from_pretrained
(
dev_dataset
,
config
,
model
=
DeepSpeech2Model
.
from_pretrained
(
valid_loader
,
config
,
args
.
checkpoint_path
)
model
.
eval
()
...
...
deepspeech/exps/deepspeech2/model.py
浏览文件 @
7ec623f7
...
...
@@ -318,7 +318,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
def
export
(
self
):
infer_model
=
DeepSpeech2InferModel
.
from_pretrained
(
self
.
test_loader
.
dataset
,
self
.
config
,
self
.
args
.
checkpoint_path
)
self
.
test_loader
,
self
.
config
,
self
.
args
.
checkpoint_path
)
infer_model
.
eval
()
feat_dim
=
self
.
test_loader
.
collate_fn
.
feature_size
static_model
=
paddle
.
jit
.
to_static
(
...
...
deepspeech/exps/u2/model.py
浏览文件 @
7ec623f7
...
...
@@ -574,15 +574,14 @@ class U2Tester(U2Trainer):
List[paddle.static.InputSpec]: input spec.
"""
from
deepspeech.models.u2
import
U2InferModel
infer_model
=
U2InferModel
.
from_pretrained
(
self
.
test_loader
.
dataset
,
infer_model
=
U2InferModel
.
from_pretrained
(
self
.
test_loader
,
self
.
config
.
model
.
clone
(),
self
.
args
.
checkpoint_path
)
feat_dim
=
self
.
test_loader
.
collate_fn
.
feature_size
input_spec
=
[
paddle
.
static
.
InputSpec
(
shape
=
[
None
,
feat_dim
,
None
],
dtype
=
'float32'
),
# audio, [B,D,T]
paddle
.
static
.
InputSpec
(
shape
=
[
None
],
paddle
.
static
.
InputSpec
(
shape
=
[
1
,
None
,
feat_dim
],
dtype
=
'float32'
),
# audio, [B,T,D]
paddle
.
static
.
InputSpec
(
shape
=
[
1
],
dtype
=
'int64'
),
# audio_length, [B]
]
return
infer_model
,
input_spec
...
...
deepspeech/io/collator.py
浏览文件 @
7ec623f7
...
...
@@ -154,8 +154,8 @@ class SpeechCollator():
random_seed (int, optional): for random generator. Defaults to 0.
keep_transcription_text (bool, optional): True, when not in training mode, will not do tokenizer; Defaults to False.
if ``keep_transcription_text`` is False, text is token ids else is raw string.
Do augmentations
Do augmentations
Padding audio features with zeros to make them have the same shape (or
a user-defined shape) within one batch.
"""
...
...
@@ -242,6 +242,7 @@ class SpeechCollator():
# specgram augment
specgram
=
self
.
_augmentation_pipeline
.
transform_feature
(
specgram
)
specgram
=
specgram
.
transpose
([
1
,
0
])
return
specgram
,
transcript_part
def
__call__
(
self
,
batch
):
...
...
@@ -269,8 +270,8 @@ class SpeechCollator():
#utt
utts
.
append
(
utt
)
# audio
audios
.
append
(
audio
.
T
)
# [T, D]
audio_lens
.
append
(
audio
.
shape
[
1
])
audios
.
append
(
audio
)
# [T, D]
audio_lens
.
append
(
audio
.
shape
[
0
])
# text
# for training, text is token ids
# else text is string, convert to unicode ord
...
...
deepspeech/models/deepspeech2.py
浏览文件 @
7ec623f7
...
...
@@ -198,11 +198,11 @@ class DeepSpeech2Model(nn.Layer):
cutoff_top_n
,
num_processes
)
@
classmethod
def
from_pretrained
(
cls
,
data
set
,
config
,
checkpoint_path
):
def
from_pretrained
(
cls
,
data
loader
,
config
,
checkpoint_path
):
"""Build a DeepSpeech2Model model from a pretrained model.
Parameters
----------
data
set: paddle.io.Dataset
data
loader: paddle.io.DataLoader
config: yacs.config.CfgNode
model configs
...
...
@@ -215,8 +215,8 @@ class DeepSpeech2Model(nn.Layer):
DeepSpeech2Model
The model built from pretrained result.
"""
model
=
cls
(
feat_size
=
data
set
.
feature_size
,
dict_size
=
data
set
.
vocab_size
,
model
=
cls
(
feat_size
=
data
loader
.
collate_fn
.
feature_size
,
dict_size
=
data
loader
.
collate_fn
.
vocab_size
,
num_conv_layers
=
config
.
model
.
num_conv_layers
,
num_rnn_layers
=
config
.
model
.
num_rnn_layers
,
rnn_size
=
config
.
model
.
rnn_layer_size
,
...
...
deepspeech/models/u2.py
浏览文件 @
7ec623f7
...
...
@@ -876,11 +876,11 @@ class U2Model(U2BaseModel):
return
model
@
classmethod
def
from_pretrained
(
cls
,
data
set
,
config
,
checkpoint_path
):
def
from_pretrained
(
cls
,
data
loader
,
config
,
checkpoint_path
):
"""Build a DeepSpeech2Model model from a pretrained model.
Args:
data
set (paddle.io.Dataset
): not used.
data
loader (paddle.io.DataLoader
): not used.
config (yacs.config.CfgNode): model configs
checkpoint_path (Path or str): the path of pretrained model checkpoint, without extension name
...
...
@@ -888,8 +888,8 @@ class U2Model(U2BaseModel):
DeepSpeech2Model: The model built from pretrained result.
"""
config
.
defrost
()
config
.
input_dim
=
data
set
.
feature_size
config
.
output_dim
=
data
set
.
vocab_size
config
.
input_dim
=
data
loader
.
collate_fn
.
feature_size
config
.
output_dim
=
data
loader
.
collate_fn
.
vocab_size
config
.
freeze
()
model
=
cls
.
from_config
(
config
)
...
...
deepspeech/utils/socket_server.py
浏览文件 @
7ec623f7
...
...
@@ -48,9 +48,9 @@ def warm_up_test(audio_process_handler,
rng
=
random
.
Random
(
random_seed
)
samples
=
rng
.
sample
(
manifest
,
num_test_cases
)
for
idx
,
sample
in
enumerate
(
samples
):
print
(
"Warm-up Test Case %d: %s"
,
idx
,
sample
[
'audio_filepath'
]
)
print
(
"Warm-up Test Case %d: %s"
%
(
idx
,
sample
[
'feat'
])
)
start_time
=
time
.
time
()
transcript
=
audio_process_handler
(
sample
[
'
audio_filepath
'
])
transcript
=
audio_process_handler
(
sample
[
'
feat
'
])
finish_time
=
time
.
time
()
print
(
"Response Time: %f, Transcript: %s"
%
(
finish_time
-
start_time
,
transcript
))
...
...
examples/aishell/s0/README.md
浏览文件 @
7ec623f7
...
...
@@ -2,10 +2,10 @@
## Deepspeech2
| Model |
r
elease | Config | Test set | Loss | CER |
| --- | --- | --- | --- | --- | --- |
| DeepSpeech2
58.4M | 2.1.0 | conf/deepspeech2.yaml + spec aug + new datapipe | test | 6.396368026733398 | 0.068382
|
| DeepSpeech2 58.4M | 2.1.0 | conf/deepspeech2.yaml + spec aug | test | 7.483316898345947 | 0.077860 |
| DeepSpeech2 58.4M | 2.1.0 | conf/deepspeech2.yaml | test | 7.299022197723389 | 0.078671 |
| DeepSpeech2 58.4M | 2.0.0 | conf/deepspeech2.yaml | test | - | 0.078977 |
| DeepSpeech2 58.4M | 1.8.5 | - | test | - | 0.080447 |
| Model |
Params | R
elease | Config | Test set | Loss | CER |
| --- | --- | --- | --- | --- | --- |
--- |
| DeepSpeech2
| 58.4M | 2.2.0 | conf/deepspeech2.yaml + spec aug + new datapipe | test | 6.396368026733398 | 0.068382,0.073507
|
| DeepSpeech2
|
58.4M | 2.1.0 | conf/deepspeech2.yaml + spec aug | test | 7.483316898345947 | 0.077860 |
| DeepSpeech2
|
58.4M | 2.1.0 | conf/deepspeech2.yaml | test | 7.299022197723389 | 0.078671 |
| DeepSpeech2
|
58.4M | 2.0.0 | conf/deepspeech2.yaml | test | - | 0.078977 |
| DeepSpeech2
|
58.4M | 1.8.5 | - | test | - | 0.080447 |
examples/aishell/s0/conf/deepspeech2.yaml
浏览文件 @
7ec623f7
...
...
@@ -10,8 +10,8 @@ data:
min_output_input_ratio
:
0.00
max_output_input_ratio
:
.inf
collator
:
batch_size
:
64
# one gpu
mean_std_filepath
:
data/mean_std.json
unit_type
:
char
vocab_filepath
:
data/vocab.txt
...
...
@@ -33,7 +33,6 @@ collator:
sortagrad
:
True
shuffle_method
:
batch_shuffle
num_workers
:
0
batch_size
:
64
# one gpu
model
:
num_conv_layers
:
2
...
...
examples/aishell/s0/run.sh
浏览文件 @
7ec623f7
...
...
@@ -31,10 +31,10 @@ fi
if
[
${
stage
}
-le
3
]
&&
[
${
stop_stage
}
-ge
3
]
;
then
# test ckpt avg_n
CUDA_VISIBLE_DEVICES
=
${
gpus
}
./local/test.sh
${
conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
||
exit
-1
CUDA_VISIBLE_DEVICES
=
0
./local/test.sh
${
conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
||
exit
-1
fi
if
[
${
stage
}
-le
4
]
&&
[
${
stop_stage
}
-ge
4
]
;
then
# export ckpt avg_n
CUDA_VISIBLE_DEVICES
=
${
gpus
}
./local/export.sh
${
conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
.jit
CUDA_VISIBLE_DEVICES
=
0
./local/export.sh
${
conf_path
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
exp/
${
ckpt
}
/checkpoints/
${
avg_ckpt
}
.jit
fi
examples/aishell/s1/README.md
浏览文件 @
7ec623f7
...
...
@@ -2,25 +2,26 @@
## Conformer
| Model | Config | Augmentation| Test set | Decode method | Loss | WER |
| --- | --- | --- | --- | --- | --- | --- |
| conformer | conf/conformer.yaml | spec_aug + shift | test | attention | - | 0.059858 |
| conformer | conf/conformer.yaml | spec_aug + shift | test | ctc_greedy_search | - | 0.062311 |
| conformer | conf/conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | - | 0.062196 |
| conformer | conf/conformer.yaml | spec_aug + shift | test | attention_rescoring | - | 0.054694 |
| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
| --- | --- | --- | --- | --- | --- | --- | --- |
| conformer | 47.07M | conf/conformer.yaml | spec_aug + shift | test | attention | - | 0.059858 |
| conformer | 47.07M | conf/conformer.yaml | spec_aug + shift | test | ctc_greedy_search | - | 0.062311 |
| conformer | 47.07M | conf/conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | - | 0.062196 |
| conformer | 47.07M | conf/conformer.yaml | spec_aug + shift | test | attention_rescoring | - | 0.054694 |
## Chunk Conformer
| Model | Config | Augmentation| Test set | Decode method | Chunk | Loss | WER |
| --- | --- | --- | --- | --- | --- | --- | --- |
| conformer | conf/chunk_conformer.yaml | spec_aug + shift | test | attention | 16 | - | 0.061939 |
| conformer | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_greedy_search | 16 | - | 0.070806 |
| conformer | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | 16 | - | 0.070739 |
| conformer | conf/chunk_conformer.yaml | spec_aug + shift | test | attention_rescoring | 16 | - | 0.059400 |
| Model |
Params |
Config | Augmentation| Test set | Decode method | Chunk | Loss | WER |
| --- | --- | --- | --- | --- | --- | --- | --- |
--- |
| conformer |
47.06M |
conf/chunk_conformer.yaml | spec_aug + shift | test | attention | 16 | - | 0.061939 |
| conformer |
47.06M |
conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_greedy_search | 16 | - | 0.070806 |
| conformer |
47.06M |
conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | 16 | - | 0.070739 |
| conformer |
47.06M |
conf/chunk_conformer.yaml | spec_aug + shift | test | attention_rescoring | 16 | - | 0.059400 |
## Transformer
| Model | Config | Augmentation| Test set | Decode method | Loss | WER |
| --- | --- | --- | --- | --- | --- | ---|
| transformer | conf/transformer.yaml | spec_aug + shift | test | attention | - | - |
| Model |
Params |
Config | Augmentation| Test set | Decode method | Loss | WER |
| --- | --- | --- | --- | --- | --- | ---
| ---
|
| transformer |
- |
conf/transformer.yaml | spec_aug + shift | test | attention | - | - |
examples/dataset/aishell/aishell.py
浏览文件 @
7ec623f7
...
...
@@ -60,7 +60,7 @@ def create_manifest(data_dir, manifest_path_prefix):
if
line
==
''
:
continue
audio_id
,
text
=
line
.
split
(
' '
,
1
)
# remove withespace
# remove withespace
, charactor text
text
=
''
.
join
(
text
.
split
())
transcript_dict
[
audio_id
]
=
text
...
...
@@ -123,6 +123,8 @@ def main():
target_dir
=
args
.
target_dir
,
manifest_path
=
args
.
manifest_prefix
)
print
(
"Data download and manifest prepare done!"
)
if
__name__
==
'__main__'
:
main
()
examples/dataset/thchs30/.gitignore
0 → 100644
浏览文件 @
7ec623f7
*.tgz
manifest.*
data_thchs30
resource
test-noise
examples/dataset/thchs30/thchs30.py
0 → 100644
浏览文件 @
7ec623f7
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Prepare THCHS-30 mandarin dataset
Download, unpack and create manifest files.
Manifest file is a json-format file with each line containing the
meta data (i.e. audio filepath, transcript and audio duration)
of each audio file in the data set.
"""
import
argparse
import
codecs
import
json
import
os
from
multiprocessing.pool
import
Pool
from
pathlib
import
Path
import
soundfile
from
utils.utility
import
download
from
utils.utility
import
unpack
# Default cache root under the user's home directory; the --target_dir
# default is derived from it.
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

# Base URL of the THCHS-30 resources (OpenSLR resource 18).
URL_ROOT = 'http://www.openslr.org/resources/18'
# URL_ROOT = 'https://openslr.magicdatatech.com/resources/18'

# Each archive URL is listed next to the MD5 checksum passed to `download`.
DATA_URL = URL_ROOT + '/data_thchs30.tgz'
MD5_DATA = '2d2252bde5c8429929e1841d4cb95e90'

TEST_NOISE_URL = URL_ROOT + '/test-noise.tgz'
MD5_TEST_NOISE = '7e8a985fb965b84141b68c68556c2030'

RESOURCE_URL = URL_ROOT + '/resource.tgz'
MD5_RESOURCE = 'c0b2a565b4970a0c4fe89fefbf2d97e1'
# Command-line interface. NOTE: the arguments are parsed at import time, so
# `args` is a module-level global that `main` reads.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--target_dir",
    type=str,
    default=DATA_HOME + "/THCHS30",
    help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
    "--manifest_prefix",
    type=str,
    default="manifest",
    help="Filepath prefix for output manifests. (default: %(default)s)")
args = parser.parse_args()
def read_trn(filepath):
    """Read a ``.trn`` transcript file.

    Only the first three lines of the file are used:
    word text in first line.
    syllable text in second line.
    phoneme text in third line.

    Args:
        filepath (str): trn path.

    Returns:
        list(str): (word, syllable, phone). All whitespace is removed from
            the word text; the syllable and phoneme lines are returned as
            read.

    Raises:
        AssertionError: if the file yields fewer than three lines.
    """
    # The transcripts are Mandarin text and the manifest is written as UTF-8,
    # so decode explicitly instead of relying on the platform default
    # encoding (which is not UTF-8 on every system).
    with open(filepath, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')

    # Splitting on '\n' leaves a trailing empty element when the file ends
    # with a newline; keep only the three transcript lines.
    lines = lines[:3]
    assert len(lines) == 3, lines

    word_line, *other_lines = lines
    # character text, remove whitespace
    return [''.join(word_line.split())] + other_lines
def resolve_symlink(filepath):
    """Resolve a "text symlink": a file whose content is a relative path.

    The file at ``filepath`` is read as text, surrounding whitespace is
    stripped, and the result is interpreted relative to the directory that
    contains ``filepath``.

    Args:
        filepath (str): path of the file holding the link text.

    Returns:
        pathlib.Path: the resolved target path.
    """
    link_file = Path(filepath)
    target_text = link_file.read_text().strip()
    return (link_file.parent / target_text).resolve()
def create_manifest(data_dir, manifest_path_prefix):
    """Create one JSON-lines manifest per data split under ``data_dir``.

    For each of the ``train``, ``dev`` and ``test`` sub-directories, every
    ``.wav`` file found is paired with the transcript that its ``.wav.trn``
    link file points to, and one JSON object per utterance is written to
    ``<manifest_path_prefix>.<split>``.

    Args:
        data_dir (str): directory containing the split sub-directories.
        manifest_path_prefix (str): output path prefix; the split name is
            appended after a dot.

    Raises:
        AssertionError: if an audio file or its resolved transcript file
            does not exist.
    """
    print("Creating manifest %s ..." % manifest_path_prefix)
    for dtype in ('train', 'dev', 'test'):
        # Fresh list per split so no entry can leak between manifests.
        json_lines = []
        audio_dir = os.path.join(data_dir, dtype)
        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
            # os.walk yields file names in arbitrary order; sort them so the
            # manifest content is reproducible across runs and machines.
            for fname in sorted(filelist):
                if not fname.endswith('.wav'):
                    continue
                audio_path = os.path.abspath(os.path.join(subfolder, fname))
                text_path = resolve_symlink(audio_path + '.trn')

                assert os.path.exists(audio_path) and os.path.exists(text_path)

                # strip the '.wav' suffix to get the utterance id
                audio_id = os.path.basename(audio_path)[:-4]
                word_text, syllable_text, phone_text = read_trn(text_path)
                audio_data, samplerate = soundfile.read(audio_path)
                duration = float(len(audio_data) / samplerate)

                json_lines.append(
                    json.dumps(
                        {
                            'utt': audio_id,
                            'feat': audio_path,
                            'feat_shape': (duration, ),  # second
                            'text': word_text,
                            'syllable': syllable_text,
                            'phone': phone_text,
                        },
                        ensure_ascii=False))

        # A manifest is written for every split, even if no audio was found.
        manifest_path = manifest_path_prefix + '.' + dtype
        with codecs.open(manifest_path, 'w', 'utf-8') as fout:
            for line in json_lines:
                fout.write(line + '\n')
def prepare_dataset(url, md5sum, target_dir, manifest_path, subset):
    """Download and unpack one archive; build manifests for the main corpus.

    Args:
        url (str): archive URL, passed to ``download``.
        md5sum (str): checksum of the archive, passed to ``download``.
        target_dir (str): directory the archive is downloaded and unpacked to.
        manifest_path (str): manifest path prefix for ``create_manifest``.
        subset (str): name of the directory expected under ``target_dir``
            after unpacking; manifests are created only for
            ``'data_thchs30'``.
    """
    datadir = os.path.join(target_dir, subset)
    if os.path.exists(datadir):
        print("Skip downloading and unpacking. Data already exists in %s." %
              target_dir)
    else:
        archive_path = download(url, md5sum, target_dir)
        unpack(archive_path, target_dir)

    # Only the main corpus archive gets manifests.
    if subset != 'data_thchs30':
        return
    create_manifest(datadir, manifest_path)
def main():
    """Prepare all three THCHS-30 archives in a process pool."""
    # Expand a leading '~' and store the result back on the shared `args`.
    if args.target_dir.startswith('~'):
        args.target_dir = os.path.expanduser(args.target_dir)

    # (url, md5, sub-directory name) for each archive.
    sources = [
        (DATA_URL, MD5_DATA, "data_thchs30"),
        (TEST_NOISE_URL, MD5_TEST_NOISE, "test-noise"),
        (RESOURCE_URL, MD5_RESOURCE, "resource"),
    ]
    tasks = [(url, md5, args.target_dir, args.manifest_prefix, subset)
             for url, md5, subset in sources]

    with Pool(7) as pool:
        pool.starmap(prepare_dataset, tasks)

    print("Data download and manifest prepare done!")


if __name__ == '__main__':
    main()
examples/librispeech/s0/README.md
浏览文件 @
7ec623f7
...
...
@@ -2,8 +2,8 @@
## Deepspeech2
| Model | release | Config | Test set | Loss | WER |
| --- | --- | --- | --- | --- | --- |
| DeepSpeech2 | 2.1.0 | conf/deepspeech2.yaml | 15.184467315673828 | test-clean | 0.072154 |
| DeepSpeech2 | 2.0.0 | conf/deepspeech2.yaml | - | test-clean | 0.073973 |
| DeepSpeech2 | 1.8.5 | - | test-clean | - | 0.074939 |
| Model |
Params |
release | Config | Test set | Loss | WER |
| --- | --- |
--- |
--- | --- | --- | --- |
| DeepSpeech2 |
42.96M |
2.1.0 | conf/deepspeech2.yaml | 15.184467315673828 | test-clean | 0.072154 |
| DeepSpeech2 |
42.96M |
2.0.0 | conf/deepspeech2.yaml | - | test-clean | 0.073973 |
| DeepSpeech2 |
42.96M |
1.8.5 | - | test-clean | - | 0.074939 |
examples/librispeech/s0/conf/deepspeech2.yaml
浏览文件 @
7ec623f7
...
...
@@ -3,16 +3,21 @@ data:
train_manifest
:
data/manifest.train
dev_manifest
:
data/manifest.dev-clean
test_manifest
:
data/manifest.test-clean
mean_std_filepath
:
data/mean_std.json
vocab_filepath
:
data/vocab.txt
augmentation_config
:
conf/augmentation.json
batch_size
:
20
min_input_len
:
0.0
max_input_len
:
27.0
# second
min_output_len
:
0.0
max_output_len
:
.inf
min_output_input_ratio
:
0.00
max_output_input_ratio
:
.inf
collator
:
batch_size
:
20
mean_std_filepath
:
data/mean_std.json
unit_type
:
char
vocab_filepath
:
data/vocab.txt
augmentation_config
:
conf/augmentation.json
random_seed
:
0
spm_model_prefix
:
specgram_type
:
linear
target_sample_rate
:
16000
max_freq
:
None
...
...
examples/librispeech/s1/README.md
浏览文件 @
7ec623f7
...
...
@@ -2,17 +2,17 @@
## Conformer
| Model | Config | Augmentation| Test set | Decode method | Loss | WER |
| --- | --- | --- | --- | --- | --- | --- |
| conformer |
conf/conformer.yaml | spec_aug + shift | test-all | attention | test-all
6.35 | 0.057117 |
| conformer |
conf/conformer.yaml | spec_aug + shift | test-clean | attention | test-all
6.35 | 0.030162 |
| conformer |
conf/conformer.yaml | spec_aug + shift | test-clean | ctc_greedy_search | test-all
6.35 | 0.037910 |
| conformer |
conf/conformer.yaml | spec_aug + shift | test-clean | ctc_prefix_beam_search | test-all
6.35 | 0.037761 |
| conformer |
conf/conformer.yaml | spec_aug + shift | test-clean | attention_rescoring | test-all
6.35 | 0.032115 |
| Model |
Params |
Config | Augmentation| Test set | Decode method | Loss | WER |
| --- | --- | --- | --- | --- | --- | --- |
--- |
| conformer |
47.63 M | conf/conformer.yaml | spec_aug + shift | test-all | attention |
6.35 | 0.057117 |
| conformer |
47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | attention |
6.35 | 0.030162 |
| conformer |
47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | ctc_greedy_search |
6.35 | 0.037910 |
| conformer |
47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | ctc_prefix_beam_search |
6.35 | 0.037761 |
| conformer |
47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | attention_rescoring |
6.35 | 0.032115 |
## Transformer
| Model | Config | Augmentation| Test set | Decode method | Loss | WER |
| --- | --- | --- | --- | --- | --- | --- |
| transformer |
conf/transformer.yaml | spec_aug + shift | test-all | attention | test-all
6.98 | 0.066500 |
| transformer |
conf/transformer.yaml | spec_aug + shift | test-clean | attention | test-all
6.98 | 0.036 |
| Model |
Params |
Config | Augmentation| Test set | Decode method | Loss | WER |
| --- | --- | --- | --- | --- | --- | --- |
--- |
| transformer |
32.52 M | conf/transformer.yaml | spec_aug + shift | test-all | attention |
6.98 | 0.066500 |
| transformer |
32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | attention |
6.98 | 0.036 |
examples/librispeech/s1/conf/chunk_confermer.yaml
浏览文件 @
7ec623f7
...
...
@@ -3,18 +3,20 @@ data:
train_manifest
:
data/manifest.train
dev_manifest
:
data/manifest.dev
test_manifest
:
data/manifest.test
vocab_filepath
:
data/vocab.txt
unit_type
:
'
spm'
spm_model_prefix
:
'
data/bpe_unigram_5000'
mean_std_filepath
:
"
"
augmentation_config
:
conf/augmentation.json
batch_size
:
4
min_input_len
:
0.5
max_input_len
:
20.0
min_output_len
:
0.0
max_output_len
:
400.0
min_output_input_ratio
:
0.05
max_output_input_ratio
:
10.0
collator
:
vocab_filepath
:
data/vocab.txt
unit_type
:
'
spm'
spm_model_prefix
:
'
data/bpe_unigram_5000'
mean_std_filepath
:
"
"
augmentation_config
:
conf/augmentation.json
batch_size
:
16
raw_wav
:
True
# use raw_wav or kaldi feature
specgram_type
:
fbank
#linear, mfcc, fbank
feat_dim
:
80
...
...
@@ -80,7 +82,7 @@ model:
training
:
n_epoch
:
120
accum_grad
:
1
accum_grad
:
8
global_grad_clip
:
5.0
optim
:
adam
optim_conf
:
...
...
examples/librispeech/s1/conf/chunk_transformer.yaml
浏览文件 @
7ec623f7
...
...
@@ -3,18 +3,20 @@ data:
train_manifest
:
data/manifest.train
dev_manifest
:
data/manifest.dev
test_manifest
:
data/manifest.test
vocab_filepath
:
data/vocab.txt
unit_type
:
'
spm'
spm_model_prefix
:
'
data/bpe_unigram_5000'
mean_std_filepath
:
"
"
augmentation_config
:
conf/augmentation.json
batch_size
:
64
min_input_len
:
0.5
# second
max_input_len
:
20.0
# second
min_output_len
:
0.0
# tokens
max_output_len
:
400.0
# tokens
min_output_input_ratio
:
0.05
max_output_input_ratio
:
10.0
collator
:
vocab_filepath
:
data/vocab.txt
unit_type
:
'
spm'
spm_model_prefix
:
'
data/bpe_unigram_5000'
mean_std_filepath
:
"
"
augmentation_config
:
conf/augmentation.json
batch_size
:
64
raw_wav
:
True
# use raw_wav or kaldi feature
specgram_type
:
fbank
#linear, mfcc, fbank
feat_dim
:
80
...
...
@@ -103,6 +105,6 @@ decoding:
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks
:
-1
# number of left chunks for decoding. Defaults to -1.
simulate_streaming
:
Fals
e
# simulate streaming inference. Defaults to False.
simulate_streaming
:
tru
e
# simulate streaming inference. Defaults to False.
examples/librispeech/s1/conf/conformer.yaml
浏览文件 @
7ec623f7
...
...
@@ -3,18 +3,20 @@ data:
train_manifest
:
data/manifest.train
dev_manifest
:
data/manifest.dev
test_manifest
:
data/manifest.test-clean
vocab_filepath
:
data/vocab.txt
unit_type
:
'
spm'
spm_model_prefix
:
'
data/bpe_unigram_5000'
mean_std_filepath
:
"
"
augmentation_config
:
conf/augmentation.json
batch_size
:
16
min_input_len
:
0.5
# seconds
max_input_len
:
20.0
# seconds
min_output_len
:
0.0
# tokens
max_output_len
:
400.0
# tokens
min_output_input_ratio
:
0.05
max_output_input_ratio
:
10.0
collator
:
vocab_filepath
:
data/vocab.txt
unit_type
:
'
spm'
spm_model_prefix
:
'
data/bpe_unigram_5000'
mean_std_filepath
:
"
"
augmentation_config
:
conf/augmentation.json
batch_size
:
16
raw_wav
:
True
# use raw_wav or kaldi feature
specgram_type
:
fbank
#linear, mfcc, fbank
feat_dim
:
80
...
...
examples/librispeech/s1/conf/transformer.yaml
浏览文件 @
7ec623f7
...
...
@@ -3,18 +3,20 @@ data:
train_manifest
:
data/manifest.train
dev_manifest
:
data/manifest.dev
test_manifest
:
data/manifest.test-clean
vocab_filepath
:
data/vocab.txt
unit_type
:
'
spm'
spm_model_prefix
:
'
data/bpe_unigram_5000'
mean_std_filepath
:
"
"
augmentation_config
:
conf/augmentation.json
batch_size
:
64
min_input_len
:
0.5
# second
max_input_len
:
20.0
# second
min_output_len
:
0.0
# tokens
max_output_len
:
400.0
# tokens
min_output_input_ratio
:
0.05
max_output_input_ratio
:
10.0
collator
:
vocab_filepath
:
data/vocab.txt
unit_type
:
'
spm'
spm_model_prefix
:
'
data/bpe_unigram_5000'
mean_std_filepath
:
"
"
augmentation_config
:
conf/augmentation.json
batch_size
:
64
raw_wav
:
True
# use raw_wav or kaldi feature
specgram_type
:
fbank
#linear, mfcc, fbank
feat_dim
:
80
...
...
speechnn/CMakeLists.txt
浏览文件 @
7ec623f7
cmake_minimum_required
(
VERSION 3.14 FATAL_ERROR
)
project
(
deepspeech VERSION 0.1
)
set
(
CMAKE_VERBOSE_MAKEFILE on
)
# set std-14
set
(
CMAKE_CXX_STANDARD 14
)
# include file
include
(
FetchContent
)
include
(
ExternalProject
)
# fc_patch dir
set
(
FETCHCONTENT_QUIET off
)
get_filename_component
(
fc_patch
"fc_patch"
REALPATH BASE_DIR
"
${
CMAKE_SOURCE_DIR
}
"
)
set
(
FETCHCONTENT_BASE_DIR
${
fc_patch
}
)
###############################################################################
# Option Configurations
###############################################################################
# option configurations
option
(
TEST_DEBUG
"option for debug"
OFF
)
###############################################################################
# Include third party
###############################################################################
# #example for include third party
# FetchContent_Declare()
# # FetchContent_MakeAvailable was not added until CMake 3.14
# FetchContent_MakeAvailable()
# include_directories()
# ABSEIL-CPP
include
(
FetchContent
)
FetchContent_Declare
(
absl
GIT_REPOSITORY
"https://github.com/abseil/abseil-cpp.git"
GIT_TAG
"20210324.1"
)
FetchContent_MakeAvailable
(
absl
)
# libsndfile
include
(
FetchContent
)
FetchContent_Declare
(
libsndfile
GIT_REPOSITORY
"https://github.com/libsndfile/libsndfile.git"
GIT_TAG
"1.0.31"
)
FetchContent_MakeAvailable
(
libsndfile
)
###############################################################################
# Add local library
###############################################################################
# system lib
find_package
()
# if dir have CmakeLists.txt
add_subdirectory
()
# if dir do not have CmakeLists.txt
add_library
(
lib_name STATIC file.cc
)
target_link_libraries
(
lib_name item0 item1
)
add_dependencies
(
lib_name depend-target
)
###############################################################################
# Library installation
###############################################################################
install
()
###############################################################################
# Build binary file
###############################################################################
add_executable
()
target_link_libraries
()
speechnn/core/decoder/CMakeLists.txt
浏览文件 @
7ec623f7
aux_source_directory
(
. DIR_LIB_SRCS
)
add_library
(
decoder STATIC
${
DIR_LIB_SRCS
}
)
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录