PaddlePaddle / DeepSpeech, commit 1ec93dbd

Authored on Jun 28, 2021 by Haoxin Ma

spec augment

Parents: ae566f66, 718bd307
Showing 18 changed files with 150 additions and 121 deletions (+150, -121)
deepspeech/exps/deepspeech2/bin/deploy/runtime.py (+16, -9)
deepspeech/exps/deepspeech2/bin/deploy/server.py (+20, -9)
deepspeech/exps/deepspeech2/bin/tune.py (+1, -1)
deepspeech/exps/deepspeech2/model.py (+1, -1)
deepspeech/exps/u2/model.py (+1, -1)
deepspeech/frontend/augmentor/spec_augment.py (+49, -46)
deepspeech/frontend/featurizer/audio_featurizer.py (+6, -4)
deepspeech/io/collator.py (+7, -6)
deepspeech/models/deepspeech2.py (+4, -4)
deepspeech/models/u2.py (+4, -4)
deepspeech/utils/socket_server.py (+2, -2)
examples/aishell/s0/README.md (+7, -6)
examples/aishell/s0/conf/deepspeech2.yaml (+1, -2)
examples/aishell/s0/run.sh (+3, -3)
examples/aishell/s1/README.md (+12, -12)
examples/librispeech/s0/README.md (+5, -5)
examples/librispeech/s0/conf/deepspeech2.yaml (+9, -4)
tools/Makefile (+2, -2)
deepspeech/exps/deepspeech2/bin/deploy/runtime.py (+16, -9)

@@ -18,8 +18,10 @@ import numpy as np
 import paddle
 from paddle.inference import Config
 from paddle.inference import create_predictor
+from paddle.io import DataLoader

 from deepspeech.exps.deepspeech2.config import get_cfg_defaults
+from deepspeech.io.collator import SpeechCollator
 from deepspeech.io.dataset import ManifestDataset
 from deepspeech.models.deepspeech2 import DeepSpeech2Model
 from deepspeech.training.cli import default_argument_parser

@@ -78,26 +80,31 @@ def inference(config, args):
 def start_server(config, args):
     """Start the ASR server"""
     config.defrost()
-    config.data.manfiest = config.data.test_manifest
-    config.data.augmentation_config = ""
-    config.data.keep_transcription_text = True
+    config.data.manifest = config.data.test_manifest
     dataset = ManifestDataset.from_config(config)
-    model = DeepSpeech2Model.from_pretrained(dataset, config,
+
+    config.collator.augmentation_config = ""
+    config.collator.keep_transcription_text = True
+    config.collator.batch_size = 1
+    config.collator.num_workers = 0
+    collate_fn = SpeechCollator.from_config(config)
+    test_loader = DataLoader(dataset, collate_fn=collate_fn, num_workers=0)
+
+    model = DeepSpeech2Model.from_pretrained(test_loader, config,
                                              args.checkpoint_path)
     model.eval()

     # prepare ASR inference handler
     def file_to_transcript(filename):
-        feature = dataset.process_utterance(filename, "")
-        audio = np.array([feature[0]]).astype('float32')  #[1, D, T]
-        audio_len = feature[0].shape[1]
+        feature = test_loader.collate_fn.process_utterance(filename, "")
+        audio = np.array([feature[0]]).astype('float32')  #[1, T, D]
+        audio_len = feature[0].shape[0]
         audio_len = np.array([audio_len]).astype('int64')  # [1]

         result_transcript = model.decode(
             paddle.to_tensor(audio),
             paddle.to_tensor(audio_len),
-            vocab_list=dataset.vocab_list,
+            vocab_list=test_loader.collate_fn.vocab_list,
             decoding_method=config.decoding.decoding_method,
             lang_model_path=config.decoding.lang_model_path,
             beam_alpha=config.decoding.alpha,

@@ -138,7 +145,7 @@ if __name__ == "__main__":
     add_arg('host_ip', str,
             'localhost',
             "Server's IP address.")
-    add_arg('host_port', int, 8086, "Server's IP port.")
+    add_arg('host_port', int, 8089, "Server's IP port.")
     add_arg('speech_save_dir', str,
             'demo_cache',
             "Directory to save demo audios.")
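As an illustrative aside (not part of the commit): the handler change above amounts to packing a single time-major feature for the decoder. A minimal sketch under that assumption, with made-up shapes; the random array only stands in for test_loader.collate_fn.process_utterance(filename, "")[0].

import numpy as np

def pack_for_decode(feature_td):
    """Batch one [T, D] feature the way the patched handler does."""
    audio = np.array([feature_td]).astype('float32')             # [1, T, D]
    audio_len = np.array([feature_td.shape[0]]).astype('int64')  # [1]
    return audio, audio_len

# stand-in feature: 200 frames, 161 frequency bins
feat = np.random.rand(200, 161).astype('float32')
audio, audio_len = pack_for_decode(feat)
assert audio.shape == (1, 200, 161) and audio_len[0] == 200

server.py below receives the same rewiring, plus temporary debug prints and a different port.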
deepspeech/exps/deepspeech2/bin/deploy/server.py (+20, -9)

@@ -16,8 +16,10 @@ import functools
 import numpy as np
 import paddle
+from paddle.io import DataLoader

 from deepspeech.exps.deepspeech2.config import get_cfg_defaults
+from deepspeech.io.collator import SpeechCollator
 from deepspeech.io.dataset import ManifestDataset
 from deepspeech.models.deepspeech2 import DeepSpeech2Model
 from deepspeech.training.cli import default_argument_parser

@@ -31,26 +33,35 @@ from deepspeech.utils.utility import print_arguments
 def start_server(config, args):
     """Start the ASR server"""
     config.defrost()
-    config.data.manfiest = config.data.test_manifest
-    config.data.augmentation_config = ""
-    config.data.keep_transcription_text = True
+    config.data.manifest = config.data.test_manifest
     dataset = ManifestDataset.from_config(config)
-    model = DeepSpeech2Model.from_pretrained(dataset, config,
+
+    config.collator.augmentation_config = ""
+    config.collator.keep_transcription_text = True
+    config.collator.batch_size = 1
+    config.collator.num_workers = 0
+    collate_fn = SpeechCollator.from_config(config)
+    test_loader = DataLoader(dataset, collate_fn=collate_fn, num_workers=0)
+
+    model = DeepSpeech2Model.from_pretrained(test_loader, config,
                                              args.checkpoint_path)
     model.eval()

     # prepare ASR inference handler
     def file_to_transcript(filename):
-        feature = dataset.process_utterance(filename, "")
-        audio = np.array([feature[0]]).astype('float32')  #[1, D, T]
-        audio_len = feature[0].shape[1]
+        feature = test_loader.collate_fn.process_utterance(filename, "")
+        audio = np.array([feature[0]]).astype('float32')  #[1, T, D]
+        # audio = audio.swapaxes(1,2)
+        print('---file_to_transcript feature----')
+        print(audio.shape)
+        audio_len = feature[0].shape[0]
+        print(audio_len)
         audio_len = np.array([audio_len]).astype('int64')  # [1]

         result_transcript = model.decode(
             paddle.to_tensor(audio),
             paddle.to_tensor(audio_len),
-            vocab_list=dataset.vocab_list,
+            vocab_list=test_loader.collate_fn.vocab_list,
             decoding_method=config.decoding.decoding_method,
             lang_model_path=config.decoding.lang_model_path,
             beam_alpha=config.decoding.alpha,

@@ -91,7 +102,7 @@ if __name__ == "__main__":
     add_arg('host_ip', str,
             'localhost',
             "Server's IP address.")
-    add_arg('host_port', int, 8086, "Server's IP port.")
+    add_arg('host_port', int, 8088, "Server's IP port.")
     add_arg('speech_save_dir', str,
             'demo_cache',
             "Directory to save demo audios.")
deepspeech/exps/deepspeech2/bin/tune.py (+1, -1)

@@ -47,7 +47,7 @@ def tune(config, args):
         drop_last=False,
         collate_fn=SpeechCollator(keep_transcription_text=True))

-    model = DeepSpeech2Model.from_pretrained(dev_dataset, config,
+    model = DeepSpeech2Model.from_pretrained(valid_loader, config,
                                              args.checkpoint_path)
     model.eval()
deepspeech/exps/deepspeech2/model.py (+1, -1)

@@ -318,7 +318,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
     def export(self):
         infer_model = DeepSpeech2InferModel.from_pretrained(
-            self.test_loader.dataset, self.config, self.args.checkpoint_path)
+            self.test_loader, self.config, self.args.checkpoint_path)
         infer_model.eval()
         feat_dim = self.test_loader.collate_fn.feature_size
         static_model = paddle.jit.to_static(
deepspeech/exps/u2/model.py (+1, -1)

@@ -506,7 +506,7 @@ class U2Tester(U2Trainer):
             List[paddle.static.InputSpec]: input spec.
         """
         from deepspeech.models.u2 import U2InferModel
-        infer_model = U2InferModel.from_pretrained(self.test_loader.dataset,
+        infer_model = U2InferModel.from_pretrained(self.test_loader,
                                                    self.config.model.clone(),
                                                    self.args.checkpoint_path)
         feat_dim = self.test_loader.collate_fn.feature_size
deepspeech/frontend/augmentor/spec_augment.py (+49, -46)

@@ -124,7 +124,7 @@ class SpecAugmentor(AugmentorBase):
     def time_warp(xs, W=40):
         raise NotImplementedError

-    def randomize_parameters(self, n_bins, n_frame):
+    def randomize_parameters(self, n_frame, n_bins):
        # n_bins = xs.shape[0]
        # n_frames = xs.shape[1]

@@ -156,66 +156,69 @@ class SpecAugmentor(AugmentorBase):
             self.t_0.append(int(self._rng.uniform(low=0, high=n_frames - t)))

     def apply(self, xs: np.ndarray):
-        n_bins = xs.shape[0]
-        n_frames = xs.shape[1]
+        '''
+        input xs [T, D]
+        '''
+        n_frames = xs.shape[0]
+        n_bins = xs.shape[1]

         for i in range(0, self.n_freq_masks):
             f = self.f[i]
             f_0 = self.f_0[i]
-            xs[f_0:f_0 + f, :] = 0
+            xs[:, f_0:f_0 + f] = 0
             assert f_0 <= f_0 + f

         for i in range(self.n_masks):
             t = self.t[i]
             t_0 = self.t_0[i]
-            xs[:, t_0:t_0 + t] = 0
+            xs[t_0:t_0 + t, :] = 0
             assert t_0 <= t_0 + t
         return xs

-    def mask_freq(self, xs, replace_with_zero=False):
-        n_bins = xs.shape[0]
-        for i in range(0, self.n_freq_masks):
-            f = int(self._rng.uniform(low=0, high=self.F))
-            f_0 = int(self._rng.uniform(low=0, high=n_bins - f))
-            xs[f_0:f_0 + f, :] = 0
-            assert f_0 <= f_0 + f
-            self._freq_mask = (f_0, f_0 + f)
-        return xs
+    # def mask_freq(self, xs, replace_with_zero=False):
+    #     n_bins = xs.shape[0]
+    #     for i in range(0, self.n_freq_masks):
+    #         f = int(self._rng.uniform(low=0, high=self.F))
+    #         f_0 = int(self._rng.uniform(low=0, high=n_bins - f))
+    #         xs[f_0:f_0 + f, :] = 0
+    #         assert f_0 <= f_0 + f
+    #         self._freq_mask = (f_0, f_0 + f)
+    #     return xs

-    def mask_time(self, xs, replace_with_zero=False):
-        n_frames = xs.shape[1]
-        if self.adaptive_number_ratio > 0:
-            n_masks = int(n_frames * self.adaptive_number_ratio)
-            n_masks = min(n_masks, self.max_n_time_masks)
-        else:
-            n_masks = self.n_time_masks
-        if self.adaptive_size_ratio > 0:
-            T = self.adaptive_size_ratio * n_frames
-        else:
-            T = self.T
-        for i in range(n_masks):
-            t = int(self._rng.uniform(low=0, high=T))
-            t = min(t, int(n_frames * self.p))
-            t_0 = int(self._rng.uniform(low=0, high=n_frames - t))
-            xs[:, t_0:t_0 + t] = 0
-            assert t_0 <= t_0 + t
-            self._time_mask = (t_0, t_0 + t)
-        return xs
+    # def mask_time(self, xs, replace_with_zero=False):
+    #     n_frames = xs.shape[1]
+    #     if self.adaptive_number_ratio > 0:
+    #         n_masks = int(n_frames * self.adaptive_number_ratio)
+    #         n_masks = min(n_masks, self.max_n_time_masks)
+    #     else:
+    #         n_masks = self.n_time_masks
+    #     if self.adaptive_size_ratio > 0:
+    #         T = self.adaptive_size_ratio * n_frames
+    #     else:
+    #         T = self.T
+    #     for i in range(n_masks):
+    #         t = int(self._rng.uniform(low=0, high=T))
+    #         t = min(t, int(n_frames * self.p))
+    #         t_0 = int(self._rng.uniform(low=0, high=n_frames - t))
+    #         xs[:, t_0:t_0 + t] = 0
+    #         assert t_0 <= t_0 + t
+    #         self._time_mask = (t_0, t_0 + t)
+    #     return xs

-    def transform_feature(self, xs: np.ndarray, single=True):
-        """
-        Args:
-            xs (FloatTensor): `[F, T]`
-        Returns:
-            xs (FloatTensor): `[F, T]`
-        """
-        if (single):
-            self.randomize_parameters(xs)
-        return self.apply(xs)
+    # def transform_feature(self, xs: np.ndarray, single=True):
+    #     """
+    #     Args:
+    #         xs (FloatTensor): `[F, T]`
+    #     Returns:
+    #         xs (FloatTensor): `[F, T]`
+    #     """
+    #     if(single):
+    #         self.randomize_parameters(xs)
+    #     return self.apply(xs)

     # def transform_feature(self, xs: np.ndarray):
     #     """
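As an illustrative aside (not part of the commit): the axis swap in apply() is the core of the change; masks that used to zero rows now zero columns, because features arrive time-major. Below is a self-contained NumPy sketch of SpecAugment-style masking on a [T, D] spectrogram; the mask counts and width caps (F, T) are assumed defaults, and the random draws are inlined rather than split into a separate randomize_parameters step.

import numpy as np

rng = np.random.default_rng(0)

def spec_augment_td(xs, n_freq_masks=2, F=27, n_time_masks=2, T=40):
    """Zero random frequency bands and time stretches of a [T, D] array."""
    n_frames, n_bins = xs.shape      # time-major: frames on axis 0
    for _ in range(n_freq_masks):
        f = int(rng.uniform(0, F))
        f_0 = int(rng.uniform(0, n_bins - f))
        xs[:, f_0:f_0 + f] = 0       # frequency mask: a band of columns
    for _ in range(n_time_masks):
        t = int(rng.uniform(0, T))
        t_0 = int(rng.uniform(0, n_frames - t))
        xs[t_0:t_0 + t, :] = 0       # time mask: a stretch of rows
    return xs

out = spec_augment_td(np.ones((200, 161), dtype='float32'))
assert out.shape == (200, 161)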
deepspeech/frontend/featurizer/audio_featurizer.py (+6, -4)

@@ -221,17 +221,19 @@ class AudioFeaturizer(object):
         """append delat, delta-delta feature.
         Args:
-            feat (np.ndarray): (D, T)
+            feat (np.ndarray): (T, D)
         Returns:
-            np.ndarray: feat with delta-delta, (3*D, T)
+            np.ndarray: feat with delta-delta, (T, 3*D)
         """
+        # transpose (T, D) --> (D, T)
         feat = np.transpose(feat)
         # Deltas
         d_feat = delta(feat, 2)
         # Deltas-Deltas
         dd_feat = delta(feat, 2)
         # transpose
+        # transpose (D, T) --> (T, D)
         feat = np.transpose(feat)
         d_feat = np.transpose(d_feat)
         dd_feat = np.transpose(dd_feat)

@@ -264,7 +266,7 @@ class AudioFeaturizer(object):
             ValueError: stride_ms > window_ms
         Returns:
-            np.ndarray: mfcc feature, (D, T).
+            np.ndarray: mfcc feature, (T, D).
         """
         if max_freq is None:
             max_freq = sample_rate / 2

@@ -322,7 +324,7 @@ class AudioFeaturizer(object):
             ValueError: stride_ms > window_ms
         Returns:
-            np.ndarray: mfcc feature, (D, T).
+            np.ndarray: mfcc feature, (T, D).
         """
         if max_freq is None:
             max_freq = sample_rate / 2
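As an illustrative aside (not part of the commit): the new transpose comments describe a [T, D] in, [T, 3*D] out contract. A sketch of that flow follows; np.gradient along the time axis is only a stand-in for the repo's delta() helper, and the delta-delta here is derived from the delta, whereas the code in the diff computes both from feat.

import numpy as np

def delta(feat_dt, n=2):
    # stand-in: frame-to-frame slope along time (axis 1 of a [D, T] array)
    return np.gradient(feat_dt, axis=1)

def add_deltas(feat_td):
    """[T, D] in, [T, 3*D] out: static + delta + delta-delta features."""
    feat = np.transpose(feat_td)       # (T, D) --> (D, T), time on axis 1
    d_feat = delta(feat, 2)
    dd_feat = delta(d_feat, 2)
    # (D, T) --> (T, D) again, then widen the feature axis
    return np.concatenate(
        [np.transpose(feat), np.transpose(d_feat), np.transpose(dd_feat)],
        axis=1)

assert add_deltas(np.random.rand(100, 13)).shape == (100, 39)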
deepspeech/io/collator.py (+7, -6)

@@ -229,7 +229,7 @@ class SpeechCollator():
     def randomize_feature_parameters(self, n_bins, n_frames):
         self._augmentation_pipeline.andomize_parameters_feature_transform(n_bins, n_frames)

-    def process_utterance(self, audio_file, transcript):
+    def process_feature_and_transform(self, audio_file, transcript):
         """Load, augment, featurize and normalize for speech data.

         :param audio_file: Filepath or file object of audio file.

@@ -254,6 +254,7 @@ class SpeechCollator():
         # # apply specgram augment
         # specgram = self._augmentation_pipeline.apply_feature_transform(specgram)
+
         return specgram, transcript_part

@@ -318,12 +319,12 @@ class SpeechCollator():
         for utt, audio, text in batch:
             if not self.config.randomize_each_batch:
                 self.randomize_audio_parameters()
-            audio, text = self.process_utterance(audio, text)
+            audio, text = self.process_feature_and_transform(audio, text)
             #utt
             utts.append(utt)
             # audio
-            audios.append(audio.T)  # [T, D]
-            audio_lens.append(audio.shape[1])
+            audios.append(audio)  # [T, D]
+            audio_lens.append(audio.shape[0])
             # text
             # for training, text is token ids
             # else text is string, convert to unicode ord

@@ -346,8 +347,8 @@ class SpeechCollator():
         text_lens = np.array(text_lens).astype(np.int64)

         #spec augment
-        n_bins = padded_audios[0]
-        self.randomize_feature_parameters(n_bins, min(audio_lens))
+        n_bins = padded_audios.shape[2]
+        self.randomize_feature_parameters(min(audio_lens), n_bins)
         for i in range(len(padded_audios)):
             if not self.config.randomize_each_batch:
                 self.randomize_feature_parameters(n_bins, audio_lens[i])
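As an illustrative aside (not part of the commit): once each utterance stays [T, D], a padded batch is [B, T_max, D], so the bin count lives on axis 2 and per-utterance lengths come from axis 0, which is what the two collator fixes above rely on. A small sketch with an assumed zero-padding scheme:

import numpy as np

def pad_batch(feats):
    """Zero-pad a list of [T, D] features into a dense [B, T_max, D] array."""
    t_max = max(f.shape[0] for f in feats)
    padded = np.zeros((len(feats), t_max, feats[0].shape[1]), dtype='float32')
    for i, f in enumerate(feats):
        padded[i, :f.shape[0], :] = f
    return padded

feats = [np.random.rand(t, 161).astype('float32') for t in (120, 200, 87)]
padded_audios = pad_batch(feats)
audio_lens = [f.shape[0] for f in feats]
n_bins = padded_audios.shape[2]        # as in the patched collator
assert (n_bins, min(audio_lens)) == (161, 87)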
deepspeech/models/deepspeech2.py (+4, -4)

@@ -198,11 +198,11 @@ class DeepSpeech2Model(nn.Layer):
                                 cutoff_top_n, num_processes)

     @classmethod
-    def from_pretrained(cls, dataset, config, checkpoint_path):
+    def from_pretrained(cls, dataloader, config, checkpoint_path):
         """Build a DeepSpeech2Model model from a pretrained model.
         Parameters
         ----------
-        dataset: paddle.io.Dataset
+        dataloader: paddle.io.DataLoader

         config: yacs.config.CfgNode
             model configs

@@ -215,8 +215,8 @@ class DeepSpeech2Model(nn.Layer):
         DeepSpeech2Model
             The model built from pretrained result.
         """
-        model = cls(feat_size=dataset.feature_size,
-                    dict_size=dataset.vocab_size,
+        model = cls(feat_size=dataloader.collate_fn.feature_size,
+                    dict_size=dataloader.collate_fn.vocab_size,
                     num_conv_layers=config.model.num_conv_layers,
                     num_rnn_layers=config.model.num_rnn_layers,
                     rnn_size=config.model.rnn_layer_size,
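As an illustrative aside (not part of the commit): after this change, from_pretrained reads the model's input and output sizes off the collator hanging from the loader instead of off a dataset. A tiny stand-in sketch (SimpleNamespace fakes the loader; the sizes are made up); u2.py below applies the same change to config.input_dim and config.output_dim.

from types import SimpleNamespace

dataloader = SimpleNamespace(
    collate_fn=SimpleNamespace(feature_size=161, vocab_size=4233))

# what the patched from_pretrained now reads (model construction elided)
feat_size = dataloader.collate_fn.feature_size  # was dataset.feature_size
dict_size = dataloader.collate_fn.vocab_size    # was dataset.vocab_size
assert (feat_size, dict_size) == (161, 4233)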
deepspeech/models/u2.py (+4, -4)

@@ -876,11 +876,11 @@ class U2Model(U2BaseModel):
         return model

     @classmethod
-    def from_pretrained(cls, dataset, config, checkpoint_path):
+    def from_pretrained(cls, dataloader, config, checkpoint_path):
         """Build a DeepSpeech2Model model from a pretrained model.

         Args:
-            dataset (paddle.io.Dataset): not used.
+            dataloader (paddle.io.DataLoader): not used.
             config (yacs.config.CfgNode): model configs
             checkpoint_path (Path or str): the path of pretrained model checkpoint, without extension name

@@ -888,8 +888,8 @@ class U2Model(U2BaseModel):
             DeepSpeech2Model: The model built from pretrained result.
         """
         config.defrost()
-        config.input_dim = dataset.feature_size
-        config.output_dim = dataset.vocab_size
+        config.input_dim = dataloader.collate_fn.feature_size
+        config.output_dim = dataloader.collate_fn.vocab_size
         config.freeze()
         model = cls.from_config(config)
deepspeech/utils/socket_server.py (+2, -2)

@@ -48,9 +48,9 @@ def warm_up_test(audio_process_handler,
     rng = random.Random(random_seed)
     samples = rng.sample(manifest, num_test_cases)
     for idx, sample in enumerate(samples):
-        print("Warm-up Test Case %d: %s", idx, sample['audio_filepath'])
+        print("Warm-up Test Case %d: %s" % (idx, sample['feat']))
         start_time = time.time()
-        transcript = audio_process_handler(sample['audio_filepath'])
+        transcript = audio_process_handler(sample['feat'])
         finish_time = time.time()
         print("Response Time: %f, Transcript: %s" %
               (finish_time - start_time, transcript))
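As an illustrative aside (not part of the commit): besides switching the manifest key to 'feat', the print change fixes a real bug; a comma after the format string passes the values as extra print arguments instead of interpolating them. With made-up values:

idx, feat = 0, "data/feat/0.npz"
# old form: prints the raw template, then the values after it
print("Warm-up Test Case %d: %s", idx, feat)
# new form: interpolates as intended
print("Warm-up Test Case %d: %s" % (idx, feat))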
examples/aishell/s0/README.md (+7, -6)

@@ -2,9 +2,10 @@
 ## Deepspeech2

-| Model | release | Config | Test set | Loss | CER |
-| --- | --- | --- | --- | --- | --- |
-| DeepSpeech2 | 2.1.0 | conf/deepspeech2.yaml + spec aug | test | 7.483316898345947 | 0.077860 |
-| DeepSpeech2 | 2.1.0 | conf/deepspeech2.yaml | test | 7.299022197723389 | 0.078671 |
-| DeepSpeech2 | 2.0.0 | conf/deepspeech2.yaml | test | - | 0.078977 |
-| DeepSpeech2 | 1.8.5 | - | test | - | 0.080447 |
+| Model | Params | Release | Config | Test set | Loss | CER |
+| --- | --- | --- | --- | --- | --- | --- |
+| DeepSpeech2 | 58.4M | 2.2.0 | conf/deepspeech2.yaml + spec aug + new datapipe | test | 6.396368026733398 | 0.068382 |
+| DeepSpeech2 | 58.4M | 2.1.0 | conf/deepspeech2.yaml + spec aug | test | 7.483316898345947 | 0.077860 |
+| DeepSpeech2 | 58.4M | 2.1.0 | conf/deepspeech2.yaml | test | 7.299022197723389 | 0.078671 |
+| DeepSpeech2 | 58.4M | 2.0.0 | conf/deepspeech2.yaml | test | - | 0.078977 |
+| DeepSpeech2 | 58.4M | 1.8.5 | - | test | - | 0.080447 |
examples/aishell/s0/conf/deepspeech2.yaml (+1, -2)

@@ -10,8 +10,8 @@ data:
   min_output_input_ratio: 0.00
   max_output_input_ratio: .inf

 collator:
+  batch_size: 64  # one gpu
   mean_std_filepath: data/mean_std.json
   unit_type: char
   vocab_filepath: data/vocab.txt

@@ -33,7 +33,6 @@ collator:
   sortagrad: True
   shuffle_method: batch_shuffle
   num_workers: 0
-  batch_size: 64  # one gpu

 model:
   num_conv_layers: 2
examples/aishell/s0/run.sh (+3, -3)

@@ -2,7 +2,7 @@
 set -e
 source path.sh

-gpus=0
+gpus=0,1,2,3
 stage=0
 stop_stage=100
 conf_path=conf/deepspeech2.yaml

@@ -31,10 +31,10 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     # test ckpt avg_n
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi

 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
     # export ckpt avg_n
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
+    CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
 fi
examples/aishell/s1/README.md (+12, -12)

@@ -2,21 +2,21 @@
 ## Conformer

-| Model | Config | Augmentation| Test set | Decode method | Loss | WER |
-| --- | --- | --- | --- | --- | --- | --- |
-| conformer | conf/conformer.yaml | spec_aug + shift | test | attention | - | 0.059858 |
-| conformer | conf/conformer.yaml | spec_aug + shift | test | ctc_greedy_search | - | 0.062311 |
-| conformer | conf/conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | - | 0.062196 |
-| conformer | conf/conformer.yaml | spec_aug + shift | test | attention_rescoring | - | 0.054694 |
+| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
+| --- | --- | --- | --- | --- | --- | --- | --- |
+| conformer | 47.06M | conf/conformer.yaml | spec_aug + shift | test | attention | - | 0.059858 |
+| conformer | 47.06M | conf/conformer.yaml | spec_aug + shift | test | ctc_greedy_search | - | 0.062311 |
+| conformer | 47.06M | conf/conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | - | 0.062196 |
+| conformer | 47.06M | conf/conformer.yaml | spec_aug + shift | test | attention_rescoring | - | 0.054694 |

 ## Chunk Conformer

-| Model | Config | Augmentation| Test set | Decode method | Chunk | Loss | WER |
-| --- | --- | --- | --- | --- | --- | --- | --- |
-| conformer | conf/chunk_conformer.yaml | spec_aug + shift | test | attention | 16 | - | 0.061939 |
-| conformer | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_greedy_search | 16 | - | 0.070806 |
-| conformer | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | 16 | - | 0.070739 |
-| conformer | conf/chunk_conformer.yaml | spec_aug + shift | test | attention_rescoring | 16 | - | 0.059400 |
+| Model | Params | Config | Augmentation| Test set | Decode method | Chunk | Loss | WER |
+| --- | --- | --- | --- | --- | --- | --- | --- | --- |
+| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | attention | 16 | - | 0.061939 |
+| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_greedy_search | 16 | - | 0.070806 |
+| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | 16 | - | 0.070739 |
+| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | attention_rescoring | 16 | - | 0.059400 |

 ## Transformer
examples/librispeech/s0/README.md (+5, -5)

@@ -2,8 +2,8 @@
 ## Deepspeech2

-| Model | release | Config | Test set | Loss | WER |
-| --- | --- | --- | --- | --- | --- |
-| DeepSpeech2 | 2.1.0 | conf/deepspeech2.yaml | 15.184467315673828 | test-clean | 0.072154 |
-| DeepSpeech2 | 2.0.0 | conf/deepspeech2.yaml | - | test-clean | 0.073973 |
-| DeepSpeech2 | 1.8.5 | - | test-clean | - | 0.074939 |
+| Model | Params | Release | Config | Test set | Loss | WER |
+| --- | --- | --- | --- | --- | --- | --- |
+| DeepSpeech2 | 42.96M | 2.1.0 | conf/deepspeech2.yaml | 15.184467315673828 | test-clean | 0.072154 |
+| DeepSpeech2 | 42.96M | 2.0.0 | conf/deepspeech2.yaml | - | test-clean | 0.073973 |
+| DeepSpeech2 | 42.96M | 1.8.5 | - | test-clean | - | 0.074939 |
examples/librispeech/s0/conf/deepspeech2.yaml (+9, -4)

@@ -3,16 +3,21 @@ data:
   train_manifest: data/manifest.train
   dev_manifest: data/manifest.dev-clean
   test_manifest: data/manifest.test-clean
-  mean_std_filepath: data/mean_std.json
-  vocab_filepath: data/vocab.txt
-  augmentation_config: conf/augmentation.json
-  batch_size: 20
   min_input_len: 0.0
   max_input_len: 27.0 # second
   min_output_len: 0.0
   max_output_len: .inf
   min_output_input_ratio: 0.00
   max_output_input_ratio: .inf
+
+collator:
+  batch_size: 20
+  mean_std_filepath: data/mean_std.json
+  unit_type: char
+  vocab_filepath: data/vocab.txt
+  augmentation_config: conf/augmentation.json
+  random_seed: 0
+  spm_model_prefix:
   specgram_type: linear
   target_sample_rate: 16000
   max_freq: None
tools/Makefile (+2, -2)

@@ -31,5 +31,5 @@ sox.done:
 soxbindings.done:
 	test -d soxbindings || git clone https://github.com/pseeth/soxbindings.git
-	source venv/bin/activate; cd soxbindings && python3 setup.py install
+	source venv/bin/activate; cd soxbindings && python setup.py install
 	touch soxbindings.done
\ No newline at end of file