Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
b878027c
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 2 年 前同步成功
通知
210
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
b878027c
编写于
10月 26, 2021
作者:
H
Hui Zhang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
format code
上级
e8bc9a2a
变更
19
显示空白变更内容
内联
并排
Showing
19 changed file
with
404 addition
and
302 deletion
+404
-302
deepspeech/decoders/recog.py
deepspeech/decoders/recog.py
+11
-12
deepspeech/decoders/recog_bin.py
deepspeech/decoders/recog_bin.py
+1
-0
deepspeech/models/lm/transformer.py
deepspeech/models/lm/transformer.py
+13
-14
deepspeech/transform/add_deltas.py
deepspeech/transform/add_deltas.py
+15
-3
deepspeech/transform/channel_selector.py
deepspeech/transform/channel_selector.py
+23
-12
deepspeech/transform/functional.py
deepspeech/transform/functional.py
+15
-1
deepspeech/transform/perturb.py
deepspeech/transform/perturb.py
+60
-53
deepspeech/transform/spec_augment.py
deepspeech/transform/spec_augment.py
+30
-22
deepspeech/transform/spectrogram.py
deepspeech/transform/spectrogram.py
+119
-121
deepspeech/transform/transform_interface.py
deepspeech/transform/transform_interface.py
+13
-0
deepspeech/transform/transformation.py
deepspeech/transform/transformation.py
+29
-22
deepspeech/transform/wpe.py
deepspeech/transform/wpe.py
+29
-17
deepspeech/utils/check_kwargs.py
deepspeech/utils/check_kwargs.py
+15
-1
deepspeech/utils/spec_augment.py
deepspeech/utils/spec_augment.py
+13
-0
examples/librispeech/s2/README.md
examples/librispeech/s2/README.md
+1
-1
parakeet/data/batch.py
parakeet/data/batch.py
+2
-2
parakeet/exps/tacotron2/ljspeech.py
parakeet/exps/tacotron2/ljspeech.py
+3
-6
requirements.txt
requirements.txt
+1
-2
utils/feat-to-shape.py
utils/feat-to-shape.py
+11
-13
未找到文件。
deepspeech/decoders/recog.py
浏览文件 @
b878027c
...
...
@@ -24,11 +24,7 @@ from .utils import add_results_to_json
from
deepspeech.exps
import
dynamic_import_tester
from
deepspeech.io.reader
import
LoadInputsAndTargets
from
deepspeech.models.asr_interface
import
ASRInterface
from
deepspeech.models.lm.transformer
import
TransformerLM
from
deepspeech.utils.log
import
Log
# from espnet.asr.asr_utils import get_model_conf
# from espnet.asr.asr_utils import torch_load
# from espnet.nets.lm_interface import dynamic_import_lm
logger
=
Log
(
__name__
).
getlog
()
...
...
@@ -49,12 +45,14 @@ def load_trained_model(args):
model
=
exp
.
model
return
model
,
char_list
,
exp
,
confs
def
get_config
(
config_path
):
stream
=
open
(
config_path
,
mode
=
'r'
,
encoding
=
"utf-8"
)
config
=
yaml
.
load
(
stream
,
Loader
=
yaml
.
FullLoader
)
stream
.
close
()
return
config
def
load_trained_lm
(
args
):
lm_args
=
get_config
(
args
.
rnnlm_conf
)
# NOTE: for a compatibility with less than 0.5.0 version models
...
...
@@ -65,6 +63,7 @@ def load_trained_lm(args):
lm
.
set_state_dict
(
model_dict
)
return
lm
def
recog_v2
(
args
):
"""Decode with custom models that implements ScorerInterface.
...
...
deepspeech/decoders/recog_bin.py
浏览文件 @
b878027c
...
...
@@ -21,6 +21,7 @@ from distutils.util import strtobool
import
configargparse
import
numpy
as
np
def
get_parser
():
"""Get default arguments."""
parser
=
configargparse
.
ArgumentParser
(
...
...
deepspeech/models/lm/transformer.py
浏览文件 @
b878027c
...
...
@@ -30,8 +30,7 @@ logger = Log(__name__).getlog()
class
TransformerLM
(
nn
.
Layer
,
LMInterface
,
BatchScorerInterface
):
def
__init__
(
self
,
def
__init__
(
self
,
n_vocab
:
int
,
pos_enc
:
str
=
None
,
embed_unit
:
int
=
128
,
...
...
deepspeech/transform/add_deltas.py
浏览文件 @
b878027c
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
numpy
as
np
...
...
@@ -9,7 +22,7 @@ def delta(feat, window):
delta_feat
[
i
:]
+=
-
i
*
feat
[:
-
i
]
delta_feat
[
-
i
:]
+=
i
*
feat
[
-
1
]
delta_feat
[:
i
]
+=
-
i
*
feat
[
0
]
delta_feat
/=
2
*
sum
(
i
**
2
for
i
in
range
(
1
,
window
+
1
))
delta_feat
/=
2
*
sum
(
i
**
2
for
i
in
range
(
1
,
window
+
1
))
return
delta_feat
...
...
@@ -34,8 +47,7 @@ class AddDeltas():
def
__repr__
(
self
):
return
"{name}(window={window}, order={order}"
.
format
(
name
=
self
.
__class__
.
__name__
,
window
=
self
.
window
,
order
=
self
.
order
)
name
=
self
.
__class__
.
__name__
,
window
=
self
.
window
,
order
=
self
.
order
)
def
__call__
(
self
,
x
):
return
add_deltas
(
x
,
window
=
self
.
window
,
order
=
self
.
order
)
deepspeech/transform/channel_selector.py
浏览文件 @
b878027c
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
numpy
...
...
@@ -10,15 +23,12 @@ class ChannelSelector():
self
.
axis
=
axis
def
__repr__
(
self
):
return
(
"{name}(train_channel={train_channel}, "
return
(
"{name}(train_channel={train_channel}, "
"eval_channel={eval_channel}, axis={axis})"
.
format
(
name
=
self
.
__class__
.
__name__
,
train_channel
=
self
.
train_channel
,
eval_channel
=
self
.
eval_channel
,
axis
=
self
.
axis
,
)
)
axis
=
self
.
axis
,
))
def
__call__
(
self
,
x
,
train
=
True
):
# Assuming x: [Time, Channel] by default
...
...
@@ -27,8 +37,8 @@ class ChannelSelector():
# If the dimension is insufficient, then unsqueeze
# (e.g [Time] -> [Time, 1])
ind
=
tuple
(
slice
(
None
)
if
i
<
x
.
ndim
else
None
for
i
in
range
(
self
.
axis
+
1
)
)
slice
(
None
)
if
i
<
x
.
ndim
else
None
for
i
in
range
(
self
.
axis
+
1
)
)
x
=
x
[
ind
]
if
train
:
...
...
@@ -41,5 +51,6 @@ class ChannelSelector():
else
:
ch
=
channel
ind
=
tuple
(
slice
(
None
)
if
i
!=
self
.
axis
else
ch
for
i
in
range
(
x
.
ndim
))
ind
=
tuple
(
slice
(
None
)
if
i
!=
self
.
axis
else
ch
for
i
in
range
(
x
.
ndim
))
return
x
[
ind
]
deepspeech/transform/functional.py
浏览文件 @
b878027c
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
inspect
from
deepspeech.transform.transform_interface
import
TransformInterface
...
...
@@ -57,7 +70,8 @@ class FuncTrans(TransformInterface):
except
ValueError
:
d
=
dict
()
return
{
k
:
v
.
default
for
k
,
v
in
d
.
items
()
if
v
.
default
!=
inspect
.
Parameter
.
empty
k
:
v
.
default
for
k
,
v
in
d
.
items
()
if
v
.
default
!=
inspect
.
Parameter
.
empty
}
def
__repr__
(
self
):
...
...
deepspeech/transform/perturb.py
浏览文件 @
b878027c
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
librosa
import
numpy
import
scipy
...
...
@@ -5,6 +18,7 @@ import soundfile
from
deepspeech.io.reader
import
SoundHDF5File
class
SpeedPerturbation
():
"""SpeedPerturbation
...
...
@@ -28,8 +42,7 @@ class SpeedPerturbation():
utt2ratio
=
None
,
keep_length
=
True
,
res_type
=
"kaiser_best"
,
seed
=
None
,
):
seed
=
None
,
):
self
.
res_type
=
res_type
self
.
keep_length
=
keep_length
self
.
state
=
numpy
.
random
.
RandomState
(
seed
)
...
...
@@ -60,12 +73,10 @@ class SpeedPerturbation():
self
.
lower
,
self
.
upper
,
self
.
keep_length
,
self
.
res_type
,
)
self
.
res_type
,
)
else
:
return
"{}({}, res_type={})"
.
format
(
self
.
__class__
.
__name__
,
self
.
utt2ratio_file
,
self
.
res_type
)
self
.
__class__
.
__name__
,
self
.
utt2ratio_file
,
self
.
res_type
)
def
__call__
(
self
,
x
,
uttid
=
None
,
train
=
True
):
if
not
train
:
...
...
@@ -85,15 +96,14 @@ class SpeedPerturbation():
diff
=
abs
(
len
(
x
)
-
len
(
y
))
if
len
(
y
)
>
len
(
x
):
# Truncate noise
y
=
y
[
diff
//
2
:
-
((
diff
+
1
)
//
2
)]
y
=
y
[
diff
//
2
:
-
((
diff
+
1
)
//
2
)]
elif
len
(
y
)
<
len
(
x
):
# Assume the time-axis is the first: (Time, Channel)
pad_width
=
[(
diff
//
2
,
(
diff
+
1
)
//
2
)]
+
[
(
0
,
0
)
for
_
in
range
(
y
.
ndim
-
1
)
]
y
=
numpy
.
pad
(
y
,
pad_width
=
pad_width
,
constant_values
=
0
,
mode
=
"constant"
)
y
,
pad_width
=
pad_width
,
constant_values
=
0
,
mode
=
"constant"
)
return
y
...
...
@@ -111,7 +121,7 @@ class BandpassPerturbation():
"""
def
__init__
(
self
,
lower
=
0.0
,
upper
=
0.75
,
seed
=
None
,
axes
=
(
-
1
,)):
def
__init__
(
self
,
lower
=
0.0
,
upper
=
0.75
,
seed
=
None
,
axes
=
(
-
1
,
)):
self
.
lower
=
lower
self
.
upper
=
upper
self
.
state
=
numpy
.
random
.
RandomState
(
seed
)
...
...
@@ -119,18 +129,16 @@ class BandpassPerturbation():
self
.
axes
=
axes
def
__repr__
(
self
):
return
"{}(lower={}, upper={})"
.
format
(
self
.
__class__
.
__name__
,
self
.
lower
,
self
.
upper
)
return
"{}(lower={}, upper={})"
.
format
(
self
.
__class__
.
__name__
,
self
.
lower
,
self
.
upper
)
def
__call__
(
self
,
x_stft
,
uttid
=
None
,
train
=
True
):
if
not
train
:
return
x_stft
if
x_stft
.
ndim
==
1
:
raise
RuntimeError
(
"Input in time-freq domain: "
"(Time, Channel, Freq) or (Time, Freq)"
)
raise
RuntimeError
(
"Input in time-freq domain: "
"(Time, Channel, Freq) or (Time, Freq)"
)
ratio
=
self
.
state
.
uniform
(
self
.
lower
,
self
.
upper
)
axes
=
[
i
if
i
>=
0
else
x_stft
.
ndim
-
i
for
i
in
self
.
axes
]
...
...
@@ -142,7 +150,12 @@ class BandpassPerturbation():
class
VolumePerturbation
():
def
__init__
(
self
,
lower
=-
1.6
,
upper
=
1.6
,
utt2ratio
=
None
,
dbunit
=
True
,
seed
=
None
):
def
__init__
(
self
,
lower
=-
1.6
,
upper
=
1.6
,
utt2ratio
=
None
,
dbunit
=
True
,
seed
=
None
):
self
.
dbunit
=
dbunit
self
.
utt2ratio_file
=
utt2ratio
self
.
lower
=
lower
...
...
@@ -168,12 +181,10 @@ class VolumePerturbation():
def
__repr__
(
self
):
if
self
.
utt2ratio
is
None
:
return
"{}(lower={}, upper={}, dbunit={})"
.
format
(
self
.
__class__
.
__name__
,
self
.
lower
,
self
.
upper
,
self
.
dbunit
)
self
.
__class__
.
__name__
,
self
.
lower
,
self
.
upper
,
self
.
dbunit
)
else
:
return
'{}("{}", dbunit={})'
.
format
(
self
.
__class__
.
__name__
,
self
.
utt2ratio_file
,
self
.
dbunit
)
self
.
__class__
.
__name__
,
self
.
utt2ratio_file
,
self
.
dbunit
)
def
__call__
(
self
,
x
,
uttid
=
None
,
train
=
True
):
if
not
train
:
...
...
@@ -186,7 +197,7 @@ class VolumePerturbation():
else
:
ratio
=
self
.
state
.
uniform
(
self
.
lower
,
self
.
upper
)
if
self
.
dbunit
:
ratio
=
10
**
(
ratio
/
20
)
ratio
=
10
**
(
ratio
/
20
)
return
x
*
ratio
...
...
@@ -201,8 +212,7 @@ class NoiseInjection():
utt2ratio
=
None
,
filetype
=
"list"
,
dbunit
=
True
,
seed
=
None
,
):
seed
=
None
,
):
self
.
utt2noise_file
=
utt2noise
self
.
utt2ratio_file
=
utt2ratio
self
.
filetype
=
filetype
...
...
@@ -242,19 +252,16 @@ class NoiseInjection():
if
utt2noise
is
not
None
and
utt2ratio
is
not
None
:
if
set
(
self
.
utt2ratio
)
!=
set
(
self
.
utt2noise
):
raise
RuntimeError
(
"The uttids mismatch between {} and {}"
.
format
(
utt2ratio
,
utt2noise
)
)
raise
RuntimeError
(
"The uttids mismatch between {} and {}"
.
format
(
utt2ratio
,
utt2noise
))
def
__repr__
(
self
):
if
self
.
utt2ratio
is
None
:
return
"{}(lower={}, upper={}, dbunit={})"
.
format
(
self
.
__class__
.
__name__
,
self
.
lower
,
self
.
upper
,
self
.
dbunit
)
self
.
__class__
.
__name__
,
self
.
lower
,
self
.
upper
,
self
.
dbunit
)
else
:
return
'{}("{}", dbunit={})'
.
format
(
self
.
__class__
.
__name__
,
self
.
utt2ratio_file
,
self
.
dbunit
)
self
.
__class__
.
__name__
,
self
.
utt2ratio_file
,
self
.
dbunit
)
def
__call__
(
self
,
x
,
uttid
=
None
,
train
=
True
):
if
not
train
:
...
...
@@ -268,8 +275,8 @@ class NoiseInjection():
ratio
=
self
.
state
.
uniform
(
self
.
lower
,
self
.
upper
)
if
self
.
dbunit
:
ratio
=
10
**
(
ratio
/
20
)
scale
=
ratio
*
numpy
.
sqrt
((
x
**
2
).
mean
())
ratio
=
10
**
(
ratio
/
20
)
scale
=
ratio
*
numpy
.
sqrt
((
x
**
2
).
mean
())
# 2. Get noise
if
self
.
utt2noise
is
not
None
:
...
...
@@ -280,16 +287,17 @@ class NoiseInjection():
# Randomly select the noise source
noise
=
self
.
state
.
choice
(
list
(
self
.
utt2noise
.
values
()))
# Normalize the level
noise
/=
numpy
.
sqrt
((
noise
**
2
).
mean
())
noise
/=
numpy
.
sqrt
((
noise
**
2
).
mean
())
# Adjust the noise length
diff
=
abs
(
len
(
x
)
-
len
(
noise
))
offset
=
self
.
state
.
randint
(
0
,
diff
)
if
len
(
noise
)
>
len
(
x
):
# Truncate noise
noise
=
noise
[
offset
:
-
(
diff
-
offset
)]
noise
=
noise
[
offset
:
-
(
diff
-
offset
)]
else
:
noise
=
numpy
.
pad
(
noise
,
pad_width
=
[
offset
,
diff
-
offset
],
mode
=
"wrap"
)
noise
=
numpy
.
pad
(
noise
,
pad_width
=
[
offset
,
diff
-
offset
],
mode
=
"wrap"
)
else
:
# Generate white noise
...
...
@@ -329,15 +337,14 @@ class RIRConvolve():
if
x
.
ndim
!=
1
:
# Must be single channel
raise
RuntimeError
(
"Input x must be one dimensional array, but got {}"
.
format
(
x
.
shape
)
)
"Input x must be one dimensional array, but got {}"
.
format
(
x
.
shape
)
)
rir
,
rate
=
self
.
utt2rir
[
uttid
]
if
rir
.
ndim
==
2
:
# FIXME(kamo): Use chainer.convolution_1d?
# return [Time, Channel]
return
numpy
.
stack
(
[
scipy
.
convolve
(
x
,
r
,
mode
=
"same"
)
for
r
in
rir
],
axis
=-
1
)
[
scipy
.
convolve
(
x
,
r
,
mode
=
"same"
)
for
r
in
rir
],
axis
=-
1
)
else
:
return
scipy
.
convolve
(
x
,
rir
,
mode
=
"same"
)
deepspeech/transform/spec_augment.py
浏览文件 @
b878027c
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Spec Augment module for preprocessing i.e., data augmentation"""
import
random
import
numpy
...
...
@@ -27,10 +39,12 @@ def time_warp(x, max_time_warp=80, inplace=False, mode="PIL"):
return
x
# NOTE: randrange(a, b) emits a, a + 1, ..., b - 1
center
=
random
.
randrange
(
window
,
t
-
window
)
warped
=
random
.
randrange
(
center
-
window
,
center
+
window
)
+
1
# 1 ... t - 1
warped
=
random
.
randrange
(
center
-
window
,
center
+
window
)
+
1
# 1 ... t - 1
left
=
Image
.
fromarray
(
x
[:
center
]).
resize
((
x
.
shape
[
1
],
warped
),
BICUBIC
)
right
=
Image
.
fromarray
(
x
[
center
:]).
resize
((
x
.
shape
[
1
],
t
-
warped
),
BICUBIC
)
right
=
Image
.
fromarray
(
x
[
center
:]).
resize
((
x
.
shape
[
1
],
t
-
warped
),
BICUBIC
)
if
inplace
:
x
[:
warped
]
=
left
x
[
warped
:]
=
right
...
...
@@ -44,11 +58,8 @@ def time_warp(x, max_time_warp=80, inplace=False, mode="PIL"):
# TODO(karita): make this differentiable again
return
spec_augment
.
time_warp
(
paddle
.
to_tensor
(
x
),
window
).
numpy
()
else
:
raise
NotImplementedError
(
"unknown resize mode: "
+
mode
+
", choose one from (PIL, sparse_image_warp)."
)
raise
NotImplementedError
(
"unknown resize mode: "
+
mode
+
", choose one from (PIL, sparse_image_warp)."
)
class
TimeWarp
(
FuncTrans
):
...
...
@@ -153,8 +164,7 @@ def spec_augment(
max_time_width
=
100
,
n_time_mask
=
2
,
inplace
=
True
,
replace_with_zero
=
True
,
):
replace_with_zero
=
True
,
):
"""spec agument
apply random time warping and time/freq masking
...
...
@@ -180,15 +190,13 @@ def spec_augment(
max_freq_width
,
n_freq_mask
,
inplace
=
inplace
,
replace_with_zero
=
replace_with_zero
,
)
replace_with_zero
=
replace_with_zero
,
)
x
=
time_mask
(
x
,
max_time_width
,
n_time_mask
,
inplace
=
inplace
,
replace_with_zero
=
replace_with_zero
,
)
replace_with_zero
=
replace_with_zero
,
)
return
x
...
...
deepspeech/transform/spectrogram.py
浏览文件 @
b878027c
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
librosa
import
numpy
as
np
def
stft
(
x
,
n_fft
,
n_shift
,
win_length
=
None
,
window
=
"hann"
,
center
=
True
,
pad_mode
=
"reflect"
):
def
stft
(
x
,
n_fft
,
n_shift
,
win_length
=
None
,
window
=
"hann"
,
center
=
True
,
pad_mode
=
"reflect"
):
# x: [Time, Channel]
if
x
.
ndim
==
1
:
single_channel
=
True
...
...
@@ -25,12 +42,9 @@ def stft(
win_length
=
win_length
,
window
=
window
,
center
=
center
,
pad_mode
=
pad_mode
,
).
T
for
ch
in
range
(
x
.
shape
[
1
])
pad_mode
=
pad_mode
,
).
T
for
ch
in
range
(
x
.
shape
[
1
])
],
axis
=
1
,
)
axis
=
1
,
)
if
single_channel
:
# x: [Time, Channel, Freq] -> [Time, Freq]
...
...
@@ -55,12 +69,9 @@ def istft(x, n_shift, win_length=None, window="hann", center=True):
hop_length
=
n_shift
,
win_length
=
win_length
,
window
=
window
,
center
=
center
,
)
for
ch
in
range
(
x
.
shape
[
1
])
center
=
center
,
)
for
ch
in
range
(
x
.
shape
[
1
])
],
axis
=
1
,
)
axis
=
1
,
)
if
single_channel
:
# x: [Time, Channel] -> [Time]
...
...
@@ -68,7 +79,13 @@ def istft(x, n_shift, win_length=None, window="hann", center=True):
return
x
def
stft2logmelspectrogram
(
x_stft
,
fs
,
n_mels
,
n_fft
,
fmin
=
None
,
fmax
=
None
,
eps
=
1e-10
):
def
stft2logmelspectrogram
(
x_stft
,
fs
,
n_mels
,
n_fft
,
fmin
=
None
,
fmax
=
None
,
eps
=
1e-10
):
# x_stft: (Time, Channel, Freq) or (Time, Freq)
fmin
=
0
if
fmin
is
None
else
fmin
fmax
=
fs
/
2
if
fmax
is
None
else
fmax
...
...
@@ -100,8 +117,7 @@ def logmelspectrogram(
fmin
=
None
,
fmax
=
None
,
eps
=
1e-10
,
pad_mode
=
"reflect"
,
):
pad_mode
=
"reflect"
,
):
# stft: (Time, Channel, Freq) or (Time, Freq)
x_stft
=
stft
(
x
,
...
...
@@ -109,12 +125,16 @@ def logmelspectrogram(
n_shift
=
n_shift
,
win_length
=
win_length
,
window
=
window
,
pad_mode
=
pad_mode
,
)
pad_mode
=
pad_mode
,
)
return
stft2logmelspectrogram
(
x_stft
,
fs
=
fs
,
n_mels
=
n_mels
,
n_fft
=
n_fft
,
fmin
=
fmin
,
fmax
=
fmax
,
eps
=
eps
)
x_stft
,
fs
=
fs
,
n_mels
=
n_mels
,
n_fft
=
n_fft
,
fmin
=
fmin
,
fmax
=
fmax
,
eps
=
eps
)
class
Spectrogram
():
...
...
@@ -125,16 +145,13 @@ class Spectrogram():
self
.
window
=
window
def
__repr__
(
self
):
return
(
"{name}(n_fft={n_fft}, n_shift={n_shift}, "
return
(
"{name}(n_fft={n_fft}, n_shift={n_shift}, "
"win_length={win_length}, window={window})"
.
format
(
name
=
self
.
__class__
.
__name__
,
n_fft
=
self
.
n_fft
,
n_shift
=
self
.
n_shift
,
win_length
=
self
.
win_length
,
window
=
self
.
window
,
)
)
window
=
self
.
window
,
))
def
__call__
(
self
,
x
):
return
spectrogram
(
...
...
@@ -142,8 +159,7 @@ class Spectrogram():
n_fft
=
self
.
n_fft
,
n_shift
=
self
.
n_shift
,
win_length
=
self
.
win_length
,
window
=
self
.
window
,
)
window
=
self
.
window
,
)
class
LogMelSpectrogram
():
...
...
@@ -157,8 +173,7 @@ class LogMelSpectrogram():
window
=
"hann"
,
fmin
=
None
,
fmax
=
None
,
eps
=
1e-10
,
):
eps
=
1e-10
,
):
self
.
fs
=
fs
self
.
n_mels
=
n_mels
self
.
n_fft
=
n_fft
...
...
@@ -170,8 +185,7 @@ class LogMelSpectrogram():
self
.
eps
=
eps
def
__repr__
(
self
):
return
(
"{name}(fs={fs}, n_mels={n_mels}, n_fft={n_fft}, "
return
(
"{name}(fs={fs}, n_mels={n_mels}, n_fft={n_fft}, "
"n_shift={n_shift}, win_length={win_length}, window={window}, "
"fmin={fmin}, fmax={fmax}, eps={eps}))"
.
format
(
name
=
self
.
__class__
.
__name__
,
...
...
@@ -183,9 +197,7 @@ class LogMelSpectrogram():
window
=
self
.
window
,
fmin
=
self
.
fmin
,
fmax
=
self
.
fmax
,
eps
=
self
.
eps
,
)
)
eps
=
self
.
eps
,
))
def
__call__
(
self
,
x
):
return
logmelspectrogram
(
...
...
@@ -195,8 +207,7 @@ class LogMelSpectrogram():
n_fft
=
self
.
n_fft
,
n_shift
=
self
.
n_shift
,
win_length
=
self
.
win_length
,
window
=
self
.
window
,
)
window
=
self
.
window
,
)
class
Stft2LogMelSpectrogram
():
...
...
@@ -209,8 +220,7 @@ class Stft2LogMelSpectrogram():
self
.
eps
=
eps
def
__repr__
(
self
):
return
(
"{name}(fs={fs}, n_mels={n_mels}, n_fft={n_fft}, "
return
(
"{name}(fs={fs}, n_mels={n_mels}, n_fft={n_fft}, "
"fmin={fmin}, fmax={fmax}, eps={eps}))"
.
format
(
name
=
self
.
__class__
.
__name__
,
fs
=
self
.
fs
,
...
...
@@ -218,9 +228,7 @@ class Stft2LogMelSpectrogram():
n_fft
=
self
.
n_fft
,
fmin
=
self
.
fmin
,
fmax
=
self
.
fmax
,
eps
=
self
.
eps
,
)
)
eps
=
self
.
eps
,
))
def
__call__
(
self
,
x
):
return
stft2logmelspectrogram
(
...
...
@@ -229,8 +237,7 @@ class Stft2LogMelSpectrogram():
n_mels
=
self
.
n_mels
,
n_fft
=
self
.
n_fft
,
fmin
=
self
.
fmin
,
fmax
=
self
.
fmax
,
)
fmax
=
self
.
fmax
,
)
class
Stft
():
...
...
@@ -241,8 +248,7 @@ class Stft():
win_length
=
None
,
window
=
"hann"
,
center
=
True
,
pad_mode
=
"reflect"
,
):
pad_mode
=
"reflect"
,
):
self
.
n_fft
=
n_fft
self
.
n_shift
=
n_shift
self
.
win_length
=
win_length
...
...
@@ -251,8 +257,7 @@ class Stft():
self
.
pad_mode
=
pad_mode
def
__repr__
(
self
):
return
(
"{name}(n_fft={n_fft}, n_shift={n_shift}, "
return
(
"{name}(n_fft={n_fft}, n_shift={n_shift}, "
"win_length={win_length}, window={window},"
"center={center}, pad_mode={pad_mode})"
.
format
(
name
=
self
.
__class__
.
__name__
,
...
...
@@ -261,9 +266,7 @@ class Stft():
win_length
=
self
.
win_length
,
window
=
self
.
window
,
center
=
self
.
center
,
pad_mode
=
self
.
pad_mode
,
)
)
pad_mode
=
self
.
pad_mode
,
))
def
__call__
(
self
,
x
):
return
stft
(
...
...
@@ -273,8 +276,7 @@ class Stft():
win_length
=
self
.
win_length
,
window
=
self
.
window
,
center
=
self
.
center
,
pad_mode
=
self
.
pad_mode
,
)
pad_mode
=
self
.
pad_mode
,
)
class
IStft
():
...
...
@@ -285,17 +287,14 @@ class IStft():
self
.
center
=
center
def
__repr__
(
self
):
return
(
"{name}(n_shift={n_shift}, "
return
(
"{name}(n_shift={n_shift}, "
"win_length={win_length}, window={window},"
"center={center})"
.
format
(
name
=
self
.
__class__
.
__name__
,
n_shift
=
self
.
n_shift
,
win_length
=
self
.
win_length
,
window
=
self
.
window
,
center
=
self
.
center
,
)
)
center
=
self
.
center
,
))
def
__call__
(
self
,
x
):
return
istft
(
...
...
@@ -303,5 +302,4 @@ class IStft():
self
.
n_shift
,
win_length
=
self
.
win_length
,
window
=
self
.
window
,
center
=
self
.
center
,
)
center
=
self
.
center
,
)
deepspeech/transform/transform_interface.py
浏览文件 @
b878027c
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# TODO(karita): add this to all the transform impl.
class
TransformInterface
:
"""Transform Interface"""
...
...
deepspeech/transform/transformation.py
浏览文件 @
b878027c
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Transformation module."""
from
collections.abc
import
Sequence
from
collections
import
OrderedDict
import
copy
from
inspect
import
signature
import
io
import
logging
from
collections
import
OrderedDict
from
collections.abc
import
Sequence
from
inspect
import
signature
import
yaml
from
deepspeech.utils.dynamic_import
import
dynamic_import
# TODO(karita): inherit TransformInterface
# TODO(karita): register cmd arguments in asr_train.py
import_alias
=
dict
(
...
...
@@ -33,8 +45,7 @@ import_alias = dict(
istft
=
"deepspeech.transform.spectrogram:IStft"
,
stft2fbank
=
"deepspeech.transform.spectrogram:Stft2LogMelSpectrogram"
,
wpe
=
"deepspeech.transform.wpe:WPE"
,
channel_selector
=
"deepspeech.transform.channel_selector:ChannelSelector"
,
)
channel_selector
=
"deepspeech.transform.channel_selector:ChannelSelector"
,
)
class
Transformation
():
...
...
@@ -83,21 +94,16 @@ class Transformation():
# Some function, e.g. built-in function, are failed
pass
else
:
logging
.
error
(
"Expected signature: {}({})"
.
format
(
class_obj
.
__name__
,
signa
)
)
logging
.
error
(
"Expected signature: {}({})"
.
format
(
class_obj
.
__name__
,
signa
))
raise
else
:
raise
NotImplementedError
(
"Not supporting mode={}"
.
format
(
self
.
conf
[
"mode"
])
)
"Not supporting mode={}"
.
format
(
self
.
conf
[
"mode"
]))
def
__repr__
(
self
):
rep
=
"
\n
"
+
"
\n
"
.
join
(
" {}: {}"
.
format
(
k
,
v
)
for
k
,
v
in
self
.
functions
.
items
()
)
rep
=
"
\n
"
+
"
\n
"
.
join
(
" {}: {}"
.
format
(
k
,
v
)
for
k
,
v
in
self
.
functions
.
items
())
return
"{}({})"
.
format
(
self
.
__class__
.
__name__
,
rep
)
def
__call__
(
self
,
xs
,
uttid_list
=
None
,
**
kwargs
):
...
...
@@ -130,18 +136,19 @@ class Transformation():
_kwargs
=
{
k
:
v
for
k
,
v
in
kwargs
.
items
()
if
k
in
param
}
try
:
if
uttid_list
is
not
None
and
"uttid"
in
param
:
xs
=
[
func
(
x
,
u
,
**
_kwargs
)
for
x
,
u
in
zip
(
xs
,
uttid_list
)]
xs
=
[
func
(
x
,
u
,
**
_kwargs
)
for
x
,
u
in
zip
(
xs
,
uttid_list
)
]
else
:
xs
=
[
func
(
x
,
**
_kwargs
)
for
x
in
xs
]
except
Exception
:
logging
.
fatal
(
"Catch a exception from {}th func: {}"
.
format
(
idx
,
func
)
)
logging
.
fatal
(
"Catch a exception from {}th func: {}"
.
format
(
idx
,
func
))
raise
else
:
raise
NotImplementedError
(
"Not supporting mode={}"
.
format
(
self
.
conf
[
"mode"
])
)
"Not supporting mode={}"
.
format
(
self
.
conf
[
"mode"
]))
if
is_batch
:
return
xs
...
...
deepspeech/transform/wpe.py
浏览文件 @
b878027c
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
nara_wpe.wpe
import
wpe
class
WPE
(
object
):
def
__init__
(
self
,
taps
=
10
,
delay
=
3
,
iterations
=
3
,
psd_context
=
0
,
statistics_mode
=
"full"
):
def
__init__
(
self
,
taps
=
10
,
delay
=
3
,
iterations
=
3
,
psd_context
=
0
,
statistics_mode
=
"full"
):
self
.
taps
=
taps
self
.
delay
=
delay
self
.
iterations
=
iterations
...
...
@@ -12,8 +28,7 @@ class WPE(object):
self
.
statistics_mode
=
statistics_mode
def
__repr__
(
self
):
return
(
"{name}(taps={taps}, delay={delay}"
return
(
"{name}(taps={taps}, delay={delay}"
"iterations={iterations}, psd_context={psd_context}, "
"statistics_mode={statistics_mode})"
.
format
(
name
=
self
.
__class__
.
__name__
,
...
...
@@ -21,9 +36,7 @@ class WPE(object):
delay
=
self
.
delay
,
iterations
=
self
.
iterations
,
psd_context
=
self
.
psd_context
,
statistics_mode
=
self
.
statistics_mode
,
)
)
statistics_mode
=
self
.
statistics_mode
,
))
def
__call__
(
self
,
xs
):
"""Return enhanced
...
...
@@ -40,6 +53,5 @@ class WPE(object):
delay
=
self
.
delay
,
iterations
=
self
.
iterations
,
psd_context
=
self
.
psd_context
,
statistics_mode
=
self
.
statistics_mode
,
)
statistics_mode
=
self
.
statistics_mode
,
)
return
xs
.
transpose
(
2
,
1
,
0
)
deepspeech/utils/check_kwargs.py
浏览文件 @
b878027c
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
inspect
...
...
@@ -17,4 +30,5 @@ def check_kwargs(func, kwargs, name=None):
name
=
func
.
__name__
for
k
in
kwargs
.
keys
():
if
k
not
in
params
:
raise
TypeError
(
f
"
{
name
}
() got an unexpected keyword argument '
{
k
}
'"
)
raise
TypeError
(
f
"
{
name
}
() got an unexpected keyword argument '
{
k
}
'"
)
deepspeech/utils/spec_augment.py
浏览文件 @
b878027c
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
examples/librispeech/s2/README.md
浏览文件 @
b878027c
parakeet/data/batch.py
浏览文件 @
b878027c
...
...
@@ -53,8 +53,8 @@ def batch_text_id(minibatch, pad_id=0, dtype=np.int64):
peek_example
=
minibatch
[
0
]
assert
len
(
peek_example
.
shape
)
==
1
,
"text example is an 1D tensor"
lengths
=
[
example
.
shape
[
0
]
for
example
in
minibatch
]
# assume (channel, n_samples) or (n_samples, )
lengths
=
[
example
.
shape
[
0
]
for
example
in
minibatch
]
# assume (channel, n_samples) or (n_samples, )
max_len
=
np
.
max
(
lengths
)
batch
=
[]
...
...
parakeet/exps/tacotron2/ljspeech.py
浏览文件 @
b878027c
...
...
@@ -67,19 +67,16 @@ class LJSpeechCollector(object):
# Sort by text_len in descending order
texts
=
[
i
for
i
,
_
in
sorted
(
i
for
i
,
_
in
sorted
(
zip
(
texts
,
text_lens
),
key
=
lambda
x
:
x
[
1
],
reverse
=
True
)
]
mels
=
[
i
for
i
,
_
in
sorted
(
i
for
i
,
_
in
sorted
(
zip
(
mels
,
text_lens
),
key
=
lambda
x
:
x
[
1
],
reverse
=
True
)
]
mel_lens
=
[
i
for
i
,
_
in
sorted
(
i
for
i
,
_
in
sorted
(
zip
(
mel_lens
,
text_lens
),
key
=
lambda
x
:
x
[
1
],
reverse
=
True
)
]
...
...
requirements.txt
浏览文件 @
b878027c
...
...
@@ -13,7 +13,7 @@ librosa
llvmlite
loguru
matplotlib
nltk
n
ara_wpen
ltk
numba
numpy
==1.20.0
pandas
...
...
@@ -42,4 +42,3 @@ visualdl==2.2.0
webrtcvad
yacs
yq
nara_wpe
\ No newline at end of file
utils/feat-to-shape.py
浏览文件 @
b878027c
...
...
@@ -12,33 +12,32 @@ from deepspeech.utils.cli_utils import is_scipy_wav_style
def
get_parser
():
parser
=
argparse
.
ArgumentParser
(
description
=
"convert feature to its shape"
,
formatter_class
=
argparse
.
ArgumentDefaultsHelpFormatter
,
)
parser
.
add_argument
(
"--verbose"
,
"-V"
,
default
=
0
,
type
=
int
,
help
=
"Verbose option"
)
formatter_class
=
argparse
.
ArgumentDefaultsHelpFormatter
,
)
parser
.
add_argument
(
"--verbose"
,
"-V"
,
default
=
0
,
type
=
int
,
help
=
"Verbose option"
)
parser
.
add_argument
(
"--filetype"
,
type
=
str
,
default
=
"mat"
,
choices
=
[
"mat"
,
"hdf5"
,
"sound.hdf5"
,
"sound"
],
help
=
"Specify the file format for the rspecifier. "
'"mat" is the matrix format in kaldi'
,
)
'"mat" is the matrix format in kaldi'
,
)
parser
.
add_argument
(
"--preprocess-conf"
,
type
=
str
,
default
=
None
,
help
=
"The configuration file for the pre-processing"
,
)
help
=
"The configuration file for the pre-processing"
,
)
parser
.
add_argument
(
"rspecifier"
,
type
=
str
,
help
=
"Read specifier for feats. e.g. ark:some.ark"
)
"rspecifier"
,
type
=
str
,
help
=
"Read specifier for feats. e.g. ark:some.ark"
)
parser
.
add_argument
(
"out"
,
nargs
=
"?"
,
type
=
argparse
.
FileType
(
"w"
),
default
=
sys
.
stdout
,
help
=
"The output filename. "
"If omitted, then output to sys.stdout"
,
)
help
=
"The output filename. "
"If omitted, then output to sys.stdout"
,
)
return
parser
...
...
@@ -64,8 +63,7 @@ def main():
# so change to file_reader_helper to return shape.
# This make sense only with filetype="hdf5".
for
utt
,
mat
in
file_reader_helper
(
args
.
rspecifier
,
args
.
filetype
,
return_shape
=
preprocessing
is
None
):
args
.
rspecifier
,
args
.
filetype
,
return_shape
=
preprocessing
is
None
):
if
preprocessing
is
not
None
:
if
is_scipy_wav_style
(
mat
):
# If data is sound file, then got as Tuple[int, ndarray]
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录