Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
17092cbb
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 2 年 前同步成功
通知
210
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
17092cbb
编写于
10月 09, 2021
作者:
H
Hui Zhang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
D,T to T,D
上级
0c9fbaf7
变更
9
隐藏空白更改
内联
并排
Showing
9 changed file
with
122 addition
and
149 deletion
+122
-149
deepspeech/frontend/audio.py
deepspeech/frontend/audio.py
+13
-7
deepspeech/frontend/augmentor/spec_augment.py
deepspeech/frontend/augmentor/spec_augment.py
+4
-4
deepspeech/frontend/featurizer/__init__.py
deepspeech/frontend/featurizer/__init__.py
+3
-0
deepspeech/frontend/featurizer/audio_featurizer.py
deepspeech/frontend/featurizer/audio_featurizer.py
+48
-38
deepspeech/frontend/featurizer/speech_featurizer.py
deepspeech/frontend/featurizer/speech_featurizer.py
+33
-86
deepspeech/frontend/normalizer.py
deepspeech/frontend/normalizer.py
+9
-8
deepspeech/frontend/speech.py
deepspeech/frontend/speech.py
+8
-3
deepspeech/io/collator.py
deepspeech/io/collator.py
+2
-2
examples/librispeech/s1/path.sh
examples/librispeech/s1/path.sh
+2
-1
未找到文件。
deepspeech/frontend/audio.py
浏览文件 @
17092cbb
...
...
@@ -24,8 +24,10 @@ import soundfile
import
soxbindings
as
sox
from
scipy
import
signal
from
.utility
import
subfile_from_tar
class
AudioSegment
(
object
):
class
AudioSegment
():
"""Monaural audio segment abstraction.
:param samples: Audio samples [num_samples x num_channels].
...
...
@@ -68,16 +70,20 @@ class AudioSegment(object):
self
.
duration
,
self
.
rms_db
))
@
classmethod
def
from_file
(
cls
,
file
):
def
from_file
(
cls
,
file
,
infos
=
None
):
"""Create audio segment from audio file.
:param filepath: Filepath or file object to audio file.
:type filepath: str|file
:return: Audio segment instance.
:rtype: AudioSegment
Args:
filepath (str|file): Filepath or file object to audio file.
infos (TarLocalData, optional): tar2obj and tar2infos. Defaults to None.
Returns:
AudioSegment: Audio segment instance.
"""
if
isinstance
(
file
,
str
)
and
re
.
findall
(
r
".seqbin_\d+$"
,
file
):
return
cls
.
from_sequence_file
(
file
)
elif
isinstance
(
file
,
str
)
and
file
.
startswith
(
'tar:'
):
return
cls
.
from_file
(
subfile_from_tar
(
file
,
infos
))
else
:
samples
,
sample_rate
=
soundfile
.
read
(
file
,
dtype
=
'float32'
)
return
cls
(
samples
,
sample_rate
)
...
...
deepspeech/frontend/augmentor/spec_augment.py
浏览文件 @
17092cbb
...
...
@@ -29,10 +29,10 @@ class SpecAugmentor(AugmentorBase):
SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
https://arxiv.org/abs/1904.08779
SpecAugment on Large Scale Datasets
https://arxiv.org/abs/1912.05533
"""
def
__init__
(
self
,
...
...
@@ -61,7 +61,7 @@ class SpecAugmentor(AugmentorBase):
adaptive_size_ratio (float): adaptive size ratio for time masking
max_n_time_masks (int): maximum number of time masking
replace_with_zero (bool): pad zero on mask if true else use mean
warp_mode (str): "PIL" (default, fast, not differentiable)
warp_mode (str): "PIL" (default, fast, not differentiable)
or "sparse_image_warp" (slow, differentiable)
"""
super
().
__init__
()
...
...
@@ -133,7 +133,7 @@ class SpecAugmentor(AugmentorBase):
return
self
.
_time_mask
def
__repr__
(
self
):
return
f
"specaug: F-
{
F
}
, T-
{
T
}
, F-n-
{
n_freq_masks
}
, T-n-
{
n_time_masks
}
"
return
f
"specaug: F-
{
self
.
F
}
, T-
{
self
.
T
}
, F-n-
{
self
.
n_freq_masks
}
, T-n-
{
self
.
n_time_masks
}
"
def
time_warp
(
self
,
x
,
mode
=
'PIL'
):
"""time warp for spec augment
...
...
deepspeech/frontend/featurizer/__init__.py
浏览文件 @
17092cbb
...
...
@@ -11,3 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
.audio_featurizer
import
AudioFeaturizer
#noqa: F401
from
.speech_featurizer
import
SpeechFeaturizer
from
.text_featurizer
import
TextFeaturizer
deepspeech/frontend/featurizer/audio_featurizer.py
浏览文件 @
17092cbb
...
...
@@ -18,7 +18,7 @@ from python_speech_features import logfbank
from
python_speech_features
import
mfcc
class
AudioFeaturizer
(
object
):
class
AudioFeaturizer
():
"""Audio featurizer, for extracting features from audio contents of
AudioSegment or SpeechSegment.
...
...
@@ -167,32 +167,6 @@ class AudioFeaturizer(object):
raise
ValueError
(
"Unknown spectrum_type %s. "
"Supported values: linear."
%
self
.
_spectrum_type
)
def
_compute_linear_specgram
(
self
,
samples
,
sample_rate
,
stride_ms
=
10.0
,
window_ms
=
20.0
,
max_freq
=
None
,
eps
=
1e-14
):
"""Compute the linear spectrogram from FFT energy."""
if
max_freq
is
None
:
max_freq
=
sample_rate
/
2
if
max_freq
>
sample_rate
/
2
:
raise
ValueError
(
"max_freq must not be greater than half of "
"sample rate."
)
if
stride_ms
>
window_ms
:
raise
ValueError
(
"Stride size must not be greater than "
"window size."
)
stride_size
=
int
(
0.001
*
sample_rate
*
stride_ms
)
window_size
=
int
(
0.001
*
sample_rate
*
window_ms
)
specgram
,
freqs
=
self
.
_specgram_real
(
samples
,
window_size
=
window_size
,
stride_size
=
stride_size
,
sample_rate
=
sample_rate
)
ind
=
np
.
where
(
freqs
<=
max_freq
)[
0
][
-
1
]
+
1
return
np
.
log
(
specgram
[:
ind
,
:]
+
eps
)
def
_specgram_real
(
self
,
samples
,
window_size
,
stride_size
,
sample_rate
):
"""Compute the spectrogram for samples from a real signal."""
# extract strided windows
...
...
@@ -217,26 +191,65 @@ class AudioFeaturizer(object):
freqs
=
float
(
sample_rate
)
/
window_size
*
np
.
arange
(
fft
.
shape
[
0
])
return
fft
,
freqs
def
_compute_linear_specgram
(
self
,
samples
,
sample_rate
,
stride_ms
=
10.0
,
window_ms
=
20.0
,
max_freq
=
None
,
eps
=
1e-14
):
"""Compute the linear spectrogram from FFT energy.
Args:
samples ([type]): [description]
sample_rate ([type]): [description]
stride_ms (float, optional): [description]. Defaults to 10.0.
window_ms (float, optional): [description]. Defaults to 20.0.
max_freq ([type], optional): [description]. Defaults to None.
eps ([type], optional): [description]. Defaults to 1e-14.
Raises:
ValueError: [description]
ValueError: [description]
Returns:
np.ndarray: log spectrogram, (time, freq)
"""
if
max_freq
is
None
:
max_freq
=
sample_rate
/
2
if
max_freq
>
sample_rate
/
2
:
raise
ValueError
(
"max_freq must not be greater than half of "
"sample rate."
)
if
stride_ms
>
window_ms
:
raise
ValueError
(
"Stride size must not be greater than "
"window size."
)
stride_size
=
int
(
0.001
*
sample_rate
*
stride_ms
)
window_size
=
int
(
0.001
*
sample_rate
*
window_ms
)
specgram
,
freqs
=
self
.
_specgram_real
(
samples
,
window_size
=
window_size
,
stride_size
=
stride_size
,
sample_rate
=
sample_rate
)
ind
=
np
.
where
(
freqs
<=
max_freq
)[
0
][
-
1
]
+
1
# (freq, time)
spec
=
np
.
log
(
specgram
[:
ind
,
:]
+
eps
)
return
np
.
transpose
(
spec
)
def
_concat_delta_delta
(
self
,
feat
):
"""append delat, delta-delta feature.
Args:
feat (np.ndarray): (
D, T
)
feat (np.ndarray): (
T, D
)
Returns:
np.ndarray: feat with delta-delta, (
3*D, T
)
np.ndarray: feat with delta-delta, (
T, 3*D
)
"""
feat
=
np
.
transpose
(
feat
)
# Deltas
d_feat
=
delta
(
feat
,
2
)
# Deltas-Deltas
dd_feat
=
delta
(
feat
,
2
)
# transpose
feat
=
np
.
transpose
(
feat
)
d_feat
=
np
.
transpose
(
d_feat
)
dd_feat
=
np
.
transpose
(
dd_feat
)
# concat above three features
concat_feat
=
np
.
concatenate
((
feat
,
d_feat
,
dd_feat
))
concat_feat
=
np
.
concatenate
((
feat
,
d_feat
,
dd_feat
)
,
axis
=
1
)
return
concat_feat
def
_compute_mfcc
(
self
,
...
...
@@ -292,7 +305,6 @@ class AudioFeaturizer(object):
ceplifter
=
22
,
useEnergy
=
True
,
winfunc
=
'povey'
)
mfcc_feat
=
np
.
transpose
(
mfcc_feat
)
if
delta_delta
:
mfcc_feat
=
self
.
_concat_delta_delta
(
mfcc_feat
)
return
mfcc_feat
...
...
@@ -346,8 +358,6 @@ class AudioFeaturizer(object):
remove_dc_offset
=
True
,
preemph
=
0.97
,
wintype
=
'povey'
)
fbank_feat
=
np
.
transpose
(
fbank_feat
)
if
delta_delta
:
fbank_feat
=
self
.
_concat_delta_delta
(
fbank_feat
)
return
fbank_feat
deepspeech/frontend/featurizer/speech_featurizer.py
浏览文件 @
17092cbb
...
...
@@ -16,38 +16,8 @@ from deepspeech.frontend.featurizer.audio_featurizer import AudioFeaturizer
from
deepspeech.frontend.featurizer.text_featurizer
import
TextFeaturizer
class
SpeechFeaturizer
(
object
):
"""Speech featurizer, for extracting features from both audio and transcript
contents of SpeechSegment.
Currently, for audio parts, it supports feature types of linear
spectrogram and mfcc; for transcript parts, it only supports char-level
tokenizing and conversion into a list of token indices. Note that the
token indexing order follows the given vocabulary file.
:param vocab_filepath: Filepath to load vocabulary for token indices
conversion.
:type spectrum_type: str
:param spectrum_type: Specgram feature type. Options: 'linear', 'mfcc'.
:type spectrum_type: str
:param stride_ms: Striding size (in milliseconds) for generating frames.
:type stride_ms: float
:param window_ms: Window size (in milliseconds) for generating frames.
:type window_ms: float
:param max_freq: When spectrum_type is 'linear', only FFT bins
corresponding to frequencies between [0, max_freq] are
returned; when spectrum_type is 'mfcc', max_freq is the
highest band edge of mel filters.
:types max_freq: None|float
:param target_sample_rate: Speech are resampled (if upsampling or
downsampling is allowed) to this before
extracting spectrogram features.
:type target_sample_rate: float
:param use_dB_normalization: Whether to normalize the audio to a certain
decibels before extracting the features.
:type use_dB_normalization: bool
:param target_dB: Target audio decibels for normalization.
:type target_dB: float
class
SpeechFeaturizer
():
"""Speech and Text feature extraction.
"""
def
__init__
(
self
,
...
...
@@ -64,8 +34,12 @@ class SpeechFeaturizer(object):
target_sample_rate
=
16000
,
use_dB_normalization
=
True
,
target_dB
=-
20
,
dither
=
1.0
):
self
.
_audio_featurizer
=
AudioFeaturizer
(
dither
=
1.0
,
maskctc
=
False
):
self
.
stride_ms
=
stride_ms
self
.
window_ms
=
window_ms
self
.
audio_feature
=
AudioFeaturizer
(
spectrum_type
=
spectrum_type
,
feat_dim
=
feat_dim
,
delta_delta
=
delta_delta
,
...
...
@@ -77,8 +51,14 @@ class SpeechFeaturizer(object):
use_dB_normalization
=
use_dB_normalization
,
target_dB
=
target_dB
,
dither
=
dither
)
self
.
_text_featurizer
=
TextFeaturizer
(
unit_type
,
vocab_filepath
,
spm_model_prefix
)
self
.
feature_size
=
self
.
audio_feature
.
feature_size
self
.
text_feature
=
TextFeaturizer
(
unit_type
=
unit_type
,
vocab_filepath
=
vocab_filepath
,
spm_model_prefix
=
spm_model_prefix
,
maskctc
=
maskctc
)
self
.
vocab_size
=
self
.
text_feature
.
vocab_size
def
featurize
(
self
,
speech_segment
,
keep_transcription_text
):
"""Extract features for speech segment.
...
...
@@ -94,66 +74,33 @@ class SpeechFeaturizer(object):
Returns:
tuple: 1) spectrogram audio feature in 2darray, 2) list oftoken indices.
"""
spec_feature
=
self
.
_audio_featurizer
.
featurize
(
speech_segment
)
spec_feature
=
self
.
audio_feature
.
featurize
(
speech_segment
)
if
keep_transcription_text
:
return
spec_feature
,
speech_segment
.
transcript
if
speech_segment
.
has_token
:
text_ids
=
speech_segment
.
token_ids
else
:
text_ids
=
self
.
_text_featurizer
.
featurize
(
speech_segment
.
transcript
)
text_ids
=
self
.
text_feature
.
featurize
(
speech_segment
.
transcript
)
return
spec_feature
,
text_ids
@
property
def
vocab_size
(
self
):
"""Return the vocabulary size.
Returns:
int: Vocabulary size.
"""
return
self
.
_text_featurizer
.
vocab_size
@
property
def
vocab_list
(
self
):
"""Return the vocabulary in list.
Returns:
List[str]:
"""
return
self
.
_text_featurizer
.
vocab_list
@
property
def
vocab_dict
(
self
):
"""Return the vocabulary in dict.
Returns:
Dict[str, int]:
"""
return
self
.
_text_featurizer
.
vocab_dict
@
property
def
feature_size
(
self
):
"""Return the audio feature size.
def
text_featurize
(
self
,
text
,
keep_transcription_text
):
"""Extract features for speech segment.
Returns:
int: audio feature size.
"""
return
self
.
_audio_featurizer
.
feature_size
1. For audio parts, extract the audio features.
2. For transcript parts, keep the original text or convert text string
to a list of token indices in char-level.
@
property
def
stride_ms
(
self
):
"""time length in `ms` unit per frame
Args:
text (str): text.
keep_transcription_text (bool): True, keep transcript text, False, token ids
Returns:
float: time(ms)/frame
(str|List[int]): text, or list of token indices.
"""
return
self
.
_audio_featurizer
.
stride_ms
@
property
def
text_feature
(
self
):
"""Return the text feature object.
if
keep_transcription_text
:
return
text
Returns:
TextFeaturizer: object.
"""
return
self
.
_text_featurizer
text_ids
=
self
.
text_feature
.
featurize
(
text
)
return
text_ids
deepspeech/frontend/normalizer.py
浏览文件 @
17092cbb
...
...
@@ -40,21 +40,21 @@ class CollateFunc(object):
number
=
0
for
item
in
batch
:
audioseg
=
AudioSegment
.
from_file
(
item
[
'feat'
])
feat
=
self
.
feature_func
(
audioseg
)
#(
D, T
)
feat
=
self
.
feature_func
(
audioseg
)
#(
T, D
)
sums
=
np
.
sum
(
feat
,
axis
=
1
)
sums
=
np
.
sum
(
feat
,
axis
=
0
)
if
mean_stat
is
None
:
mean_stat
=
sums
else
:
mean_stat
+=
sums
square_sums
=
np
.
sum
(
np
.
square
(
feat
),
axis
=
1
)
square_sums
=
np
.
sum
(
np
.
square
(
feat
),
axis
=
0
)
if
var_stat
is
None
:
var_stat
=
square_sums
else
:
var_stat
+=
square_sums
number
+=
feat
.
shape
[
1
]
number
+=
feat
.
shape
[
0
]
return
number
,
mean_stat
,
var_stat
...
...
@@ -120,7 +120,7 @@ class FeatureNormalizer(object):
"""Normalize features to be of zero mean and unit stddev.
:param features: Input features to be normalized.
:type features: ndarray, shape (
D, T
)
:type features: ndarray, shape (
T, D
)
:param eps: added to stddev to provide numerical stablibity.
:type eps: float
:return: Normalized features.
...
...
@@ -130,9 +130,10 @@ class FeatureNormalizer(object):
def
_read_mean_std_from_file
(
self
,
filepath
,
eps
=
1e-20
):
"""Load mean and std from file."""
mean
,
istd
=
load_cmvn
(
filepath
,
filetype
=
'json'
)
self
.
_mean
=
np
.
expand_dims
(
mean
,
axis
=-
1
)
self
.
_istd
=
np
.
expand_dims
(
istd
,
axis
=-
1
)
filetype
=
filepath
.
split
(
"."
)[
-
1
]
mean
,
istd
=
load_cmvn
(
filepath
,
filetype
=
filetype
)
self
.
_mean
=
np
.
expand_dims
(
mean
,
axis
=
0
)
self
.
_istd
=
np
.
expand_dims
(
istd
,
axis
=
0
)
def
write_to_file
(
self
,
filepath
):
"""Write the mean and stddev to the file.
...
...
deepspeech/frontend/speech.py
浏览文件 @
17092cbb
...
...
@@ -68,7 +68,12 @@ class SpeechSegment(AudioSegment):
return
not
self
.
__eq__
(
other
)
@
classmethod
def
from_file
(
cls
,
filepath
,
transcript
,
tokens
=
None
,
token_ids
=
None
):
def
from_file
(
cls
,
filepath
,
transcript
,
tokens
=
None
,
token_ids
=
None
,
infos
=
None
):
"""Create speech segment from audio file and corresponding transcript.
Args:
...
...
@@ -76,12 +81,12 @@ class SpeechSegment(AudioSegment):
transcript (str): Transcript text for the speech.
tokens (List[str], optional): text tokens. Defaults to None.
token_ids (List[int], optional): text token ids. Defaults to None.
infos (TarLocalData, optional): tar2obj and tar2infos. Defaults to None.
Returns:
SpeechSegment: Speech segment instance.
"""
audio
=
AudioSegment
.
from_file
(
filepath
)
audio
=
AudioSegment
.
from_file
(
filepath
,
infos
)
return
cls
(
audio
.
samples
,
audio
.
sample_rate
,
transcript
,
tokens
,
token_ids
)
...
...
deepspeech/io/collator.py
浏览文件 @
17092cbb
...
...
@@ -56,8 +56,8 @@ class SpeechCollator():
for
utt
,
audio
,
text
in
batch
:
utts
.
append
(
utt
)
# audio
audios
.
append
(
audio
.
T
)
# [T, D]
audio_lens
.
append
(
audio
.
shape
[
1
])
audios
.
append
(
audio
)
# [T, D]
audio_lens
.
append
(
audio
.
shape
[
0
])
# text
# for training, text is token ids
# else text is string, convert to unicode ord
...
...
examples/librispeech/s1/path.sh
浏览文件 @
17092cbb
...
...
@@ -3,8 +3,9 @@ export MAIN_ROOT=${PWD}/../../../
export
PATH
=
${
MAIN_ROOT
}
:
${
MAIN_ROOT
}
/utils:
${
PATH
}
export
LC_ALL
=
C
export
PYTHONDONTWRITEBYTECODE
=
1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export
PYTHONIOENCODING
=
UTF-8
export
PYTHONIOENCODING
=
UTF-8
export
PYTHONPATH
=
${
MAIN_ROOT
}
:
${
PYTHONPATH
}
export
LD_LIBRARY_PATH
=
${
LD_LIBRARY_PATH
}
:/usr/local/lib/
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录