Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
aaeef54f
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 2 年 前同步成功
通知
210
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
aaeef54f
编写于
7月 01, 2021
作者:
H
Haoxin Ma
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix
上级
82ca0f65
变更
6
隐藏空白更改
内联
并排
Showing
6 changed file
with
28 addition
and
33 deletion
+28
-33
deepspeech/frontend/augmentor/augmentation.py
deepspeech/frontend/augmentor/augmentation.py
+3
-3
deepspeech/frontend/augmentor/shift_perturb.py
deepspeech/frontend/augmentor/shift_perturb.py
+2
-4
deepspeech/frontend/augmentor/spec_augment.py
deepspeech/frontend/augmentor/spec_augment.py
+7
-9
deepspeech/frontend/augmentor/speed_perturb.py
deepspeech/frontend/augmentor/speed_perturb.py
+3
-4
deepspeech/frontend/featurizer/audio_featurizer.py
deepspeech/frontend/featurizer/audio_featurizer.py
+2
-2
deepspeech/io/collator.py
deepspeech/io/collator.py
+11
-11
未找到文件。
deepspeech/frontend/augmentor/augmentation.py
浏览文件 @
aaeef54f
...
@@ -103,7 +103,7 @@ class AugmentationPipeline():
...
@@ -103,7 +103,7 @@ class AugmentationPipeline():
"""
"""
for
augmentor
,
rate
in
zip
(
self
.
_augmentors
,
self
.
_rates
):
for
augmentor
,
rate
in
zip
(
self
.
_augmentors
,
self
.
_rates
):
augmentor
.
randomize_parameters
()
augmentor
.
randomize_parameters
()
def
randomize_parameters_feature_transform
(
self
,
n_frames
,
n_bins
):
def
randomize_parameters_feature_transform
(
self
,
n_frames
,
n_bins
):
"""Run the pre-processing pipeline for data augmentation.
"""Run the pre-processing pipeline for data augmentation.
...
@@ -142,7 +142,7 @@ class AugmentationPipeline():
...
@@ -142,7 +142,7 @@ class AugmentationPipeline():
# """Run the pre-processing pipeline for data augmentation.
# """Run the pre-processing pipeline for data augmentation.
# Note that this is an in-place transformation.
# Note that this is an in-place transformation.
# :param audio_segment: Audio segment to process.
# :param audio_segment: Audio segment to process.
# :type audio_segment: AudioSegmenet|SpeechSegment
# :type audio_segment: AudioSegmenet|SpeechSegment
# """
# """
...
@@ -152,7 +152,7 @@ class AugmentationPipeline():
...
@@ -152,7 +152,7 @@ class AugmentationPipeline():
# def transform_feature(self, spec_segment, single=True):
# def transform_feature(self, spec_segment, single=True):
# """spectrogram augmentation.
# """spectrogram augmentation.
# Args:
# Args:
# spec_segment (np.ndarray): audio feature, (D, T).
# spec_segment (np.ndarray): audio feature, (D, T).
# """
# """
...
...
deepspeech/frontend/augmentor/shift_perturb.py
浏览文件 @
aaeef54f
...
@@ -32,7 +32,8 @@ class ShiftPerturbAugmentor(AugmentorBase):
...
@@ -32,7 +32,8 @@ class ShiftPerturbAugmentor(AugmentorBase):
self
.
_rng
=
rng
self
.
_rng
=
rng
def
randomize_parameters
(
self
):
def
randomize_parameters
(
self
):
self
.
shift_ms
=
self
.
_rng
.
uniform
(
self
.
_min_shift_ms
,
self
.
_max_shift_ms
)
self
.
shift_ms
=
self
.
_rng
.
uniform
(
self
.
_min_shift_ms
,
self
.
_max_shift_ms
)
def
apply
(
self
,
audio_segment
):
def
apply
(
self
,
audio_segment
):
audio_segment
.
shift
(
self
.
shift_ms
)
audio_segment
.
shift
(
self
.
shift_ms
)
...
@@ -49,7 +50,6 @@ class ShiftPerturbAugmentor(AugmentorBase):
...
@@ -49,7 +50,6 @@ class ShiftPerturbAugmentor(AugmentorBase):
# self.randomize_parameters()
# self.randomize_parameters()
# self.apply(audio_segment)
# self.apply(audio_segment)
# def transform_audio(self, audio_segment):
# def transform_audio(self, audio_segment):
# """Shift audio.
# """Shift audio.
...
@@ -60,5 +60,3 @@ class ShiftPerturbAugmentor(AugmentorBase):
...
@@ -60,5 +60,3 @@ class ShiftPerturbAugmentor(AugmentorBase):
# """
# """
# shift_ms = self._rng.uniform(self._min_shift_ms, self._max_shift_ms)
# shift_ms = self._rng.uniform(self._min_shift_ms, self._max_shift_ms)
# audio_segment.shift(shift_ms)
# audio_segment.shift(shift_ms)
deepspeech/frontend/augmentor/spec_augment.py
浏览文件 @
aaeef54f
...
@@ -123,18 +123,18 @@ class SpecAugmentor(AugmentorBase):
...
@@ -123,18 +123,18 @@ class SpecAugmentor(AugmentorBase):
def
time_warp
(
xs
,
W
=
40
):
def
time_warp
(
xs
,
W
=
40
):
raise
NotImplementedError
raise
NotImplementedError
def
randomize_parameters
(
self
,
n_frames
,
n_bins
):
def
randomize_parameters
(
self
,
n_frames
,
n_bins
):
# n_bins = xs.shape[0]
# n_bins = xs.shape[0]
# n_frames = xs.shape[1]
# n_frames = xs.shape[1]
self
.
f
=
[]
self
.
f
=
[]
self
.
f_0
=
[]
self
.
f_0
=
[]
self
.
t
=
[]
self
.
t
=
[]
self
.
t_0
=
[]
self
.
t_0
=
[]
for
i
in
range
(
0
,
self
.
n_freq_masks
):
for
i
in
range
(
0
,
self
.
n_freq_masks
):
f
=
int
(
self
.
_rng
.
uniform
(
low
=
0
,
high
=
self
.
F
))
f
=
int
(
self
.
_rng
.
uniform
(
low
=
0
,
high
=
self
.
F
))
self
.
f
.
append
(
f
)
self
.
f
.
append
(
f
)
self
.
f_0
.
append
(
int
(
self
.
_rng
.
uniform
(
low
=
0
,
high
=
n_bins
-
f
)))
self
.
f_0
.
append
(
int
(
self
.
_rng
.
uniform
(
low
=
0
,
high
=
n_bins
-
f
)))
...
@@ -166,7 +166,7 @@ class SpecAugmentor(AugmentorBase):
...
@@ -166,7 +166,7 @@ class SpecAugmentor(AugmentorBase):
f_0
=
self
.
f_0
[
i
]
f_0
=
self
.
f_0
[
i
]
xs
[:,
f_0
:
f_0
+
f
]
=
0
xs
[:,
f_0
:
f_0
+
f
]
=
0
assert
f_0
<=
f_0
+
f
assert
f_0
<=
f_0
+
f
for
i
in
range
(
self
.
n_masks
):
for
i
in
range
(
self
.
n_masks
):
t
=
self
.
t
[
i
]
t
=
self
.
t
[
i
]
t_0
=
self
.
t_0
[
i
]
t_0
=
self
.
t_0
[
i
]
...
@@ -174,7 +174,6 @@ class SpecAugmentor(AugmentorBase):
...
@@ -174,7 +174,6 @@ class SpecAugmentor(AugmentorBase):
assert
t_0
<=
t_0
+
t
assert
t_0
<=
t_0
+
t
return
xs
return
xs
# def mask_freq(self, xs, replace_with_zero=False):
# def mask_freq(self, xs, replace_with_zero=False):
# n_bins = xs.shape[0]
# n_bins = xs.shape[0]
# for i in range(0, self.n_freq_masks):
# for i in range(0, self.n_freq_masks):
...
@@ -208,7 +207,6 @@ class SpecAugmentor(AugmentorBase):
...
@@ -208,7 +207,6 @@ class SpecAugmentor(AugmentorBase):
# self._time_mask = (t_0, t_0 + t)
# self._time_mask = (t_0, t_0 + t)
# return xs
# return xs
# def transform_feature(self, xs: np.ndarray, single=True):
# def transform_feature(self, xs: np.ndarray, single=True):
# """
# """
# Args:
# Args:
...
...
deepspeech/frontend/augmentor/speed_perturb.py
浏览文件 @
aaeef54f
...
@@ -79,7 +79,6 @@ class SpeedPerturbAugmentor(AugmentorBase):
...
@@ -79,7 +79,6 @@ class SpeedPerturbAugmentor(AugmentorBase):
self
.
_rates
=
np
.
linspace
(
self
.
_rates
=
np
.
linspace
(
self
.
_min_rate
,
self
.
_max_rate
,
self
.
_num_rates
,
endpoint
=
True
)
self
.
_min_rate
,
self
.
_max_rate
,
self
.
_num_rates
,
endpoint
=
True
)
def
randomize_parameters
(
self
):
def
randomize_parameters
(
self
):
if
self
.
_num_rates
<
0
:
if
self
.
_num_rates
<
0
:
self
.
speed_rate
=
self
.
_rng
.
uniform
(
self
.
_min_rate
,
self
.
_max_rate
)
self
.
speed_rate
=
self
.
_rng
.
uniform
(
self
.
_min_rate
,
self
.
_max_rate
)
...
@@ -92,8 +91,8 @@ class SpeedPerturbAugmentor(AugmentorBase):
...
@@ -92,8 +91,8 @@ class SpeedPerturbAugmentor(AugmentorBase):
return
return
audio_segment
.
change_speed
(
speed_rate
)
audio_segment
.
change_speed
(
speed_rate
)
def
transform_audio
(
self
,
audio_segment
,
single
=
True
):
def
transform_audio
(
self
,
audio_segment
,
single
=
True
):
"""Sample a new speed rate from the given range and
"""Sample a new speed rate from the given range and
changes the speed of the given audio clip.
changes the speed of the given audio clip.
...
@@ -102,7 +101,7 @@ class SpeedPerturbAugmentor(AugmentorBase):
...
@@ -102,7 +101,7 @@ class SpeedPerturbAugmentor(AugmentorBase):
:param audio_segment: Audio segment to add effects to.
:param audio_segment: Audio segment to add effects to.
:type audio_segment: AudioSegment|SpeechSegment
:type audio_segment: AudioSegment|SpeechSegment
"""
"""
if
(
single
):
if
(
single
):
self
.
randomize_parameters
()
self
.
randomize_parameters
()
self
.
apply
(
audio_segment
)
self
.
apply
(
audio_segment
)
...
...
deepspeech/frontend/featurizer/audio_featurizer.py
浏览文件 @
aaeef54f
...
@@ -195,7 +195,7 @@ class AudioFeaturizer(object):
...
@@ -195,7 +195,7 @@ class AudioFeaturizer(object):
ind
=
np
.
where
(
freqs
<=
max_freq
)[
0
][
-
1
]
+
1
ind
=
np
.
where
(
freqs
<=
max_freq
)[
0
][
-
1
]
+
1
specgram
=
np
.
log
(
specgram
[:
ind
,
:]
+
eps
)
specgram
=
np
.
log
(
specgram
[:
ind
,
:]
+
eps
)
specgram
=
np
.
transpose
(
specgram
)
#T,D
specgram
=
np
.
transpose
(
specgram
)
#T,D
return
specgram
return
specgram
def
_specgram_real
(
self
,
samples
,
window_size
,
stride_size
,
sample_rate
):
def
_specgram_real
(
self
,
samples
,
window_size
,
stride_size
,
sample_rate
):
...
@@ -299,7 +299,7 @@ class AudioFeaturizer(object):
...
@@ -299,7 +299,7 @@ class AudioFeaturizer(object):
ceplifter
=
22
,
ceplifter
=
22
,
useEnergy
=
True
,
useEnergy
=
True
,
winfunc
=
'povey'
)
winfunc
=
'povey'
)
mfcc_feat
=
np
.
transpose
(
mfcc_feat
)
mfcc_feat
=
np
.
transpose
(
mfcc_feat
)
if
delta_delta
:
if
delta_delta
:
mfcc_feat
=
self
.
_concat_delta_delta
(
mfcc_feat
)
mfcc_feat
=
self
.
_concat_delta_delta
(
mfcc_feat
)
...
...
deepspeech/io/collator.py
浏览文件 @
aaeef54f
...
@@ -173,7 +173,6 @@ class SpeechCollator():
...
@@ -173,7 +173,6 @@ class SpeechCollator():
self
.
_stride_ms
=
stride_ms
self
.
_stride_ms
=
stride_ms
self
.
_target_sample_rate
=
target_sample_rate
self
.
_target_sample_rate
=
target_sample_rate
self
.
_speech_featurizer
=
SpeechFeaturizer
(
self
.
_speech_featurizer
=
SpeechFeaturizer
(
unit_type
=
unit_type
,
unit_type
=
unit_type
,
...
@@ -229,9 +228,10 @@ class SpeechCollator():
...
@@ -229,9 +228,10 @@ class SpeechCollator():
def
randomize_audio_parameters
(
self
):
def
randomize_audio_parameters
(
self
):
self
.
_augmentation_pipeline
.
randomize_parameters_audio_transform
()
self
.
_augmentation_pipeline
.
randomize_parameters_audio_transform
()
def
randomize_feature_parameters
(
self
,
n_frames
,
n_bins
):
def
randomize_feature_parameters
(
self
,
n_frames
,
n_bins
):
self
.
_augmentation_pipeline
.
randomize_parameters_feature_transform
(
n_frames
,
n_bins
)
self
.
_augmentation_pipeline
.
randomize_parameters_feature_transform
(
n_frames
,
n_bins
)
def
process_feature_and_transform
(
self
,
audio_file
,
transcript
):
def
process_feature_and_transform
(
self
,
audio_file
,
transcript
):
"""Load, augment, featurize and normalize for speech data.
"""Load, augment, featurize and normalize for speech data.
...
@@ -252,7 +252,7 @@ class SpeechCollator():
...
@@ -252,7 +252,7 @@ class SpeechCollator():
# Spectrum transform
# Spectrum transform
specgram
,
transcript_part
=
self
.
_speech_featurizer
.
featurize
(
specgram
,
transcript_part
=
self
.
_speech_featurizer
.
featurize
(
speech_segment
,
self
.
_keep_transcription_text
)
speech_segment
,
self
.
_keep_transcription_text
)
if
self
.
_normalizer
:
if
self
.
_normalizer
:
specgram
=
self
.
_normalizer
.
apply
(
specgram
)
specgram
=
self
.
_normalizer
.
apply
(
specgram
)
...
@@ -261,7 +261,6 @@ class SpeechCollator():
...
@@ -261,7 +261,6 @@ class SpeechCollator():
return
specgram
,
transcript_part
return
specgram
,
transcript_part
# def process_utterance(self, audio_file, transcript, single=True):
# def process_utterance(self, audio_file, transcript, single=True):
# """Load, augment, featurize and normalize for speech data.
# """Load, augment, featurize and normalize for speech data.
...
@@ -282,11 +281,10 @@ class SpeechCollator():
...
@@ -282,11 +281,10 @@ class SpeechCollator():
# # audio augment
# # audio augment
# self._augmentation_pipeline.transform_audio(speech_segment)
# self._augmentation_pipeline.transform_audio(speech_segment)
# # Spectrum transform
# # Spectrum transform
# specgram, transcript_part = self._speech_featurizer.featurize(
# specgram, transcript_part = self._speech_featurizer.featurize(
# speech_segment, self._keep_transcription_text)
# speech_segment, self._keep_transcription_text)
# if self._normalizer:
# if self._normalizer:
# specgram = self._normalizer.apply(specgram)
# specgram = self._normalizer.apply(specgram)
...
@@ -350,14 +348,16 @@ class SpeechCollator():
...
@@ -350,14 +348,16 @@ class SpeechCollator():
padded_texts
=
pad_sequence
(
padded_texts
=
pad_sequence
(
texts
,
padding_value
=
IGNORE_ID
).
astype
(
np
.
int64
)
texts
,
padding_value
=
IGNORE_ID
).
astype
(
np
.
int64
)
text_lens
=
np
.
array
(
text_lens
).
astype
(
np
.
int64
)
text_lens
=
np
.
array
(
text_lens
).
astype
(
np
.
int64
)
#spec augment
#spec augment
n_bins
=
padded_audios
.
shape
[
2
]
n_bins
=
padded_audios
.
shape
[
2
]
self
.
randomize_feature_parameters
(
min
(
audio_lens
),
n_bins
)
self
.
randomize_feature_parameters
(
min
(
audio_lens
),
n_bins
)
for
i
in
range
(
len
(
padded_audios
)):
for
i
in
range
(
len
(
padded_audios
)):
if
not
self
.
_randomize_each_batch
:
if
not
self
.
_randomize_each_batch
:
self
.
randomize_feature_parameters
(
audio_lens
[
i
],
n_bins
)
self
.
randomize_feature_parameters
(
audio_lens
[
i
],
n_bins
)
padded_audios
[
i
]
=
self
.
_augmentation_pipeline
.
apply_feature_transform
(
padded_audios
[
i
])
padded_audios
[
i
]
=
self
.
_augmentation_pipeline
.
apply_feature_transform
(
padded_audios
[
i
])
return
utts
,
padded_audios
,
audio_lens
,
padded_texts
,
text_lens
return
utts
,
padded_audios
,
audio_lens
,
padded_texts
,
text_lens
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录