Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
93ae5999
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 2 年 前同步成功
通知
210
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
93ae5999
编写于
6月 17, 2021
作者:
H
Haoxin Ma
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add resampler and apply
上级
89a00eab
变更
5
隐藏空白更改
内联
并排
Showing
5 changed files
with
139 additions
and
25 deletions
+139
-25
deepspeech/frontend/augmentor/augmentation.py
deepspeech/frontend/augmentor/augmentation.py
+4
-4
deepspeech/frontend/augmentor/shift_perturb.py
deepspeech/frontend/augmentor/shift_perturb.py
+24
-3
deepspeech/frontend/augmentor/spec_augment.py
deepspeech/frontend/augmentor/spec_augment.py
+65
-5
deepspeech/frontend/augmentor/speed_perturb.py
deepspeech/frontend/augmentor/speed_perturb.py
+36
-9
deepspeech/io/collator.py
deepspeech/io/collator.py
+10
-4
未找到文件。
deepspeech/frontend/augmentor/augmentation.py
浏览文件 @
93ae5999
...
...
@@ -93,7 +93,7 @@ class AugmentationPipeline():
self
.
_spec_augmentors
,
self
.
_spec_rates
=
self
.
_parse_pipeline_from
(
augmentation_config
,
'feature'
)
def
transform_audio
(
self
,
audio_segment
):
def
transform_audio
(
self
,
audio_segment
,
single
=
True
):
"""Run the pre-processing pipeline for data augmentation.
Note that this is an in-place transformation.
...
...
@@ -103,9 +103,9 @@ class AugmentationPipeline():
"""
for
augmentor
,
rate
in
zip
(
self
.
_augmentors
,
self
.
_rates
):
if
self
.
_rng
.
uniform
(
0.
,
1.
)
<
rate
:
augmentor
.
transform_audio
(
audio_segment
)
augmentor
.
transform_audio
(
audio_segment
,
single
)
def
transform_feature
(
self
,
spec_segment
):
def
transform_feature
(
self
,
spec_segment
,
single
=
True
):
"""spectrogram augmentation.
Args:
...
...
@@ -113,7 +113,7 @@ class AugmentationPipeline():
"""
for
augmentor
,
rate
in
zip
(
self
.
_spec_augmentors
,
self
.
_spec_rates
):
if
self
.
_rng
.
uniform
(
0.
,
1.
)
<
rate
:
spec_segment
=
augmentor
.
transform_feature
(
spec_segment
)
spec_segment
=
augmentor
.
transform_feature
(
spec_segment
,
single
)
return
spec_segment
def
_parse_pipeline_from
(
self
,
config_json
,
aug_type
=
'audio'
):
...
...
deepspeech/frontend/augmentor/shift_perturb.py
浏览文件 @
93ae5999
...
...
@@ -31,7 +31,13 @@ class ShiftPerturbAugmentor(AugmentorBase):
self
.
_max_shift_ms
=
max_shift_ms
self
.
_rng
=
rng
def
transform_audio
(
self
,
audio_segment
):
def randomize_parameters(self):
    """Sample a new shift offset and cache it on the augmentor.

    The offset is drawn with ``self._rng.uniform`` between
    ``self._min_shift_ms`` and ``self._max_shift_ms`` and stored as
    ``self.shift_ms``, which is the value ``apply`` reads.
    """
    lower, upper = self._min_shift_ms, self._max_shift_ms
    self.shift_ms = self._rng.uniform(lower, upper)
def apply(self, audio_segment):
    """Shift ``audio_segment`` by the cached ``self.shift_ms``.

    The work is delegated to ``audio_segment.shift``; nothing is returned.

    :param audio_segment: Audio segment to add effects to.
    """
    offset_ms = self.shift_ms
    audio_segment.shift(offset_ms)
def
transform_audio
(
self
,
audio_segment
,
single
):
"""Shift audio.
Note that this is an in-place transformation.
...
...
@@ -39,5 +45,20 @@ class ShiftPerturbAugmentor(AugmentorBase):
:param audio_segment: Audio segment to add effects to.
:type audio_segment: AudioSegmenet|SpeechSegment
"""
shift_ms
=
self
.
_rng
.
uniform
(
self
.
_min_shift_ms
,
self
.
_max_shift_ms
)
audio_segment
.
shift
(
shift_ms
)
if
(
single
):
self
.
randomize_parameters
()
self
.
apply
(
audio_segment
)
# def transform_audio(self, audio_segment):
# """Shift audio.
# Note that this is an in-place transformation.
# :param audio_segment: Audio segment to add effects to.
# :type audio_segment: AudioSegmenet|SpeechSegment
# """
# shift_ms = self._rng.uniform(self._min_shift_ms, self._max_shift_ms)
# audio_segment.shift(shift_ms)
deepspeech/frontend/augmentor/spec_augment.py
浏览文件 @
93ae5999
...
...
@@ -123,6 +123,54 @@ class SpecAugmentor(AugmentorBase):
def
time_warp
(
xs
,
W
=
40
):
raise
NotImplementedError
def randomize_parameters(self, xs):
    """Draw the mask geometry for ``xs`` and cache it on the augmentor.

    Populates ``self.f`` / ``self.f_0`` (frequency mask widths and starts),
    ``self.t`` / ``self.t_0`` (time mask widths and starts) and
    ``self.n_masks`` (number of time masks). Nothing is returned and ``xs``
    itself is not modified; only its shape is read.

    Args:
        xs: spectrogram whose ``shape[0]`` is the number of frequency bins
            and ``shape[1]`` the number of frames.
    """
    n_bins = xs.shape[0]
    n_frames = xs.shape[1]
    draw = self._rng.uniform

    # Reset every cached list up front, then fill them through local aliases.
    self.f = freq_widths = []
    self.f_0 = freq_starts = []
    self.t = time_widths = []
    self.t_0 = time_starts = []

    # Frequency masks: width first, then a start that keeps the band in range.
    for _ in range(self.n_freq_masks):
        width = int(draw(low=0, high=self.F))
        freq_widths.append(width)
        freq_starts.append(int(draw(low=0, high=n_bins - width)))

    # Number of time masks: adaptive (capped) or the fixed configured count.
    if self.adaptive_number_ratio > 0:
        adaptive_count = int(n_frames * self.adaptive_number_ratio)
        self.n_masks = min(adaptive_count, self.max_n_time_masks)
    else:
        self.n_masks = self.n_time_masks

    # Upper bound for a time mask width: adaptive or the fixed configured T.
    max_width = (self.adaptive_size_ratio * n_frames
                 if self.adaptive_size_ratio > 0 else self.T)

    for _ in range(self.n_masks):
        width = int(draw(low=0, high=max_width))
        # Never mask more than the fraction ``p`` of the utterance.
        width = min(width, int(n_frames * self.p))
        time_widths.append(width)
        time_starts.append(int(draw(low=0, high=n_frames - width)))
def apply(self, xs: np.ndarray):
    """Zero out the frequency and time masks cached on the augmentor.

    Uses the widths/starts stored in ``self.f`` / ``self.f_0`` and
    ``self.t`` / ``self.t_0`` (as filled in by ``randomize_parameters``).

    Args:
        xs (np.ndarray): spectrogram indexed as `[F, T]`; written in place.

    Returns:
        np.ndarray: the same ``xs`` object with the masked bands set to 0.
    """
    # Frequency masks: blank whole rows.
    for i in range(self.n_freq_masks):
        width, start = self.f[i], self.f_0[i]
        # Sanity check (non-negative width) before touching the data.
        assert start <= start + width
        xs[start:start + width, :] = 0

    # Time masks: blank whole columns.
    for i in range(self.n_masks):
        width, start = self.t[i], self.t_0[i]
        assert start <= start + width
        xs[:, start:start + width] = 0

    return xs
def
mask_freq
(
self
,
xs
,
replace_with_zero
=
False
):
n_bins
=
xs
.
shape
[
0
]
...
...
@@ -157,14 +205,26 @@ class SpecAugmentor(AugmentorBase):
self
.
_time_mask
=
(
t_0
,
t_0
+
t
)
return
xs
def
transform_feature
(
self
,
xs
:
np
.
ndarray
):
def
transform_feature
(
self
,
xs
:
np
.
ndarray
,
single
=
True
):
"""
Args:
xs (FloatTensor): `[F, T]`
Returns:
xs (FloatTensor): `[F, T]`
"""
# xs = self.time_warp(xs)
xs
=
self
.
mask_freq
(
xs
)
xs
=
self
.
mask_time
(
xs
)
return
xs
if
(
single
):
self
.
randomize_parameters
(
xs
)
return
self
.
apply
(
xs
)
# def transform_feature(self, xs: np.ndarray):
# """
# Args:
# xs (FloatTensor): `[F, T]`
# Returns:
# xs (FloatTensor): `[F, T]`
# """
# # xs = self.time_warp(xs)
# xs = self.mask_freq(xs)
# xs = self.mask_time(xs)
# return xs
deepspeech/frontend/augmentor/speed_perturb.py
浏览文件 @
93ae5999
...
...
@@ -79,7 +79,21 @@ class SpeedPerturbAugmentor(AugmentorBase):
self
.
_rates
=
np
.
linspace
(
self
.
_min_rate
,
self
.
_max_rate
,
self
.
_num_rates
,
endpoint
=
True
)
def
transform_audio
(
self
,
audio_segment
):
def randomize_parameters(self):
    """Pick the speed rate for the next ``apply`` call and cache it.

    When ``self._num_rates`` is negative the rate is sampled with
    ``self._rng.uniform`` between ``self._min_rate`` and ``self._max_rate``;
    otherwise one entry of the precomputed ``self._rates`` is chosen.
    The result is stored as ``self.speed_rate``.
    """
    rng = self._rng
    if self._num_rates < 0:
        # Continuous mode: any rate in the configured range.
        self.speed_rate = rng.uniform(self._min_rate, self._max_rate)
        return
    # Discrete mode: one of the precomputed rates.
    self.speed_rate = rng.choice(self._rates)
def apply(self, audio_segment):
    """Change the speed of ``audio_segment`` by the cached ``self.speed_rate``.

    The rate is the one stored by ``randomize_parameters``. The work is
    delegated to ``audio_segment.change_speed``; nothing is returned.

    :param audio_segment: Audio segment to add effects to.
    """
    # Read the rate from the instance: a bare ``speed_rate`` is not defined
    # in this method and would raise NameError.
    rate = self.speed_rate
    # Skip perturbation in case of identity speed rate
    if rate == 1.0:
        return
    audio_segment.change_speed(rate)
def
transform_audio
(
self
,
audio_segment
,
single
=
True
):
"""Sample a new speed rate from the given range and
changes the speed of the given audio clip.
...
...
@@ -88,13 +102,26 @@ class SpeedPerturbAugmentor(AugmentorBase):
:param audio_segment: Audio segment to add effects to.
:type audio_segment: AudioSegment|SpeechSegment
"""
if
self
.
_num_rates
<
0
:
speed_rate
=
self
.
_rng
.
uniform
(
self
.
_min_rate
,
self
.
_max_rate
)
else
:
speed_rate
=
self
.
_rng
.
choice
(
self
.
_rates
)
if
(
single
):
self
.
randomize_parameters
()
self
.
apply
(
audio_segment
)
# Skip perturbation in case of identity speed rate
if
speed_rate
==
1.0
:
return
# def transform_audio(self, audio_segment):
# """Sample a new speed rate from the given range and
# changes the speed of the given audio clip.
audio_segment
.
change_speed
(
speed_rate
)
# Note that this is an in-place transformation.
# :param audio_segment: Audio segment to add effects to.
# :type audio_segment: AudioSegment|SpeechSegment
# """
# if self._num_rates < 0:
# speed_rate = self._rng.uniform(self._min_rate, self._max_rate)
# else:
# speed_rate = self._rng.choice(self._rates)
# # Skip perturbation in case of identity speed rate
# if speed_rate == 1.0:
# return
# audio_segment.change_speed(speed_rate)
deepspeech/io/collator.py
浏览文件 @
93ae5999
...
...
@@ -192,7 +192,7 @@ class SpeechCollator():
return
self
.
_local_data
.
tar2object
[
tarpath
].
extractfile
(
self
.
_local_data
.
tar2info
[
tarpath
][
filename
])
def
process_utterance
(
self
,
audio_file
,
transcript
):
def
process_utterance
(
self
,
audio_file
,
transcript
,
single
=
True
):
"""Load, augment, featurize and normalize for speech data.
:param audio_file: Filepath or file object of audio file.
...
...
@@ -214,7 +214,7 @@ class SpeechCollator():
# audio augment
start_time
=
time
.
time
()
self
.
_augmentation_pipeline
.
transform_audio
(
speech_segment
)
self
.
_augmentation_pipeline
.
transform_audio
(
speech_segment
,
single
)
audio_aug_time
=
time
.
time
()
-
start_time
#logger.debug(f"audio augmentation time: {audio_aug_time}")
...
...
@@ -228,7 +228,7 @@ class SpeechCollator():
# specgram augment
start_time
=
time
.
time
()
specgram
=
self
.
_augmentation_pipeline
.
transform_feature
(
specgram
)
specgram
=
self
.
_augmentation_pipeline
.
transform_feature
(
specgram
,
single
)
feature_aug_time
=
time
.
time
()
-
start_time
#logger.debug(f"audio feature augmentation time: {feature_aug_time}")
return
specgram
,
transcript_part
...
...
@@ -253,8 +253,14 @@ class SpeechCollator():
texts
=
[]
text_lens
=
[]
utts
=
[]
# print('----debug---')
# print(batch)
# print(type(batch))
# print(len(batch))
resample
=
True
for
utt
,
audio
,
text
in
batch
:
audio
,
text
=
self
.
process_utterance
(
audio
,
text
)
audio
,
text
=
self
.
process_utterance
(
audio
,
text
,
single
=
resample
)
# resample=False
#utt
utts
.
append
(
utt
)
# audio
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录