Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
ae566f66
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 2 年 前同步成功
通知
210
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
ae566f66
编写于
6月 25, 2021
作者:
H
Haoxin Ma
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
separate
上级
60ac4bc2
变更
3
显示空白变更内容
内联
并排
Showing
3 changed file
with
121 addition
and
21 deletion
+121
-21
deepspeech/frontend/augmentor/augmentation.py
deepspeech/frontend/augmentor/augmentation.py
+49
-4
deepspeech/frontend/augmentor/spec_augment.py
deepspeech/frontend/augmentor/spec_augment.py
+3
-3
deepspeech/io/collator.py
deepspeech/io/collator.py
+69
-14
未找到文件。
deepspeech/frontend/augmentor/augmentation.py
浏览文件 @
ae566f66
...
...
@@ -93,7 +93,29 @@ class AugmentationPipeline():
self
.
_spec_augmentors
,
self
.
_spec_rates
=
self
.
_parse_pipeline_from
(
augmentation_config
,
'feature'
)
def
transform_audio
(
self
,
audio_segment
,
single
=
True
):
def randomize_parameters_audio_transform(self):
    """Ask every audio augmentor to randomize its parameters.

    Calls ``randomize_parameters()`` with no arguments on each augmentor in
    ``self._augmentors``. No audio is passed in or transformed by this
    method.
    """
    # The rate paired with each augmentor is not consulted here. zip() is
    # kept so only augmentors that have a matching rate entry are visited.
    for augmentor, _rate in zip(self._augmentors, self._rates):
        augmentor.randomize_parameters()
def randomize_parameters_feature_transform(self, audio, *args):
    """Ask every feature (spectrogram) augmentor to randomize its parameters.

    Iterates ``self._spec_augmentors`` (the same list the feature-apply step
    uses) rather than the audio augmentors, whose ``randomize_parameters``
    takes no arguments.

    :param audio: First positional argument forwarded to each augmentor's
        ``randomize_parameters``.
    :param args: Further positional arguments, forwarded unchanged. This lets
        a caller pass ``(n_bins, n_frames)`` while single-argument callers
        keep working.
    """
    # The rate paired with each augmentor is not consulted here. zip() is
    # kept so only augmentors that have a matching rate entry are visited.
    for augmentor, _rate in zip(self._spec_augmentors, self._spec_rates):
        augmentor.randomize_parameters(audio, *args)
def
apply_audio_transform
(
self
,
audio_segment
):
"""Run the pre-processing pipeline for data augmentation.
Note that this is an in-place transformation.
...
...
@@ -103,9 +125,9 @@ class AugmentationPipeline():
"""
for
augmentor
,
rate
in
zip
(
self
.
_augmentors
,
self
.
_rates
):
if
self
.
_rng
.
uniform
(
0.
,
1.
)
<
rate
:
augmentor
.
transform_audio
(
audio_segment
,
single
)
augmentor
.
apply
(
audio_segment
)
def
transform_feature
(
self
,
spec_segment
,
single
=
True
):
def
apply_feature_transform
(
self
,
spec_segment
):
"""spectrogram augmentation.
Args:
...
...
@@ -113,9 +135,32 @@ class AugmentationPipeline():
"""
for
augmentor
,
rate
in
zip
(
self
.
_spec_augmentors
,
self
.
_spec_rates
):
if
self
.
_rng
.
uniform
(
0.
,
1.
)
<
rate
:
spec_segment
=
augmentor
.
transform_feature
(
spec_segment
,
single
)
spec_segment
=
augmentor
.
apply
(
spec_segment
)
return
spec_segment
# def transform_audio(self, audio_segment, single=True):
# """Run the pre-processing pipeline for data augmentation.
# Note that this is an in-place transformation.
# :param audio_segment: Audio segment to process.
# :type audio_segment: AudioSegment|SpeechSegment
# """
# for augmentor, rate in zip(self._augmentors, self._rates):
# if self._rng.uniform(0., 1.) < rate:
# augmentor.transform_audio(audio_segment, single)
# def transform_feature(self, spec_segment, single=True):
# """spectrogram augmentation.
# Args:
# spec_segment (np.ndarray): audio feature, (D, T).
# """
# for augmentor, rate in zip(self._spec_augmentors, self._spec_rates):
# if self._rng.uniform(0., 1.) < rate:
# spec_segment = augmentor.transform_feature(spec_segment, single)
# return spec_segment
def
_parse_pipeline_from
(
self
,
config_json
,
aug_type
=
'audio'
):
"""Parse the config json to build a augmentation pipelien."""
assert
aug_type
in
(
'audio'
,
'feature'
),
aug_type
...
...
deepspeech/frontend/augmentor/spec_augment.py
浏览文件 @
ae566f66
...
...
@@ -124,9 +124,9 @@ class SpecAugmentor(AugmentorBase):
def
time_warp
(
xs
,
W
=
40
):
raise
NotImplementedError
def
randomize_parameters
(
self
,
xs
):
n_bins
=
xs
.
shape
[
0
]
n_frames
=
xs
.
shape
[
1
]
def
randomize_parameters
(
self
,
n_bins
,
n_frame
):
# n_bins = xs.shape[0]
# n_frames = xs.shape[1]
self
.
f
=
[]
self
.
f_0
=
[]
...
...
deepspeech/io/collator.py
浏览文件 @
ae566f66
...
...
@@ -215,7 +215,21 @@ class SpeechCollator():
return
self
.
_local_data
.
tar2object
[
tarpath
].
extractfile
(
self
.
_local_data
.
tar2info
[
tarpath
][
filename
])
def
process_utterance
(
self
,
audio_file
,
transcript
,
single
=
True
):
def load_audio(self, audio_file, transcript):
    """Build a ``SpeechSegment`` from an audio source and its transcript.

    :param audio_file: Audio source. A ``str`` beginning with ``'tar:'`` is
        first resolved through ``self._subfile_from_tar``; anything else is
        handed to ``SpeechSegment.from_file`` as-is.
    :param transcript: Transcript passed through to ``SpeechSegment.from_file``.
    :return: The value returned by ``SpeechSegment.from_file``.
    """
    source = audio_file
    if isinstance(audio_file, str) and audio_file.startswith('tar:'):
        # Swap the 'tar:' reference for whatever _subfile_from_tar resolves it to.
        source = self._subfile_from_tar(audio_file)
    return SpeechSegment.from_file(source, transcript)
def randomize_audio_parameters(self):
    """Ask the augmentation pipeline to randomize its audio augmentors.

    Delegates to
    ``self._augmentation_pipeline.randomize_parameters_audio_transform()``.
    The method name was previously misspelled (missing leading ``r``), which
    raised ``AttributeError`` on every call.
    """
    self._augmentation_pipeline.randomize_parameters_audio_transform()
def randomize_feature_parameters(self, n_bins, n_frames):
    """Ask the augmentation pipeline to randomize its feature augmentors.

    Delegates to
    ``self._augmentation_pipeline.randomize_parameters_feature_transform``.
    The method name was previously misspelled (missing leading ``r``), which
    raised ``AttributeError`` on every call.

    :param n_bins: Forwarded as the first positional argument.
    :param n_frames: Forwarded as the second positional argument.
    """
    # NOTE(review): both values are passed positionally; confirm the pipeline
    # method's signature accepts two arguments.
    self._augmentation_pipeline.randomize_parameters_feature_transform(
        n_bins, n_frames)
def
process_utterance
(
self
,
audio_file
,
transcript
):
"""Load, augment, featurize and normalize for speech data.
:param audio_file: Filepath or file object of audio file.
...
...
@@ -226,25 +240,56 @@ class SpeechCollator():
where transcription part could be token ids or text.
:rtype: tuple of (2darray, list)
"""
if
isinstance
(
audio_file
,
str
)
and
audio_file
.
startswith
(
'tar:'
):
speech_segment
=
SpeechSegment
.
from_file
(
self
.
_subfile_from_tar
(
audio_file
),
transcript
)
else
:
speech_segment
=
SpeechSegment
.
from_file
(
audio_file
,
transcript
)
speech_segment
=
self
.
load_audio
(
audio_file
,
transcript
)
# audio augment
self
.
_augmentation_pipeline
.
transform_audio
(
speech_segment
)
# apply audio augment
self
.
_augmentation_pipeline
.
apply_audio_transform
(
speech_segment
)
# Spectrum transform
specgram
,
transcript_part
=
self
.
_speech_featurizer
.
featurize
(
speech_segment
,
self
.
_keep_transcription_text
)
if
self
.
_normalizer
:
specgram
=
self
.
_normalizer
.
apply
(
specgram
)
# specgram augment
specgram
=
self
.
_augmentation_pipeline
.
transform_feature
(
specgram
)
# # apply specgram augment
# specgram = self._augmentation_pipeline.apply_feature_transform(specgram)
return
specgram
,
transcript_part
# def process_utterance(self, audio_file, transcript, single=True):
# """Load, augment, featurize and normalize for speech data.
# :param audio_file: Filepath or file object of audio file.
# :type audio_file: str | file
# :param transcript: Transcription text.
# :type transcript: str
# :return: Tuple of audio feature tensor and data of transcription part,
# where transcription part could be token ids or text.
# :rtype: tuple of (2darray, list)
# """
# if isinstance(audio_file, str) and audio_file.startswith('tar:'):
# speech_segment = SpeechSegment.from_file(
# self._subfile_from_tar(audio_file), transcript)
# else:
# speech_segment = SpeechSegment.from_file(audio_file, transcript)
# # audio augment
# self._augmentation_pipeline.transform_audio(speech_segment)
# # Spectrum transform
# specgram, transcript_part = self._speech_featurizer.featurize(
# speech_segment, self._keep_transcription_text)
# if self._normalizer:
# specgram = self._normalizer.apply(specgram)
# # specgram augment
# specgram = self._augmentation_pipeline.transform_feature(specgram)
# return specgram, transcript_part
def
__call__
(
self
,
batch
):
"""batch examples
...
...
@@ -269,10 +314,11 @@ class SpeechCollator():
# print(batch)
# print(type(batch))
# print(len(batch))
resample
=
True
self
.
randomize_audio_parameters
()
for
utt
,
audio
,
text
in
batch
:
audio
,
text
=
self
.
process_utterance
(
audio
,
text
,
single
=
resample
)
# resample=False
if
not
self
.
config
.
randomize_each_batch
:
self
.
randomize_audio_parameters
()
audio
,
text
=
self
.
process_utterance
(
audio
,
text
)
#utt
utts
.
append
(
utt
)
# audio
...
...
@@ -298,6 +344,15 @@ class SpeechCollator():
padded_texts
=
pad_sequence
(
texts
,
padding_value
=
IGNORE_ID
).
astype
(
np
.
int64
)
text_lens
=
np
.
array
(
text_lens
).
astype
(
np
.
int64
)
#spec augment
n_bins
=
padded_audios
[
0
]
self
.
randomize_feature_parameters
(
n_bins
,
min
(
audio_lens
))
for
i
in
range
(
len
(
padded_audios
)):
if
not
self
.
config
.
randomize_each_batch
:
self
.
randomize_feature_parameters
(
n_bins
,
audio_lens
[
i
])
padded_audios
[
i
]
=
self
.
_augmentation_pipeline
.
apply_feature_transform
(
padded_audios
[
i
])
return
utts
,
padded_audios
,
audio_lens
,
padded_texts
,
text_lens
@
property
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录