Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
82ca0f65
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 2 年 前同步成功
通知
210
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
82ca0f65
编写于
7月 01, 2021
作者:
H
Haoxin Ma
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix the bug of spec shape
上级
043127b6
变更
4
隐藏空白更改
内联
并排
Showing
4 changed file
with
11 addition
and
7 deletion
+11
-7
deepspeech/frontend/featurizer/audio_featurizer.py
deepspeech/frontend/featurizer/audio_featurizer.py
+7
-1
deepspeech/frontend/normalizer.py
deepspeech/frontend/normalizer.py
+2
-2
deepspeech/io/collator.py
deepspeech/io/collator.py
+1
-3
examples/aishell/s0/conf/deepspeech2.yaml
examples/aishell/s0/conf/deepspeech2.yaml
+1
-1
未找到文件。
deepspeech/frontend/featurizer/audio_featurizer.py
浏览文件 @
82ca0f65
...
@@ -175,6 +175,7 @@ class AudioFeaturizer(object):
...
@@ -175,6 +175,7 @@ class AudioFeaturizer(object):
max_freq
=
None
,
max_freq
=
None
,
eps
=
1e-14
):
eps
=
1e-14
):
"""Compute the linear spectrogram from FFT energy."""
"""Compute the linear spectrogram from FFT energy."""
# return T,D
if
max_freq
is
None
:
if
max_freq
is
None
:
max_freq
=
sample_rate
/
2
max_freq
=
sample_rate
/
2
if
max_freq
>
sample_rate
/
2
:
if
max_freq
>
sample_rate
/
2
:
...
@@ -190,8 +191,12 @@ class AudioFeaturizer(object):
...
@@ -190,8 +191,12 @@ class AudioFeaturizer(object):
window_size
=
window_size
,
window_size
=
window_size
,
stride_size
=
stride_size
,
stride_size
=
stride_size
,
sample_rate
=
sample_rate
)
sample_rate
=
sample_rate
)
ind
=
np
.
where
(
freqs
<=
max_freq
)[
0
][
-
1
]
+
1
ind
=
np
.
where
(
freqs
<=
max_freq
)[
0
][
-
1
]
+
1
return
np
.
log
(
specgram
[:
ind
,
:]
+
eps
)
specgram
=
np
.
log
(
specgram
[:
ind
,
:]
+
eps
)
specgram
=
np
.
transpose
(
specgram
)
#T,D
return
specgram
def
_specgram_real
(
self
,
samples
,
window_size
,
stride_size
,
sample_rate
):
def
_specgram_real
(
self
,
samples
,
window_size
,
stride_size
,
sample_rate
):
"""Compute the spectrogram for samples from a real signal."""
"""Compute the spectrogram for samples from a real signal."""
...
@@ -294,6 +299,7 @@ class AudioFeaturizer(object):
...
@@ -294,6 +299,7 @@ class AudioFeaturizer(object):
ceplifter
=
22
,
ceplifter
=
22
,
useEnergy
=
True
,
useEnergy
=
True
,
winfunc
=
'povey'
)
winfunc
=
'povey'
)
mfcc_feat
=
np
.
transpose
(
mfcc_feat
)
mfcc_feat
=
np
.
transpose
(
mfcc_feat
)
if
delta_delta
:
if
delta_delta
:
mfcc_feat
=
self
.
_concat_delta_delta
(
mfcc_feat
)
mfcc_feat
=
self
.
_concat_delta_delta
(
mfcc_feat
)
...
...
deepspeech/frontend/normalizer.py
浏览文件 @
82ca0f65
...
@@ -131,8 +131,8 @@ class FeatureNormalizer(object):
...
@@ -131,8 +131,8 @@ class FeatureNormalizer(object):
def
_read_mean_std_from_file
(
self
,
filepath
,
eps
=
1e-20
):
def
_read_mean_std_from_file
(
self
,
filepath
,
eps
=
1e-20
):
"""Load mean and std from file."""
"""Load mean and std from file."""
mean
,
istd
=
load_cmvn
(
filepath
,
filetype
=
'json'
)
mean
,
istd
=
load_cmvn
(
filepath
,
filetype
=
'json'
)
self
.
_mean
=
np
.
expand_dims
(
mean
,
axis
=
-
1
)
self
.
_mean
=
np
.
expand_dims
(
mean
,
axis
=
0
)
self
.
_istd
=
np
.
expand_dims
(
istd
,
axis
=
-
1
)
self
.
_istd
=
np
.
expand_dims
(
istd
,
axis
=
0
)
def
write_to_file
(
self
,
filepath
):
def
write_to_file
(
self
,
filepath
):
"""Write the mean and stddev to the file.
"""Write the mean and stddev to the file.
...
...
deepspeech/io/collator.py
浏览文件 @
82ca0f65
...
@@ -326,10 +326,8 @@ class SpeechCollator():
...
@@ -326,10 +326,8 @@ class SpeechCollator():
audio
,
text
=
self
.
process_feature_and_transform
(
audio
,
text
)
audio
,
text
=
self
.
process_feature_and_transform
(
audio
,
text
)
#utt
#utt
utts
.
append
(
utt
)
utts
.
append
(
utt
)
# audio
# print("---debug---")
# print("---debug---")
# print(audio.shape)
# print(audio.shape)
audio
=
audio
.
T
audios
.
append
(
audio
)
# [T, D]
audios
.
append
(
audio
)
# [T, D]
audio_lens
.
append
(
audio
.
shape
[
0
])
audio_lens
.
append
(
audio
.
shape
[
0
])
# text
# text
...
@@ -358,7 +356,7 @@ class SpeechCollator():
...
@@ -358,7 +356,7 @@ class SpeechCollator():
self
.
randomize_feature_parameters
(
min
(
audio_lens
),
n_bins
)
self
.
randomize_feature_parameters
(
min
(
audio_lens
),
n_bins
)
for
i
in
range
(
len
(
padded_audios
)):
for
i
in
range
(
len
(
padded_audios
)):
if
not
self
.
_randomize_each_batch
:
if
not
self
.
_randomize_each_batch
:
self
.
randomize_feature_parameters
(
n_bins
,
audio_lens
[
i
]
)
self
.
randomize_feature_parameters
(
audio_lens
[
i
],
n_bins
)
padded_audios
[
i
]
=
self
.
_augmentation_pipeline
.
apply_feature_transform
(
padded_audios
[
i
])
padded_audios
[
i
]
=
self
.
_augmentation_pipeline
.
apply_feature_transform
(
padded_audios
[
i
])
return
utts
,
padded_audios
,
audio_lens
,
padded_texts
,
text_lens
return
utts
,
padded_audios
,
audio_lens
,
padded_texts
,
text_lens
...
...
examples/aishell/s0/conf/deepspeech2.yaml
浏览文件 @
82ca0f65
...
@@ -11,7 +11,7 @@ data:
...
@@ -11,7 +11,7 @@ data:
max_output_input_ratio
:
.inf
max_output_input_ratio
:
.inf
collator
:
collator
:
batch_size
:
32
#
64 # one gpu
batch_size
:
64
# one gpu
randomize_each_batch
:
False
randomize_each_batch
:
False
mean_std_filepath
:
data/mean_std.json
mean_std_filepath
:
data/mean_std.json
unit_type
:
char
unit_type
:
char
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录