Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
8b0e344c
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
1 年多 前同步成功
通知
207
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
8b0e344c
编写于
11月 08, 2021
作者:
H
Hui Zhang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix logfbank using PCM16
上级
d62092ac
变更
4
隐藏空白更改
内联
并排
Showing
4 changed files
with
76 additions
and
36 deletions
+76
-36
examples/librispeech/s1/conf/preprocess.yaml
examples/librispeech/s1/conf/preprocess.yaml
+0
-4
paddlespeech/s2t/frontend/audio.py
paddlespeech/s2t/frontend/audio.py
+4
-26
paddlespeech/s2t/frontend/utility.py
paddlespeech/s2t/frontend/utility.py
+50
-1
paddlespeech/s2t/transform/spectrogram.py
paddlespeech/s2t/transform/spectrogram.py
+22
-5
未找到文件。
examples/librispeech/s1/conf/preprocess.yaml
浏览文件 @
8b0e344c
...
...
@@ -23,7 +23,3 @@ process:
n_mask
:
2
inplace
:
true
replace_with_zero
:
true
paddlespeech/s2t/frontend/audio.py
浏览文件 @
8b0e344c
...
...
@@ -25,6 +25,8 @@ import soxbindings as sox
from
scipy
import
signal
from
.utility
import
subfile_from_tar
from
.utility
import
convert_samples_to_float32
from
.utility
import
convert_samples_from_float32
class
AudioSegment
():
...
...
@@ -689,15 +691,7 @@ class AudioSegment():
Audio sample type is usually integer or float-point.
Integers will be scaled to [-1, 1] in float32.
"""
float32_samples
=
samples
.
astype
(
'float32'
)
if
samples
.
dtype
in
np
.
sctypes
[
'int'
]:
bits
=
np
.
iinfo
(
samples
.
dtype
).
bits
float32_samples
*=
(
1.
/
2
**
(
bits
-
1
))
elif
samples
.
dtype
in
np
.
sctypes
[
'float'
]:
pass
else
:
raise
TypeError
(
"Unsupported sample type: %s."
%
samples
.
dtype
)
return
float32_samples
return
convert_samples_to_float32
(
samples
)
def
_convert_samples_from_float32
(
self
,
samples
,
dtype
):
"""Convert sample type from float32 to dtype.
...
...
@@ -708,20 +702,4 @@ class AudioSegment():
This is for writing a audio file.
"""
dtype
=
np
.
dtype
(
dtype
)
output_samples
=
samples
.
copy
()
if
dtype
in
np
.
sctypes
[
'int'
]:
bits
=
np
.
iinfo
(
dtype
).
bits
output_samples
*=
(
2
**
(
bits
-
1
)
/
1.
)
min_val
=
np
.
iinfo
(
dtype
).
min
max_val
=
np
.
iinfo
(
dtype
).
max
output_samples
[
output_samples
>
max_val
]
=
max_val
output_samples
[
output_samples
<
min_val
]
=
min_val
elif
samples
.
dtype
in
np
.
sctypes
[
'float'
]:
min_val
=
np
.
finfo
(
dtype
).
min
max_val
=
np
.
finfo
(
dtype
).
max
output_samples
[
output_samples
>
max_val
]
=
max_val
output_samples
[
output_samples
<
min_val
]
=
min_val
else
:
raise
TypeError
(
"Unsupported sample type: %s."
%
samples
.
dtype
)
return
output_samples
.
astype
(
dtype
)
return
convert_samples_from_float32
(
samples
,
dtype
)
paddlespeech/s2t/frontend/utility.py
浏览文件 @
8b0e344c
...
...
@@ -30,7 +30,8 @@ logger = Log(__name__).getlog()
__all__
=
[
"load_dict"
,
"load_cmvn"
,
"read_manifest"
,
"rms_to_db"
,
"rms_to_dbfs"
,
"max_dbfs"
,
"mean_dbfs"
,
"gain_db_to_ratio"
,
"normalize_audio"
,
"SOS"
,
"EOS"
,
"UNK"
,
"BLANK"
,
"MASKCTC"
,
"SPACE"
"EOS"
,
"UNK"
,
"BLANK"
,
"MASKCTC"
,
"SPACE"
,
"convert_samples_to_float32"
,
"convert_samples_from_float32"
]
IGNORE_ID
=
-
1
...
...
@@ -342,3 +343,51 @@ def load_cmvn(cmvn_file: str, filetype: str):
else
:
raise
ValueError
(
f
"cmvn file type no support:
{
filetype
}
"
)
return
cmvn
[
0
],
cmvn
[
1
]
def convert_samples_to_float32(samples):
    """Convert audio samples to float32.

    Audio sample type is usually integer or floating-point. Signed integer
    samples are scaled by ``1 / 2**(bits - 1)`` so that they land in
    [-1, 1); floating-point samples are only cast to float32.

    PCM16 -> PCM32

    :param samples: Audio samples with a signed-integer or float dtype.
    :type samples: ndarray
    :return: A new float32 array; the input is left untouched.
    :rtype: ndarray
    :raises TypeError: If the sample dtype is neither a signed integer
                       nor a floating-point type.
    """
    float32_samples = samples.astype('float32')
    # `np.sctypes` was removed in NumPy 2.0; `np.issubdtype` accepts the same
    # dtypes (signed integers / floats) on both old and new NumPy releases.
    if np.issubdtype(samples.dtype, np.signedinteger):
        bits = np.iinfo(samples.dtype).bits
        float32_samples *= (1. / 2**(bits - 1))
    elif np.issubdtype(samples.dtype, np.floating):
        # Already a float type: the cast above is all that is needed.
        pass
    else:
        raise TypeError("Unsupported sample type: %s." % samples.dtype)
    return float32_samples
def convert_samples_from_float32(samples, dtype):
    """Convert samples from float32 to the requested dtype.

    Audio sample type is usually integer or floating-point. For a signed
    integer target, samples are rescaled from [-1, 1] to the full range of
    that integer type and clipped to its limits. For a floating-point
    target, samples are clipped to the finite range of that type and cast.

    PCM32 -> PCM16

    :param samples: Audio samples, nominally float32 in [-1, 1].
    :type samples: ndarray
    :param dtype: Target sample type (anything accepted by ``np.dtype``).
    :type dtype: str|numpy.dtype|type
    :return: A new array of the target dtype; the input is left untouched.
    :rtype: ndarray
    :raises TypeError: If the target dtype is neither a signed integer
                       nor a floating-point type.
    """
    dtype = np.dtype(dtype)
    # `np.sctypes` was removed in NumPy 2.0; `np.issubdtype` accepts the same
    # dtypes (signed integers / floats) on both old and new NumPy releases.
    if np.issubdtype(dtype, np.signedinteger):
        info = np.iinfo(dtype)
        # Scale in float64: float32 cannot represent the int32 upper bound,
        # so clipping in float32 lets +1.0 wrap around to the minimum.
        scaled = np.asarray(samples, dtype=np.float64) * 2.0**(info.bits - 1)
        lower, upper = float(info.min), float(info.max)
        if upper > info.max:
            # int64: float(max) rounds up to 2**63, which would overflow on
            # the final cast; step down to the largest float64 below it.
            upper = np.nextafter(upper, 0.0)
        output_samples = np.clip(scaled, lower, upper)
    elif np.issubdtype(dtype, np.floating):
        # The branch is selected by the *target* dtype, not the input dtype.
        info = np.finfo(dtype)
        output_samples = np.clip(samples, info.min, info.max)
    else:
        raise TypeError("Unsupported sample type: %s." % dtype)
    return output_samples.astype(dtype)
paddlespeech/s2t/transform/spectrogram.py
浏览文件 @
8b0e344c
...
...
@@ -307,6 +307,9 @@ class IStft():
center
=
self
.
center
,
)
from
paddlespeech.s2t.utils.log
import
Log
logger
=
Log
(
__name__
).
getlog
()
class
LogMelSpectrogramKaldi
():
def
__init__
(
self
,
...
...
@@ -346,7 +349,7 @@ class LogMelSpectrogramKaldi():
def
__repr__
(
self
):
return
(
"{name}(fs={fs}, n_mels={n_mels}, n_fft={n_fft}, "
"n_shift={n_shift}, win_length={win_length}, window={window}, "
"fmin={fmin}, fmax={fmax}, eps={eps}))"
.
format
(
"fmin={fmin}, fmax={fmax}, eps={eps}
, preemph={preemph}, window={window}, dither={dither}
))"
.
format
(
name
=
self
.
__class__
.
__name__
,
fs
=
self
.
fs
,
n_mels
=
self
.
n_mels
,
...
...
@@ -356,7 +359,10 @@ class LogMelSpectrogramKaldi():
window
=
self
.
window
,
fmin
=
self
.
fmin
,
fmax
=
self
.
fmax
,
eps
=
self
.
eps
,
))
eps
=
self
.
eps
,
preemph
=
self
.
preemph
,
window
=
self
.
window
,
dither
=
self
.
dither
))
def
__call__
(
self
,
x
):
"""
...
...
@@ -372,9 +378,16 @@ class LogMelSpectrogramKaldi():
"""
if
x
.
ndim
!=
1
:
raise
ValueError
(
"Not support x: [Time, Channel]"
)
if
x
.
dtype
==
np
.
int16
:
x
=
x
/
2
**
(
16
-
1
)
return
logfbank
(
logger
.
info
(
f
"in
{
x
}
"
)
if
x
.
dtype
in
np
.
sctypes
[
'float'
]:
# PCM32 -> PCM16
bits
=
np
.
iinfo
(
np
.
int16
).
bits
x
=
x
*
2
**
(
bits
-
1
)
logger
.
info
(
f
"b
{
x
}
"
)
# logfbank need PCM16 input
y
=
logfbank
(
signal
=
x
,
samplerate
=
self
.
fs
,
winlen
=
self
.
win_length
,
# unit ms
...
...
@@ -387,3 +400,7 @@ class LogMelSpectrogramKaldi():
remove_dc_offset
=
self
.
remove_dc_offset
,
preemph
=
self
.
preemph
,
wintype
=
self
.
window
)
logger
.
info
(
f
"a
{
y
}
"
)
return
y
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录