Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
42f93b2c
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 2 年 前同步成功
通知
210
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
42f93b2c
编写于
6月 15, 2021
作者:
H
Hui Zhang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
read with 2d; window process
上级
58f540c8
变更
2
隐藏空白更改
内联
并排
Showing
2 changed file
with
66 addition
and
21 deletion
+66
-21
third_party/paddle_audio/frontend/kaldi.py
third_party/paddle_audio/frontend/kaldi.py
+58
-16
third_party/paddle_audio/frontend/kaldi_test.py
third_party/paddle_audio/frontend/kaldi_test.py
+8
-5
未找到文件。
third_party/paddle_audio/frontend/kaldi.py
浏览文件 @
42f93b2c
...
@@ -9,12 +9,26 @@ import soundfile as sf
...
@@ -9,12 +9,26 @@ import soundfile as sf
from
.common
import
get_window
,
dft_matrix
from
.common
import
get_window
,
dft_matrix
def
read
(
wavpath
:
str
,
sr
:
int
=
None
,
dtype
=
'int16'
)
->
Tuple
[
int
,
np
.
ndarray
]:
def
read
(
wavpath
:
str
,
sr
:
int
=
None
,
start
=
0
,
stop
=
None
,
dtype
=
'int16'
,
always_2d
=
True
)
->
Tuple
[
int
,
np
.
ndarray
]:
wav
,
r_sr
=
sf
.
read
(
wavpath
,
dtype
=
dtype
)
"""load wav file.
Args:
wavpath (str): wav path.
sr (int, optional): expect sample rate. Defaults to None.
dtype (str, optional): wav data bits. Defaults to 'int16'.
Returns:
Tuple[int, np.ndarray]: sr (int), wav (int16) [T, C].
"""
wav
,
r_sr
=
sf
.
read
(
wavpath
,
start
=
start
,
stop
=
stop
,
dtype
=
dtype
,
always_2d
=
always_2d
)
if
sr
:
if
sr
:
assert
sr
==
r_sr
assert
sr
==
r_sr
return
r_sr
,
wav
return
r_sr
,
wav
def
write
(
wavpath
:
str
,
wav
:
np
.
ndarray
,
sr
:
int
,
dtype
=
'PCM_16'
):
sf
.
write
(
wavpath
,
wav
,
sr
,
subtype
=
dtype
)
def
frames
(
x
:
Tensor
,
def
frames
(
x
:
Tensor
,
num_samples
:
Tensor
,
num_samples
:
Tensor
,
...
@@ -68,27 +82,46 @@ def frames(x: Tensor,
...
@@ -68,27 +82,46 @@ def frames(x: Tensor,
return
frames
,
num_frames
return
frames
,
num_frames
def
dither
(
signal
,
dither_value
=
1.0
):
def
dither
(
signal
:
Tensor
,
dither_value
=
1.0
)
->
Tensor
:
signal
+=
paddle
.
normal
(
shape
=
signal
.
shape
)
*
dither_value
"""dither frames for log compute.
Args:
signal (Tensor): [B, T, D]
dither_value (float, optional): [scalar]. Defaults to 1.0.
Returns:
Tensor: [B, T, D]
"""
signal
+=
paddle
.
normal
(
shape
=
[
1
,
1
,
signal
.
shape
[
-
1
]])
*
dither_value
return
signal
return
signal
def
remove_dc_offset
(
signal
):
def
remove_dc_offset
(
signal
:
Tensor
)
->
Tensor
:
signal
-=
paddle
.
mean
(
signal
)
"""remove dc.
return
signal
Args:
signal (Tensor): [B, T, D]
Returns:
Tensor: [B, T, D]
"""
signal
-=
paddle
.
mean
(
signal
,
axis
=-
1
)
return
signal
def
preemphasis
(
signal
,
coeff
=
0.97
)
:
def
preemphasis
(
signal
:
Tensor
,
coeff
=
0.97
)
->
Tensor
:
"""perform preemphasis on the input signal.
"""perform preemphasis on the input signal.
:param signal: The signal to filter.
Args:
:param coeff: The preemphasis coefficient. 0 is no filter, default is 0.95.
signal (Tensor): [B, T, D], The signal to filter.
:returns: the filtered signal.
coeff (float, optional): [scalar].The preemphasis coefficient. 0 is no filter, Defaults to 0.97.
Returns:
Tensor: [B, T, D]
"""
"""
return
paddle
.
concat
([
return
paddle
.
concat
([
(
1
-
coeff
)
*
signal
[
0
:
1
],
(
1
-
coeff
)
*
signal
[
:,
:,
0
:
1
],
signal
[
1
:]
-
coeff
*
signal
[
:
-
1
]
signal
[
:,
:,
1
:]
-
coeff
*
signal
[:,
:,
:
-
1
]
])
]
,
axis
=-
1
)
class
STFT
(
nn
.
Layer
):
class
STFT
(
nn
.
Layer
):
...
@@ -130,16 +163,19 @@ class STFT(nn.Layer):
...
@@ -130,16 +163,19 @@ class STFT(nn.Layer):
self
.
dither
=
dither
self
.
dither
=
dither
self
.
preemph_coeff
=
preemph_coeff
self
.
preemph_coeff
=
preemph_coeff
self
.
remove_dc_offset
=
remove_dc_offset
self
.
remove_dc_offset
=
remove_dc_offset
self
.
window_type
=
window_type
self
.
clip
=
clip
self
.
clip
=
clip
self
.
n_fft
=
n_fft
self
.
n_fft
=
n_fft
self
.
n_bin
=
1
+
n_fft
//
2
self
.
n_bin
=
1
+
n_fft
//
2
w_real
,
w_imag
,
kernel_size
=
dft_matrix
(
self
.
n_fft
,
int
(
self
.
win_length
*
sr
),
self
.
n_bin
)
w_real
,
w_imag
,
kernel_size
=
dft_matrix
(
self
.
n_fft
,
int
(
self
.
win_length
*
self
.
sr
),
self
.
n_bin
)
# calculate window
# calculate window
window
=
get_window
(
window_type
,
kernel_size
)
window
=
get_window
(
window_type
,
kernel_size
)
# (2 * n_bins, kernel_size)
# (2 * n_bins, kernel_size)
w
=
np
.
concatenate
([
w_real
,
w_imag
],
axis
=
0
)
w
=
np
.
concatenate
([
w_real
,
w_imag
],
axis
=
0
)
w
=
w
*
window
w
=
w
*
window
...
@@ -166,6 +202,12 @@ class STFT(nn.Layer):
...
@@ -166,6 +202,12 @@ class STFT(nn.Layer):
"""
"""
batch_size
=
paddle
.
shape
(
num_samples
)
batch_size
=
paddle
.
shape
(
num_samples
)
F
,
nframe
=
frames
(
x
,
num_samples
,
self
.
sr
,
self
.
win_length
,
self
.
stride_length
,
clip
=
self
.
clip
)
F
,
nframe
=
frames
(
x
,
num_samples
,
self
.
sr
,
self
.
win_length
,
self
.
stride_length
,
clip
=
self
.
clip
)
if
self
.
dither
:
F
=
dither
(
F
,
dither
)
if
self
.
remove_dc_offset
:
F
=
remove_dc_offset
(
F
)
if
self
.
preemph_coeff
:
F
=
preemphasis
(
F
)
C
=
paddle
.
matmul
(
F
,
self
.
weight
)
# [B, T, K] [K, 2 * n_bins]
C
=
paddle
.
matmul
(
F
,
self
.
weight
)
# [B, T, K] [K, 2 * n_bins]
C
=
paddle
.
reshape
(
C
,
[
batch_size
,
-
1
,
2
,
self
.
n_bin
])
C
=
paddle
.
reshape
(
C
,
[
batch_size
,
-
1
,
2
,
self
.
n_bin
])
C
=
C
.
transpose
([
0
,
1
,
3
,
2
])
C
=
C
.
transpose
([
0
,
1
,
3
,
2
])
...
...
third_party/paddle_audio/frontend/kaldi_test.py
浏览文件 @
42f93b2c
...
@@ -376,11 +376,13 @@ class TestKaldiFE(unittest.TestCase):
...
@@ -376,11 +376,13 @@ class TestKaldiFE(unittest.TestCase):
import
scipy.io.wavfile
as
wav
import
scipy.io.wavfile
as
wav
rate
,
sig
=
wav
.
read
(
self
.
wavpath
)
rate
,
sig
=
wav
.
read
(
self
.
wavpath
)
sr
,
wav
=
kaldi
.
read
(
self
.
wavpath
)
sr
,
wav
=
kaldi
.
read
(
self
.
wavpath
)
wav
=
wav
[:,
0
]
self
.
assertTrue
(
np
.
all
(
sig
==
wav
))
self
.
assertTrue
(
np
.
all
(
sig
==
wav
))
self
.
assertEqual
(
rate
,
sr
)
self
.
assertEqual
(
rate
,
sr
)
def
test_frames
(
self
):
def
test_frames
(
self
):
sr
,
wav
=
kaldi
.
read
(
self
.
wavpath
)
sr
,
wav
=
kaldi
.
read
(
self
.
wavpath
)
wav
=
wav
[:,
0
]
_
,
fs
=
frames
(
wav
,
samplerate
=
sr
,
_
,
fs
=
frames
(
wav
,
samplerate
=
sr
,
winlen
=
self
.
winlen
,
winstep
=
self
.
winstep
,
winlen
=
self
.
winlen
,
winstep
=
self
.
winstep
,
nfilt
=
self
.
nfilt
,
nfft
=
self
.
nfft
,
nfilt
=
self
.
nfilt
,
nfft
=
self
.
nfft
,
...
@@ -397,6 +399,7 @@ class TestKaldiFE(unittest.TestCase):
...
@@ -397,6 +399,7 @@ class TestKaldiFE(unittest.TestCase):
def
test_stft
(
self
):
def
test_stft
(
self
):
sr
,
wav
=
kaldi
.
read
(
self
.
wavpath
)
sr
,
wav
=
kaldi
.
read
(
self
.
wavpath
)
wav
=
wav
[:,
0
]
for
wintype
in
[
''
,
'hamm'
,
'hann'
,
'povey'
]:
for
wintype
in
[
''
,
'hamm'
,
'hann'
,
'povey'
]:
print
(
wintype
)
print
(
wintype
)
...
@@ -412,7 +415,7 @@ class TestKaldiFE(unittest.TestCase):
...
@@ -412,7 +415,7 @@ class TestKaldiFE(unittest.TestCase):
t_wav
=
paddle
.
to_tensor
([
wav
],
dtype
=
'float32'
)
t_wav
=
paddle
.
to_tensor
([
wav
],
dtype
=
'float32'
)
t_wavlen
=
paddle
.
to_tensor
([
len
(
wav
)])
t_wavlen
=
paddle
.
to_tensor
([
len
(
wav
)])
stft_class
=
kaldi
.
STFT
(
self
.
nfft
,
sr
,
self
.
winlen
,
self
.
winstep
,
window_type
=
self
.
wintype
,
clip
=
False
)
stft_class
=
kaldi
.
STFT
(
self
.
nfft
,
sr
,
self
.
winlen
,
self
.
winstep
,
window_type
=
self
.
wintype
,
dither
=
0.0
,
preemph_coeff
=
0.0
,
remove_dc_offset
=
False
,
clip
=
False
)
t_stft
,
t_nframe
=
stft_class
(
t_wav
,
t_wavlen
)
t_stft
,
t_nframe
=
stft_class
(
t_wav
,
t_wavlen
)
t_stft
=
t_stft
.
astype
(
stft_c_win
.
real
.
dtype
)[
0
]
t_stft
=
t_stft
.
astype
(
stft_c_win
.
real
.
dtype
)[
0
]
t_real
=
t_stft
[:,
:,
0
]
t_real
=
t_stft
[:,
:,
0
]
...
@@ -434,7 +437,7 @@ class TestKaldiFE(unittest.TestCase):
...
@@ -434,7 +437,7 @@ class TestKaldiFE(unittest.TestCase):
def
test_magspec
(
self
):
def
test_magspec
(
self
):
sr
,
wav
=
kaldi
.
read
(
self
.
wavpath
)
sr
,
wav
=
kaldi
.
read
(
self
.
wavpath
)
wav
=
wav
[:,
0
]
for
wintype
in
[
''
,
'hamm'
,
'hann'
,
'povey'
]:
for
wintype
in
[
''
,
'hamm'
,
'hann'
,
'povey'
]:
print
(
wintype
)
print
(
wintype
)
self
.
wintype
=
wintype
self
.
wintype
=
wintype
...
@@ -448,7 +451,7 @@ class TestKaldiFE(unittest.TestCase):
...
@@ -448,7 +451,7 @@ class TestKaldiFE(unittest.TestCase):
t_wav
=
paddle
.
to_tensor
([
wav
],
dtype
=
'float32'
)
t_wav
=
paddle
.
to_tensor
([
wav
],
dtype
=
'float32'
)
t_wavlen
=
paddle
.
to_tensor
([
len
(
wav
)])
t_wavlen
=
paddle
.
to_tensor
([
len
(
wav
)])
stft_class
=
kaldi
.
STFT
(
self
.
nfft
,
sr
,
self
.
winlen
,
self
.
winstep
,
window_type
=
self
.
wintype
,
clip
=
False
)
stft_class
=
kaldi
.
STFT
(
self
.
nfft
,
sr
,
self
.
winlen
,
self
.
winstep
,
window_type
=
self
.
wintype
,
dither
=
0.0
,
preemph_coeff
=
0.0
,
remove_dc_offset
=
False
,
clip
=
False
)
t_stft
,
t_nframe
=
stft_class
(
t_wav
,
t_wavlen
)
t_stft
,
t_nframe
=
stft_class
(
t_wav
,
t_wavlen
)
t_stft
=
t_stft
.
astype
(
stft_win
.
dtype
)
t_stft
=
t_stft
.
astype
(
stft_win
.
dtype
)
t_spec
=
kaldi
.
magspec
(
t_stft
)[
0
]
t_spec
=
kaldi
.
magspec
(
t_stft
)[
0
]
...
@@ -463,7 +466,7 @@ class TestKaldiFE(unittest.TestCase):
...
@@ -463,7 +466,7 @@ class TestKaldiFE(unittest.TestCase):
def
test_powspec
(
self
):
def
test_powspec
(
self
):
sr
,
wav
=
kaldi
.
read
(
self
.
wavpath
)
sr
,
wav
=
kaldi
.
read
(
self
.
wavpath
)
wav
=
wav
[:,
0
]
for
wintype
in
[
''
,
'hamm'
,
'hann'
,
'povey'
]:
for
wintype
in
[
''
,
'hamm'
,
'hann'
,
'povey'
]:
print
(
wintype
)
print
(
wintype
)
self
.
wintype
=
wintype
self
.
wintype
=
wintype
...
@@ -478,7 +481,7 @@ class TestKaldiFE(unittest.TestCase):
...
@@ -478,7 +481,7 @@ class TestKaldiFE(unittest.TestCase):
t_wav
=
paddle
.
to_tensor
([
wav
],
dtype
=
'float32'
)
t_wav
=
paddle
.
to_tensor
([
wav
],
dtype
=
'float32'
)
t_wavlen
=
paddle
.
to_tensor
([
len
(
wav
)])
t_wavlen
=
paddle
.
to_tensor
([
len
(
wav
)])
stft_class
=
kaldi
.
STFT
(
self
.
nfft
,
sr
,
self
.
winlen
,
self
.
winstep
,
window_type
=
self
.
wintype
,
clip
=
False
)
stft_class
=
kaldi
.
STFT
(
self
.
nfft
,
sr
,
self
.
winlen
,
self
.
winstep
,
window_type
=
self
.
wintype
,
dither
=
0.0
,
preemph_coeff
=
0.0
,
remove_dc_offset
=
False
,
clip
=
False
)
t_stft
,
t_nframe
=
stft_class
(
t_wav
,
t_wavlen
)
t_stft
,
t_nframe
=
stft_class
(
t_wav
,
t_wavlen
)
t_stft
=
t_stft
.
astype
(
stft_win
.
dtype
)
t_stft
=
t_stft
.
astype
(
stft_win
.
dtype
)
t_spec
=
kaldi
.
powspec
(
t_stft
)[
0
]
t_spec
=
kaldi
.
powspec
(
t_stft
)[
0
]
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录