Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
b98c7cd1
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 2 年 前同步成功
通知
210
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
b98c7cd1
编写于
6月 15, 2021
作者:
H
Hui Zhang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
can do frames, real stft
上级
a8448714
变更
5
显示空白变更内容
内联
并排
Showing
5 changed file
with
485 addition
and
0 deletion
+485
-0
third_party/__init__.py
third_party/__init__.py
+0
-0
third_party/paddle_audio/__init__.py
third_party/paddle_audio/__init__.py
+0
-0
third_party/paddle_audio/frontend/english.wav
third_party/paddle_audio/frontend/english.wav
+0
-0
third_party/paddle_audio/frontend/kaldi.py
third_party/paddle_audio/frontend/kaldi.py
+188
-0
third_party/paddle_audio/frontend/kaldi_test.py
third_party/paddle_audio/frontend/kaldi_test.py
+297
-0
未找到文件。
third_party/__init__.py
0 → 100644
浏览文件 @
b98c7cd1
third_party/paddle_audio/__init__.py
0 → 100644
浏览文件 @
b98c7cd1
third_party/paddle_audio/frontend/english.wav
0 → 100644
浏览文件 @
b98c7cd1
文件已添加
third_party/paddle_audio/frontend.py
→
third_party/paddle_audio/frontend
/kaldi
.py
浏览文件 @
b98c7cd1
...
@@ -4,25 +4,35 @@ import paddle
...
@@ -4,25 +4,35 @@ import paddle
from
paddle
import
Tensor
from
paddle
import
Tensor
from
paddle
import
nn
from
paddle
import
nn
from
paddle.nn
import
functional
as
F
from
paddle.nn
import
functional
as
F
import
soundfile
as
sf
def read(wavpath: str, sr: int = None,
         dtype='int16') -> Tuple[int, np.ndarray]:
    """Load an audio file with ``soundfile``.

    Parameters
    ----------
    wavpath : str
        Path handed to ``sf.read``.
    sr : int, optional
        Expected sampling rate. When truthy, it is asserted to equal the
        rate reported for the file.
    dtype : str, optional
        Sample dtype forwarded to ``sf.read``, by default 'int16'.

    Returns
    -------
    Tuple[int, np.ndarray]
        ``(sampling_rate, samples)`` as reported by ``sf.read``.
    """
    samples, file_sr = sf.read(wavpath, dtype=dtype)
    if sr:
        assert sr == file_sr
    return file_sr, samples
def
frame
(
x
:
Tensor
,
def
frames
(
x
:
Tensor
,
num_samples
:
Tensor
,
num_samples
:
Tensor
,
win_length
:
int
,
sr
:
int
,
hop_length
:
int
,
win_length
:
float
,
clip
:
bool
=
True
)
->
Tuple
[
Tensor
,
Tensor
]:
stride_length
:
float
,
clip
:
bool
=
False
)
->
Tuple
[
Tensor
,
Tensor
]:
"""Extract frames from audio.
"""Extract frames from audio.
Parameters
Parameters
----------
----------
x : Tensor
x : Tensor
Shape (
N
, T), batched waveform.
Shape (
B
, T), batched waveform.
num_samples : Tensor
num_samples : Tensor
Shape (N, ), number of samples of each waveform.
Shape (B, ), number of samples of each waveform.
win_length : int
sr: int
Window length.
Sampling Rate.
hop_length : int
win_length : float
Number of samples shifted between ajancent frames.
Window length in ms.
stride_length : float
Stride length in ms.
clip : bool, optional
clip : bool, optional
Whether to clip audio that does not fit into the last frame, by
Whether to clip audio that does not fit into the last frame, by
default True
default True
...
@@ -30,40 +40,55 @@ def frame(x: Tensor,
...
@@ -30,40 +40,55 @@ def frame(x: Tensor,
Returns
Returns
-------
-------
frames : Tensor
frames : Tensor
Shape (
N
, T', win_length).
Shape (
B
, T', win_length).
num_frames : Tensor
num_frames : Tensor
Shape (
N
, ) number of valid frames
Shape (
B
, ) number of valid frames
"""
"""
assert
hop_length
<=
win_length
assert
stride_length
<=
win_length
num_frames
=
(
num_samples
-
win_length
)
//
hop_length
stride_length
=
int
(
stride_length
*
sr
)
win_length
=
int
(
win_length
*
sr
)
num_frames
=
(
num_samples
-
win_length
)
//
stride_length
padding
=
(
0
,
0
)
padding
=
(
0
,
0
)
if
not
clip
:
if
not
clip
:
num_frames
+=
1
num_frames
+=
1
# NOTE: pad hop_length - 1 to the right to ensure that there is at most
need_samples
=
num_frames
*
stride_length
+
win_length
# one frame dangling to the righe edge
padding
=
(
0
,
need_samples
-
num_samples
-
1
)
padding
=
(
0
,
hop_length
-
1
)
weight
=
paddle
.
eye
(
win_length
).
unsqueeze
(
1
)
weight
=
paddle
.
eye
(
win_length
).
unsqueeze
(
1
)
#[win_length, 1, win_length]
frames
=
F
.
conv1d
(
x
.
unsqueeze
(
1
),
frames
=
F
.
conv1d
(
x
.
unsqueeze
(
-
1
),
weight
,
weight
,
padding
=
padding
,
padding
=
padding
,
stride
=
(
hop_length
,
))
stride
=
(
stride_length
,
),
data_format
=
'NLC'
)
return
frames
,
num_frames
return
frames
,
num_frames
def
_povey_window
(
frame_len
:
int
)
->
np
.
ndarray
:
win
=
np
.
empty
(
frame_len
)
for
i
in
range
(
frame_len
):
win
[
i
]
=
(
0.5
-
0.5
*
np
.
cos
(
2
*
np
.
pi
*
i
/
(
frame_len
-
1
))
)
**
0.85
return
win
class
STFT
(
nn
.
Layer
):
class
STFT
(
nn
.
Layer
):
"""A module for computing stft transformation in a differentiable way.
"""A module for computing stft transformation in a differentiable way.
http://practicalcryptography.com/miscellaneous/machine-learning/intuitive-guide-discrete-fourier-transform/
Parameters
Parameters
------------
------------
n_fft : int
n_fft : int
Number of samples in a frame.
Number of samples in a frame.
hop_length : int
sr: int
Number of Samplilng rate.
stride_length : float
Number of samples shifted between adjacent frames.
Number of samples shifted between adjacent frames.
win_length :
in
t
win_length :
floa
t
Length of the window.
Length of the window.
clip: bool
clip: bool
...
@@ -71,40 +96,56 @@ class STFT(nn.Layer):
...
@@ -71,40 +96,56 @@ class STFT(nn.Layer):
"""
"""
def
__init__
(
self
,
def
__init__
(
self
,
n_fft
:
int
,
n_fft
:
int
,
hop_length
:
int
,
sr
:
int
,
win_length
:
int
,
win_length
:
float
,
stride_length
:
float
,
window_type
:
str
=
None
,
window_type
:
str
=
None
,
clip
:
bool
=
Tru
e
):
clip
:
bool
=
Fals
e
):
super
().
__init__
()
super
().
__init__
()
self
.
sr
=
sr
self
.
win_length
=
int
(
win_length
*
sr
)
self
.
stride_length
=
int
(
stride_length
*
sr
)
self
.
clip
=
clip
self
.
hop_length
=
hop_length
self
.
n_bin
=
1
+
n_fft
//
2
self
.
n_fft
=
n_fft
self
.
n_fft
=
n_fft
self
.
clip
=
clip
self
.
n_bin
=
1
+
n_fft
//
2
# https://github.com/numpy/numpy/blob/v1.20.0/numpy/fft/_pocketfft.py#L49
kernel_size
=
min
(
self
.
n_fft
,
self
.
win_length
)
# calculate window
# calculate window
if
window_type
is
Non
e
:
if
not
window_typ
e
:
window
=
np
.
ones
(
win_length
)
window
=
np
.
ones
(
kernel_size
)
elif
window_type
==
"hann"
:
elif
window_type
==
"hann"
:
window
=
np
.
hanning
(
win_length
)
window
=
np
.
hanning
(
kernel_size
)
elif
window_type
==
"hamming"
:
elif
window_type
==
"hamm"
:
window
=
np
.
hamming
(
win_length
)
window
=
np
.
hamming
(
kernel_size
)
elif
window_type
==
"povey"
:
window
=
_povey_window
(
kernel_size
)
else
:
else
:
raise
ValueError
(
"Not supported yet!"
)
msg
=
f
"
{
window_type
}
Not supported yet!"
raise
ValueError
(
msg
)
if
win_length
<
n_fft
:
window
=
F
.
pad
(
window
,
(
0
,
n_fft
-
win_length
))
elif
win_length
>
n_fft
:
window
=
window
[:
n_fft
]
# https://en.wikipedia.org/wiki/Discrete_Fourier_transform
# (n_bins, n_fft) complex
# (n_bins, n_fft) complex
kernel_size
=
min
(
n_fft
,
win_length
)
n
=
np
.
arange
(
0
,
self
.
n_fft
,
1.
)
weight
=
np
.
fft
.
fft
(
np
.
eye
(
n_fft
))[:
self
.
n_bin
,
:
kernel_size
]
wsin
=
np
.
empty
((
self
.
n_bin
,
kernel_size
))
#[Cout, kernel_size]
w_real
=
weight
.
real
wcos
=
np
.
empty
((
self
.
n_bin
,
kernel_size
))
#[Cout, kernel_size]
w_imag
=
weight
.
imag
for
k
in
range
(
self
.
n_bin
):
# Only half of the bins contain useful info
wsin
[
k
,:]
=
np
.
sin
(
2
*
np
.
pi
*
k
*
n
/
self
.
n_fft
)[:
kernel_size
]
wcos
[
k
,:]
=
np
.
cos
(
2
*
np
.
pi
*
k
*
n
/
self
.
n_fft
)[:
kernel_size
]
w_real
=
wcos
w_imag
=
wsin
# https://en.wikipedia.org/wiki/DFT_matrix
# https://ccrma.stanford.edu/~jos/st/Matrix_Formulation_DFT.html
# weight = np.fft.fft(np.eye(n_fft))[:self.n_bin, :kernel_size]
# w_real = weight.real
# w_imag = weight.imag
# (2 * n_bins, kernel_size)
# (2 * n_bins, kernel_size)
w
=
np
.
concatenate
([
w_real
,
w_imag
],
axis
=
0
)
#w = np.concatenate([w_real, w_imag], axis=0)
w
=
w_real
w
=
w
*
window
w
=
w
*
window
# (2 * n_bins, 1, kernel_size) # (C_out, C_in, kernel_size)
# (2 * n_bins, 1, kernel_size) # (C_out, C_in, kernel_size)
...
@@ -118,29 +159,30 @@ class STFT(nn.Layer):
...
@@ -118,29 +159,30 @@ class STFT(nn.Layer):
------------
------------
x : Tensor [shape=(B, T)]
x : Tensor [shape=(B, T)]
The input waveform.
The input waveform.
num_samples : Tensor
num_samples : Tensor
[shape=(B,)]
Number of samples of each waveform.
Number of samples of each waveform.
Returns
Returns
------------
------------
D : Tensor
D : Tensor
Shape(
N
, T', n_bins, 2) Spectrogram.
Shape(
B
, T', n_bins, 2) Spectrogram.
num_frames: Tensor
num_frames: Tensor
Shape (
N
,) number of samples of each spectrogram
Shape (
B
,) number of samples of each spectrogram
"""
"""
num_frames
=
(
num_samples
-
self
.
win_length
)
//
self
.
hop
_length
num_frames
=
(
num_samples
-
self
.
win_length
)
//
self
.
stride
_length
padding
=
(
0
,
0
)
padding
=
(
0
,
0
)
if
not
self
.
clip
:
if
not
self
.
clip
:
num_frames
+=
1
num_frames
+=
1
padding
=
(
0
,
self
.
hop_length
-
1
)
need_samples
=
num_frames
*
self
.
stride_length
+
self
.
win_length
padding
=
(
0
,
need_samples
-
num_samples
-
1
)
batch_size
,
_
,
_
=
paddle
.
shape
(
x
)
batch_size
,
_
=
paddle
.
shape
(
x
)
x
=
x
.
unsqueeze
(
-
1
)
x
=
x
.
unsqueeze
(
-
1
)
D
=
F
.
conv1d
(
self
.
weight
,
D
=
F
.
conv1d
(
x
,
self
.
weight
,
x
,
stride
=
(
self
.
stride_length
,
),
stride
=
(
self
.
hop_length
,
),
padding
=
padding
,
padding
=
padding
,
data_format
=
"NLC"
)
data_format
=
"NLC"
)
D
=
paddle
.
reshape
(
D
,
[
batch_size
,
-
1
,
self
.
n_bin
,
2
])
#D = paddle.reshape(D, [batch_size, -1, self.n_bin, 2])
D
=
paddle
.
reshape
(
D
,
[
batch_size
,
-
1
,
self
.
n_bin
,
1
])
return
D
,
num_frames
return
D
,
num_frames
third_party/paddle_audio/frontend/kaldi_test.py
0 → 100644
浏览文件 @
b98c7cd1
from
typing
import
Tuple
import
numpy
as
np
import
paddle
import
unittest
import
decimal
import
numpy
import
math
import
logging
from
pathlib
import
Path
from
third_party.paddle_audio.frontend
import
kaldi
def round_half_up(number):
    """Round ``number`` to the nearest integer using ``ROUND_HALF_UP``.

    Unlike the built-in ``round`` (which rounds ties to even), a value such
    as 2.5 becomes 3 here.
    """
    unit = decimal.Decimal('1')
    rounded = decimal.Decimal(number).quantize(
        unit, rounding=decimal.ROUND_HALF_UP)
    return int(rounded)
def rolling_window(a, window, step=1):
    """Return a strided view of overlapping windows taken along the last axis.

    A trailing axis of length ``window`` is appended to ``a`` without copying
    data; the resulting view is then subsampled with ``[::step]`` along its
    first axis.
    """
    # http://ellisvalentiner.com/post/2017-03-21-np-strides-trick
    num_windows = a.shape[-1] - window + 1
    view = numpy.lib.stride_tricks.as_strided(
        a,
        shape=(*a.shape[:-1], num_windows, window),
        strides=(*a.strides, a.strides[-1]))
    return view[::step]
def do_dither(signal, dither_value=1.0):
    """Add Gaussian noise scaled by ``dither_value`` to ``signal`` in place.

    The same array object that was passed in is returned.
    """
    noise = numpy.random.normal(size=signal.shape)
    signal += noise * dither_value
    return signal
def do_remove_dc_offset(signal):
    """Subtract the mean of ``signal`` from it in place and return it."""
    dc_offset = numpy.mean(signal)
    signal -= dc_offset
    return signal
def do_preemphasis(signal, coeff=0.97):
    """Perform preemphasis on the input signal.

    The first output sample is ``(1 - coeff) * signal[0]``; every later sample
    is ``signal[n] - coeff * signal[n - 1]``. The input is not modified.

    :param signal: The signal to filter (array-like).
    :param coeff: The preemphasis coefficient. 0 is no filter, default is 0.97.
    :returns: the filtered signal as a new array; an empty input yields an
        empty array.
    """
    signal = numpy.asarray(signal)
    if signal.size == 0:
        # Nothing to filter; indexing signal[0] below would raise IndexError.
        return signal.copy()
    return numpy.append((1 - coeff) * signal[0],
                        signal[1:] - coeff * signal[:-1])
def framesig(sig,
             frame_len,
             frame_step,
             dither=1.0,
             preemph=0.97,
             remove_dc_offset=True,
             wintype='hamming',
             stride_trick=True):
    """Frame a signal into overlapping, windowed frames.

    Every frame is dithered, has its mean removed and is pre-emphasised
    before the analysis window is applied.

    :param sig: the audio signal to frame.
    :param frame_len: length of each frame measured in samples.
    :param frame_step: number of samples after the start of the previous frame that the next frame should begin.
    :param dither: scale of the Gaussian noise added to each frame.
    :param preemph: pre-emphasis coefficient applied to each frame.
    :param remove_dc_offset: NOTE: accepted but never consulted; the mean of
        each frame is always removed.
    :param wintype: ``'povey'`` selects the povey window; any other value
        selects the Hamming window.
    :param stride_trick: use stride trick to compute the rolling window and window multiplication faster
    :returns: a tuple ``(frames, raw_frames)``, each NUMFRAMES by frame_len.
        ``frames`` is the pre-emphasised, windowed result; ``raw_frames`` is
        the snapshot taken after dither and mean removal, before pre-emphasis
        and windowing.
    """
    slen = len(sig)
    frame_len = int(round_half_up(frame_len))
    frame_step = int(round_half_up(frame_step))
    if slen <= frame_len:
        numframes = 1
    else:
        numframes = 1 + ((slen - frame_len) // frame_step)

    # check kaldi/src/feat/feature-window.h
    # Samples past the last complete frame are dropped.
    padsignal = sig[:(numframes - 1) * frame_step + frame_len]

    # Compare by value: ``is`` tests object identity and only matches when
    # the caller's string happens to be the same interned object.
    if wintype == 'povey':
        win = numpy.empty(frame_len)
        for i in range(frame_len):
            win[i] = (0.5 - 0.5 * numpy.cos(
                2 * numpy.pi / (frame_len - 1) * i))**0.85
    else:
        # the hamming window
        win = numpy.hamming(frame_len)

    if stride_trick:
        frames = rolling_window(padsignal, window=frame_len, step=frame_step)
    else:
        indices = numpy.tile(
            numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile(
                numpy.arange(0, numframes * frame_step, frame_step),
                (frame_len, 1)).T
        indices = numpy.array(indices, dtype=numpy.int32)
        frames = padsignal[indices]
        win = numpy.tile(win, (numframes, 1))

    # astype copies, so the per-frame edits below never touch ``sig``.
    frames = frames.astype(numpy.float32)
    raw_frames = numpy.zeros(frames.shape)
    for frm in range(frames.shape[0]):
        frames[frm, :] = do_dither(frames[frm, :], dither)  # dither
        frames[frm, :] = do_remove_dc_offset(
            frames[frm, :])  # remove dc offset
        raw_frames[frm, :] = frames[frm, :]
        frames[frm, :] = do_preemphasis(frames[frm, :],
                                        preemph)  # preemphasize

    return frames * win, raw_frames
def magspec(frames, NFFT):
    """Compute the magnitude spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).

    :param frames: the array of frames. Each row is a frame.
    :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
    :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the magnitude spectrum of the corresponding frame.
    """
    frame_len = numpy.shape(frames)[1]
    if frame_len > NFFT:
        # logging.warn is deprecated; logging.warning is the supported name.
        logging.warning(
            'frame length (%d) is greater than FFT size (%d), frame will be truncated. Increase NFFT to avoid.',
            frame_len, NFFT)
    complex_spec = numpy.fft.rfft(frames, NFFT)
    return numpy.absolute(complex_spec)
def powspec(frames, NFFT):
    """Compute the power spectrum of each frame in frames.

    The result is the element-wise square of ``magspec(frames, NFFT)``.

    :param frames: the array of frames. Each row is a frame.
    :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
    :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the power spectrum of the corresponding frame.
    """
    magnitude = magspec(frames, NFFT)
    return numpy.square(magnitude)
def framesig_without_dither_dc_preemphasize(sig,
                                            frame_len,
                                            frame_step,
                                            wintype='hamming',
                                            stride_trick=True):
    """Frame a signal into overlapping frames and apply an analysis window.

    No dither, mean removal or pre-emphasis is performed.

    :param sig: the audio signal to frame.
    :param frame_len: length of each frame measured in samples.
    :param frame_step: number of samples after the start of the previous frame that the next frame should begin.
    :param wintype: ``'povey'`` selects the povey window, ``''`` a
        rectangular (all-ones) window, ``'hann'`` the Hann window; any other
        value selects the Hamming window.
    :param stride_trick: use stride trick to compute the rolling window and window multiplication faster
    :returns: a tuple ``(frames * window, raw_frames)``, each NUMFRAMES by
        frame_len; ``raw_frames`` holds the float32 frames without the window.
    """
    slen = len(sig)
    frame_len = int(round_half_up(frame_len))
    frame_step = int(round_half_up(frame_step))
    if slen <= frame_len:
        numframes = 1
    else:
        numframes = 1 + ((slen - frame_len) // frame_step)

    # check kaldi/src/feat/feature-window.h
    # Samples past the last complete frame are dropped.
    padsignal = sig[:(numframes - 1) * frame_step + frame_len]

    # Compare by value: ``is`` tests object identity and only matches when
    # the caller's string happens to be the same interned object.
    if wintype == 'povey':
        win = numpy.empty(frame_len)
        for i in range(frame_len):
            win[i] = (0.5 - 0.5 * numpy.cos(
                2 * numpy.pi / (frame_len - 1) * i))**0.85
    elif wintype == '':
        win = numpy.ones(frame_len)
    elif wintype == 'hann':
        win = numpy.hanning(frame_len)
    else:
        # the hamming window
        win = numpy.hamming(frame_len)

    if stride_trick:
        frames = rolling_window(padsignal, window=frame_len, step=frame_step)
    else:
        indices = numpy.tile(
            numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile(
                numpy.arange(0, numframes * frame_step, frame_step),
                (frame_len, 1)).T
        indices = numpy.array(indices, dtype=numpy.int32)
        frames = padsignal[indices]
        win = numpy.tile(win, (numframes, 1))

    frames = frames.astype(numpy.float32)
    raw_frames = frames
    # The product is a new array, so raw_frames stays unwindowed.
    return frames * win, raw_frames
def frames(signal,
           samplerate=16000,
           winlen=0.025,
           winstep=0.01,
           nfilt=40,
           nfft=512,
           lowfreq=0,
           highfreq=None,
           wintype='hamming'):
    """Frame ``signal`` with the numpy reference implementation.

    ``winlen`` and ``winstep`` are multiplied by ``samplerate`` to obtain the
    frame length and step in samples. ``nfilt``, ``nfft``, ``lowfreq`` and
    ``highfreq`` are accepted but not used here.

    :returns: ``(frames_with_win, raw_frames)`` exactly as produced by
        ``framesig_without_dither_dc_preemphasize``.
    """
    frame_len = winlen * samplerate
    frame_step = winstep * samplerate
    windowed, unwindowed = framesig_without_dither_dc_preemphasize(
        signal, frame_len, frame_step, wintype)
    return windowed, unwindowed
def complexspec(frames, NFFT):
    """Compute the complex (one-sided) spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).

    :param frames: the array of frames. Each row is a frame.
    :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
    :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the complex ``rfft`` of the corresponding frame.
    """
    frame_len = numpy.shape(frames)[1]
    if frame_len > NFFT:
        # logging.warn is deprecated; logging.warning is the supported name.
        logging.warning(
            'frame length (%d) is greater than FFT size (%d), frame will be truncated. Increase NFFT to avoid.',
            frame_len, NFFT)
    complex_spec = numpy.fft.rfft(frames, NFFT)
    return complex_spec
def stft_with_window(signal,
                     samplerate=16000,
                     winlen=0.025,
                     winstep=0.01,
                     nfilt=40,
                     nfft=512,
                     lowfreq=0,
                     highfreq=None,
                     dither=1.0,
                     remove_dc_offset=True,
                     preemph=0.97,
                     wintype='hamming'):
    """Compute reference spectra of ``signal`` with numpy.

    The signal is framed by ``framesig_without_dither_dc_preemphasize``;
    ``winlen`` and ``winstep`` are multiplied by ``samplerate`` to obtain
    sample counts. The remaining keyword arguments other than ``nfft`` and
    ``wintype`` are accepted but not used here.

    :returns: ``(spec, scomplex, rspec, rcomplex)``: magnitude and complex
        spectra of the windowed frames, followed by magnitude and complex
        spectra of the unwindowed frames.
    """
    windowed, unwindowed = framesig_without_dither_dc_preemphasize(
        signal, winlen * samplerate, winstep * samplerate, wintype)

    # nearly the same until this part
    return (
        magspec(windowed, nfft),
        complexspec(windowed, nfft),
        magspec(unwindowed, nfft),
        complexspec(unwindowed, nfft),
    )
class TestKaldiFE(unittest.TestCase):
    """Check the paddle ``kaldi`` frontend against the numpy reference helpers."""

    def setUp(self):
        self.this_dir = Path(__file__).parent
        self.wavpath = str(self.this_dir / 'english.wav')
        # Both values are multiplied by the sample rate to get sample
        # counts, so they are expressed in seconds (25 ms / 10 ms).
        self.winlen = 0.025
        self.winstep = 0.01
        self.nfft = 512
        self.lowfreq = 0
        self.highfreq = None
        self.wintype = 'hamm'
        self.nfilt = 40

    def test_read(self):
        """``kaldi.read`` returns the same rate and samples as scipy."""
        # Aliased as ``wavfile`` so the sample array below does not rebind
        # the module name.
        import scipy.io.wavfile as wavfile
        rate, sig = wavfile.read(self.wavpath)
        sr, wav = kaldi.read(self.wavpath)
        self.assertTrue(np.all(sig == wav))
        self.assertEqual(rate, sr)

    def test_frames(self):
        """``kaldi.frames`` matches the unwindowed numpy reference frames."""
        sr, wav = kaldi.read(self.wavpath)
        _, fs = frames(
            wav,
            samplerate=sr,
            winlen=self.winlen,
            winstep=self.winstep,
            nfilt=self.nfilt,
            nfft=self.nfft,
            lowfreq=self.lowfreq,
            highfreq=self.highfreq,
            wintype=self.wintype)

        t_wav = paddle.to_tensor([wav], dtype='float32')
        t_wavlen = paddle.to_tensor([len(wav)])
        t_fs, t_nframe = kaldi.frames(
            t_wav, t_wavlen, sr, self.winlen, self.winstep, clip=False)
        t_fs = t_fs.astype(fs.dtype)[0]

        self.assertEqual(t_nframe.item(), fs.shape[0])
        self.assertTrue(np.allclose(t_fs.numpy(), fs))

    def test_stft(self):
        """The real part of ``kaldi.STFT`` matches numpy for each window type."""
        sr, wav = kaldi.read(self.wavpath)
        for wintype in ['', 'hamm', 'hann', 'povey']:
            # subTest reports which window type failed and keeps the loop
            # going for the remaining ones.
            with self.subTest(wintype=wintype):
                self.wintype = wintype
                stft_win, stft_c_win, _, _ = stft_with_window(
                    wav,
                    samplerate=sr,
                    winlen=self.winlen,
                    winstep=self.winstep,
                    nfilt=self.nfilt,
                    nfft=self.nfft,
                    lowfreq=self.lowfreq,
                    highfreq=self.highfreq,
                    wintype=self.wintype)

                t_wav = paddle.to_tensor([wav], dtype='float32')
                t_wavlen = paddle.to_tensor([len(wav)])

                stft_class = kaldi.STFT(
                    self.nfft,
                    sr,
                    self.winlen,
                    self.winstep,
                    window_type=self.wintype,
                    clip=False)
                t_stft, t_nframe = stft_class(t_wav, t_wavlen)
                t_stft = t_stft.astype(stft_win.dtype)[0]
                t_real = t_stft[:, :, 0].numpy()
                # TODO: the imaginary-part comparison was disabled in the
                # original test; re-enable it once t_stft[:, :, 1] exists.

                self.assertEqual(t_nframe.item(), stft_win.shape[0])
                # abs() so that a large negative difference also fails.
                self.assertLess(
                    abs(np.sum(t_real) - np.sum(stft_c_win.real)), 1)
                self.assertTrue(
                    np.allclose(t_real, stft_c_win.real, atol=1e-1))
# from python_speech_features import mfcc
# from python_speech_features import delta
# from python_speech_features import logfbank
# import scipy.io.wavfile as wav
# (rate,sig) = wav.read("english.wav")
# # note that generally nfilt=40 is used for speech recognition
# fbank_feat = logfbank(sig,nfilt=23,lowfreq=20,dither=0,wintype='povey')
# # the computed fbank coefficients of english.wav with dimension [110,23]
# # [ 12.2865 12.6906 13.1765 15.714 16.064 15.7553 16.5746 16.9205 16.6472 16.1302 16.4576 16.7326 16.8864 17.7215 18.88 19.1377 19.1495 18.6683 18.3886 20.3506 20.2772 18.8248 18.1899
# # 11.9198 13.146 14.7215 15.8642 17.4288 16.394 16.8238 16.1095 16.4297 16.6331 16.3163 16.5093 17.4981 18.3429 19.6555 19.6263 19.8435 19.0534 19.001 20.0287 19.7707 19.5852 19.1112
# # ...
# # ...
# # the same with that using kaldi commands: compute-fbank-feats --dither=0.0
# mfcc_feat = mfcc(sig,dither=0,useEnergy=True,wintype='povey')
# # the computed mfcc coefficients of english.wav with dimension [110,13]
# # [ 17.1337 -23.3651 -7.41751 -7.73686 -21.3682 -8.93884 -3.70843 4.68346 -16.0676 12.782 -7.24054 8.25089 10.7292
# # 17.1692 -23.3028 -5.61872 -4.0075 -23.287 -20.6101 -5.51584 -6.15273 -14.4333 8.13052 -0.0345329 2.06274 -0.564298
# # ...
# # ...
# # the same with that using kaldi commands: compute-mfcc-feats --dither=0.0
# Run the test suite when this file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
\ No newline at end of file
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录