Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
959408ba
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 1 年 前同步成功
通知
207
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
959408ba
编写于
3月 03, 2022
作者:
K
KP
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Refactor and add doc string.
上级
504c2c9d
变更
2
隐藏空白更改
内联
并排
Showing
2 changed file
with
155 addition
and
200 deletion
+155
-200
paddleaudio/paddleaudio/compliance/kaldi.py
paddleaudio/paddleaudio/compliance/kaldi.py
+145
-196
paddleaudio/paddleaudio/features/layers.py
paddleaudio/paddleaudio/features/layers.py
+10
-4
未找到文件。
paddleaudio/paddleaudio/compliance/kaldi.py
浏览文件 @
959408ba
...
...
@@ -105,7 +105,7 @@ def _get_log_energy(strided_input: Tensor, epsilon: Tensor,
def
_get_waveform_and_window_properties
(
waveform
:
Tensor
,
channel
:
int
,
s
ample_frequency
:
floa
t
,
s
r
:
in
t
,
frame_shift
:
float
,
frame_length
:
float
,
round_to_power_of_two
:
bool
,
...
...
@@ -115,9 +115,9 @@ def _get_waveform_and_window_properties(
'Invalid channel {} for size {}'
.
format
(
channel
,
waveform
.
shape
[
0
]))
waveform
=
waveform
[
channel
,
:]
# size (n)
window_shift
=
int
(
s
ample_frequency
*
frame_shift
*
s
r
*
frame_shift
*
0.001
)
# pass frame_shift and frame_length in milliseconds
window_size
=
int
(
s
ample_frequency
*
frame_length
*
0.001
)
window_size
=
int
(
s
r
*
frame_length
*
0.001
)
padded_window_size
=
_next_power_of_2
(
window_size
)
if
round_to_power_of_two
else
window_size
...
...
@@ -128,7 +128,7 @@ def _get_waveform_and_window_properties(
assert
padded_window_size
%
2
==
0
,
'the padded `window_size` must be divisible by two.'
\
' use `round_to_power_of_two` or change `frame_length`'
assert
0.
<=
preemphasis_coefficient
<=
1.0
,
'`preemphasis_coefficient` must be between [0,1]'
assert
s
ample_frequency
>
0
,
'`sample_frequency
` must be greater than zero'
assert
s
r
>
0
,
'`sr
` must be greater than zero'
return
waveform
,
window_shift
,
window_size
,
padded_window_size
...
...
@@ -147,45 +147,38 @@ def _get_window(waveform: Tensor,
dtype
=
waveform
.
dtype
epsilon
=
_get_epsilon
(
dtype
)
#
size
(m, window_size)
# (m, window_size)
strided_input
=
_get_strided
(
waveform
,
window_size
,
window_shift
,
snip_edges
)
if
dither
!=
0.0
:
# Returns a random number strictly between 0 and 1
x
=
paddle
.
maximum
(
epsilon
,
paddle
.
rand
(
strided_input
.
shape
,
dtype
=
dtype
))
rand_gauss
=
paddle
.
sqrt
(
-
2
*
x
.
log
())
*
paddle
.
cos
(
2
*
math
.
pi
*
x
)
strided_input
=
strided_input
+
rand_gauss
*
dither
if
remove_dc_offset
:
# Subtract each row/frame by its mean
row_means
=
paddle
.
mean
(
strided_input
,
axis
=
1
).
unsqueeze
(
1
)
# size (m, 1)
row_means
=
paddle
.
mean
(
strided_input
,
axis
=
1
).
unsqueeze
(
1
)
# (m, 1)
strided_input
=
strided_input
-
row_means
if
raw_energy
:
# Compute the log energy of each row/frame before applying preemphasis and
# window function
signal_log_energy
=
_get_log_energy
(
strided_input
,
epsilon
,
energy_floor
)
#
size
(m)
energy_floor
)
# (m)
if
preemphasis_coefficient
!=
0.0
:
# strided_input[i,j] -= preemphasis_coefficient * strided_input[i, max(0, j-1)] for all i,j
offset_strided_input
=
paddle
.
nn
.
functional
.
pad
(
strided_input
.
unsqueeze
(
0
),
(
1
,
0
),
data_format
=
'NCL'
,
mode
=
'replicate'
).
squeeze
(
0
)
#
size
(m, window_size + 1)
mode
=
'replicate'
).
squeeze
(
0
)
# (m, window_size + 1)
strided_input
=
strided_input
-
preemphasis_coefficient
*
offset_strided_input
[:,
:
-
1
]
# Apply window_function to each row/frame
window_function
=
_feature_window_function
(
window_type
,
window_size
,
blackman_coeff
,
dtype
).
unsqueeze
(
0
)
#
size
(1, window_size)
strided_input
=
strided_input
*
window_function
#
size
(m, window_size)
dtype
).
unsqueeze
(
0
)
# (1, window_size)
strided_input
=
strided_input
*
window_function
# (m, window_size)
#
Pad columns with zero until we reach size
(m, padded_window_size)
# (m, padded_window_size)
if
padded_window_size
!=
window_size
:
padding_right
=
padded_window_size
-
window_size
strided_input
=
paddle
.
nn
.
functional
.
pad
(
...
...
@@ -194,7 +187,6 @@ def _get_window(waveform: Tensor,
mode
=
'constant'
,
value
=
0
).
squeeze
(
0
)
# Compute energy after window function (not the raw one)
if
not
raw_energy
:
signal_log_energy
=
_get_log_energy
(
strided_input
,
epsilon
,
energy_floor
)
# size (m)
...
...
@@ -203,8 +195,6 @@ def _get_window(waveform: Tensor,
def
_subtract_column_mean
(
tensor
:
Tensor
,
subtract_mean
:
bool
)
->
Tensor
:
# subtracts the column mean of the tensor size (m, n) if subtract_mean=True
# it returns size (m, n)
if
subtract_mean
:
col_means
=
paddle
.
mean
(
tensor
,
axis
=
0
).
unsqueeze
(
0
)
tensor
=
tensor
-
col_means
...
...
@@ -218,61 +208,56 @@ def spectrogram(waveform: Tensor,
energy_floor
:
float
=
1.0
,
frame_length
:
float
=
25.0
,
frame_shift
:
float
=
10.0
,
min_duration
:
float
=
0.0
,
preemphasis_coefficient
:
float
=
0.97
,
raw_energy
:
bool
=
True
,
remove_dc_offset
:
bool
=
True
,
round_to_power_of_two
:
bool
=
True
,
s
ample_frequency
:
float
=
16000.
0
,
s
r
:
int
=
1600
0
,
snip_edges
:
bool
=
True
,
subtract_mean
:
bool
=
False
,
window_type
:
str
=
POVEY
)
->
Tensor
:
"""
[summary]
"""
Compute and return a spectrogram from a waveform. The output is identical to Kaldi's.
Args:
waveform (Tensor): [description]
blackman_coeff (float, optional): [description]. Defaults to 0.42.
channel (int, optional): [description]. Defaults to -1.
dither (float, optional): [description]. Defaults to 0.0.
energy_floor (float, optional): [description]. Defaults to 1.0.
frame_length (float, optional): [description]. Defaults to 25.0.
frame_shift (float, optional): [description]. Defaults to 10.0.
min_duration (float, optional): [description]. Defaults to 0.0.
preemphasis_coefficient (float, optional): [description]. Defaults to 0.97.
raw_energy (bool, optional): [description]. Defaults to True.
remove_dc_offset (bool, optional): [description]. Defaults to True.
round_to_power_of_two (bool, optional): [description]. Defaults to True.
sample_frequency (float, optional): [description]. Defaults to 16000.0.
snip_edges (bool, optional): [description]. Defaults to True.
subtract_mean (bool, optional): [description]. Defaults to False.
window_type (str, optional): [description]. Defaults to POVEY.
waveform (Tensor): A waveform tensor with shape [C, T].
blackman_coeff (float, optional): Coefficient for Blackman window.. Defaults to 0.42.
channel (int, optional): Select the channel of waveform. Defaults to -1.
dither (float, optional): Dithering constant . Defaults to 0.0.
energy_floor (float, optional): Floor on energy of the output Spectrogram. Defaults to 1.0.
frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0.
frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0.
preemphasis_coefficient (float, optional): Preemphasis coefficient for input waveform. Defaults to 0.97.
raw_energy (bool, optional): Whether to compute before preemphasis and windowing. Defaults to True.
remove_dc_offset (bool, optional): Whether to subtract mean from waveform on frames. Defaults to True.
round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input
to FFT. Defaults to True.
sr (int, optional): Sample rate of input waveform. Defaults to 16000.
snip_edges (bool, optional): Drop samples in the end of waveform that cann't fit a singal frame when it
is set True. Otherwise performs reflect padding to the end of waveform. Defaults to True.
subtract_mean (bool, optional): Whether to subtract mean of feature files. Defaults to False.
window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY.
Returns:
Tensor: [description]
Tensor: A spectrogram tensor with shape (m, padded_window_size // 2 + 1) where m is the number of frames
depends on frame_length and frame_shift.
"""
dtype
=
waveform
.
dtype
epsilon
=
_get_epsilon
(
dtype
)
waveform
,
window_shift
,
window_size
,
padded_window_size
=
_get_waveform_and_window_properties
(
waveform
,
channel
,
sample_frequency
,
frame_shift
,
frame_length
,
round_to_power_of_two
,
preemphasis_coefficient
)
if
len
(
waveform
)
<
min_duration
*
sample_frequency
:
# signal is too short
return
paddle
.
empty
([
0
])
waveform
,
channel
,
sr
,
frame_shift
,
frame_length
,
round_to_power_of_two
,
preemphasis_coefficient
)
strided_input
,
signal_log_energy
=
_get_window
(
waveform
,
padded_window_size
,
window_size
,
window_shift
,
window_type
,
blackman_coeff
,
snip_edges
,
raw_energy
,
energy_floor
,
dither
,
remove_dc_offset
,
preemphasis_coefficient
)
#
size
(m, padded_window_size // 2 + 1, 2)
# (m, padded_window_size // 2 + 1, 2)
fft
=
paddle
.
fft
.
rfft
(
strided_input
)
# Convert the FFT into a power spectrum
power_spectrum
=
paddle
.
maximum
(
fft
.
abs
().
pow
(
2.
),
epsilon
).
log
()
# size (m, padded_window_size // 2 + 1)
fft
.
abs
().
pow
(
2.
),
epsilon
).
log
()
# (m, padded_window_size // 2 + 1)
power_spectrum
[:,
0
]
=
signal_log_energy
power_spectrum
=
_subtract_column_mean
(
power_spectrum
,
subtract_mean
)
...
...
@@ -306,25 +291,19 @@ def _vtln_warp_freq(vtln_low_cutoff: float,
l
=
vtln_low_cutoff
*
max
(
1.0
,
vtln_warp_factor
)
h
=
vtln_high_cutoff
*
min
(
1.0
,
vtln_warp_factor
)
scale
=
1.0
/
vtln_warp_factor
Fl
=
scale
*
l
# F(l)
Fh
=
scale
*
h
# F(h)
Fl
=
scale
*
l
Fh
=
scale
*
h
assert
l
>
low_freq
and
h
<
high_freq
# slope of left part of the 3-piece linear function
scale_left
=
(
Fl
-
low_freq
)
/
(
l
-
low_freq
)
# [slope of center part is just "scale"]
# slope of right part of the 3-piece linear function
scale_right
=
(
high_freq
-
Fh
)
/
(
high_freq
-
h
)
res
=
paddle
.
empty_like
(
freq
)
outside_low_high_freq
=
paddle
.
less_than
(
freq
,
paddle
.
to_tensor
(
low_freq
))
\
|
paddle
.
greater_than
(
freq
,
paddle
.
to_tensor
(
high_freq
))
# freq < low_freq || freq > high_freq
before_l
=
paddle
.
less_than
(
freq
,
paddle
.
to_tensor
(
l
))
# freq < l
before_h
=
paddle
.
less_than
(
freq
,
paddle
.
to_tensor
(
h
))
# freq < h
after_h
=
paddle
.
greater_equal
(
freq
,
paddle
.
to_tensor
(
h
))
# freq >= h
|
paddle
.
greater_than
(
freq
,
paddle
.
to_tensor
(
high_freq
))
before_l
=
paddle
.
less_than
(
freq
,
paddle
.
to_tensor
(
l
))
before_h
=
paddle
.
less_than
(
freq
,
paddle
.
to_tensor
(
h
))
after_h
=
paddle
.
greater_equal
(
freq
,
paddle
.
to_tensor
(
h
))
# order of operations matter here (since there is overlapping frequency regions)
res
[
after_h
]
=
high_freq
+
scale_right
*
(
freq
[
after_h
]
-
high_freq
)
res
[
before_h
]
=
scale
*
freq
[
before_h
]
res
[
before_l
]
=
low_freq
+
scale_left
*
(
freq
[
before_l
]
-
low_freq
)
...
...
@@ -363,13 +342,10 @@ def _get_mel_banks(num_bins: int,
assert
(
0.0
<=
low_freq
<
nyquist
)
and
(
0.0
<
high_freq
<=
nyquist
)
and
(
low_freq
<
high_freq
),
\
(
'Bad values in options: low-freq {} and high-freq {} vs. nyquist {}'
.
format
(
low_freq
,
high_freq
,
nyquist
))
# fft-bin width [think of it as Nyquist-freq / half-window-length]
fft_bin_width
=
sample_freq
/
window_length_padded
mel_low_freq
=
_mel_scale_scalar
(
low_freq
)
mel_high_freq
=
_mel_scale_scalar
(
high_freq
)
# divide by num_bins+1 in next line because of end-effects where the bins
# spread out to the sides.
mel_freq_delta
=
(
mel_high_freq
-
mel_low_freq
)
/
(
num_bins
+
1
)
if
vtln_high
<
0.0
:
...
...
@@ -381,10 +357,9 @@ def _get_mel_banks(num_bins: int,
'low-freq {} and high-freq {}'
.
format
(
vtln_low
,
vtln_high
,
low_freq
,
high_freq
))
bin
=
paddle
.
arange
(
num_bins
).
unsqueeze
(
1
)
left_mel
=
mel_low_freq
+
bin
*
mel_freq_delta
# size(num_bins, 1)
center_mel
=
mel_low_freq
+
(
bin
+
1.0
)
*
mel_freq_delta
# size(num_bins, 1)
right_mel
=
mel_low_freq
+
(
bin
+
2.0
)
*
mel_freq_delta
# size(num_bins, 1)
left_mel
=
mel_low_freq
+
bin
*
mel_freq_delta
# (num_bins, 1)
center_mel
=
mel_low_freq
+
(
bin
+
1.0
)
*
mel_freq_delta
# (num_bins, 1)
right_mel
=
mel_low_freq
+
(
bin
+
2.0
)
*
mel_freq_delta
# (num_bins, 1)
if
vtln_warp_factor
!=
1.0
:
left_mel
=
_vtln_warp_mel_freq
(
vtln_low
,
vtln_high
,
low_freq
,
high_freq
,
...
...
@@ -395,25 +370,23 @@ def _get_mel_banks(num_bins: int,
right_mel
=
_vtln_warp_mel_freq
(
vtln_low
,
vtln_high
,
low_freq
,
high_freq
,
vtln_warp_factor
,
right_mel
)
center_freqs
=
_inverse_mel_scale
(
center_mel
)
#
size
(num_bins)
#
size
(1, num_fft_bins)
center_freqs
=
_inverse_mel_scale
(
center_mel
)
# (num_bins)
# (1, num_fft_bins)
mel
=
_mel_scale
(
fft_bin_width
*
paddle
.
arange
(
num_fft_bins
)).
unsqueeze
(
0
)
#
size
(num_bins, num_fft_bins)
# (num_bins, num_fft_bins)
up_slope
=
(
mel
-
left_mel
)
/
(
center_mel
-
left_mel
)
down_slope
=
(
right_mel
-
mel
)
/
(
right_mel
-
center_mel
)
if
vtln_warp_factor
==
1.0
:
# left_mel < center_mel < right_mel so we can min the two slopes and clamp negative values
bins
=
paddle
.
maximum
(
paddle
.
zeros
([
1
]),
paddle
.
minimum
(
up_slope
,
down_slope
))
else
:
# warping can move the order of left_mel, center_mel, right_mel anywhere
bins
=
paddle
.
zeros_like
(
up_slope
)
up_idx
=
paddle
.
greater_than
(
mel
,
left_mel
)
&
paddle
.
less_than
(
mel
,
center_mel
)
# left_mel < mel <= center_mel
mel
,
center_mel
)
down_idx
=
paddle
.
greater_than
(
mel
,
center_mel
)
&
paddle
.
less_than
(
mel
,
right_mel
)
# center_mel < mel < right_mel
mel
,
right_mel
)
bins
[
up_idx
]
=
up_slope
[
up_idx
]
bins
[
down_idx
]
=
down_slope
[
down_idx
]
...
...
@@ -430,13 +403,12 @@ def fbank(waveform: Tensor,
high_freq
:
float
=
0.0
,
htk_compat
:
bool
=
False
,
low_freq
:
float
=
20.0
,
min_duration
:
float
=
0.0
,
num_mel_bins
:
int
=
23
,
n_mels
:
int
=
23
,
preemphasis_coefficient
:
float
=
0.97
,
raw_energy
:
bool
=
True
,
remove_dc_offset
:
bool
=
True
,
round_to_power_of_two
:
bool
=
True
,
s
ample_frequency
:
float
=
16000.
0
,
s
r
:
int
=
1600
0
,
snip_edges
:
bool
=
True
,
subtract_mean
:
bool
=
False
,
use_energy
:
bool
=
False
,
...
...
@@ -446,83 +418,75 @@ def fbank(waveform: Tensor,
vtln_low
:
float
=
100.0
,
vtln_warp
:
float
=
1.0
,
window_type
:
str
=
POVEY
)
->
Tensor
:
"""
[summary]
"""
Compute and return filter banks from a waveform. The output is identical to Kaldi's.
Args:
waveform (Tensor): [description]
blackman_coeff (float, optional): [description]. Defaults to 0.42.
channel (int, optional): [description]. Defaults to -1.
dither (float, optional): [description]. Defaults to 0.0.
energy_floor (float, optional): [description]. Defaults to 1.0.
frame_length (float, optional): [description]. Defaults to 25.0.
frame_shift (float, optional): [description]. Defaults to 10.0.
high_freq (float, optional): [description]. Defaults to 0.0.
htk_compat (bool, optional): [description]. Defaults to False.
low_freq (float, optional): [description]. Defaults to 20.0.
min_duration (float, optional): [description]. Defaults to 0.0.
num_mel_bins (int, optional): [description]. Defaults to 23.
preemphasis_coefficient (float, optional): [description]. Defaults to 0.97.
raw_energy (bool, optional): [description]. Defaults to True.
remove_dc_offset (bool, optional): [description]. Defaults to True.
round_to_power_of_two (bool, optional): [description]. Defaults to True.
sample_frequency (float, optional): [description]. Defaults to 16000.0.
snip_edges (bool, optional): [description]. Defaults to True.
subtract_mean (bool, optional): [description]. Defaults to False.
use_energy (bool, optional): [description]. Defaults to False.
use_log_fbank (bool, optional): [description]. Defaults to True.
use_power (bool, optional): [description]. Defaults to True.
vtln_high (float, optional): [description]. Defaults to -500.0.
vtln_low (float, optional): [description]. Defaults to 100.0.
vtln_warp (float, optional): [description]. Defaults to 1.0.
window_type (str, optional): [description]. Defaults to POVEY.
waveform (Tensor): A waveform tensor with shape [C, T].
blackman_coeff (float, optional): Coefficient for Blackman window.. Defaults to 0.42.
channel (int, optional): Select the channel of waveform. Defaults to -1.
dither (float, optional): Dithering constant . Defaults to 0.0.
energy_floor (float, optional): Floor on energy of the output Spectrogram. Defaults to 1.0.
frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0.
frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0.
high_freq (float, optional): The upper cut-off frequency. Defaults to 0.0.
htk_compat (bool, optional): Put energy to the last when it is set True. Defaults to False.
low_freq (float, optional): The lower cut-off frequency. Defaults to 20.0.
n_mels (int, optional): Number of output mel bins. Defaults to 23.
preemphasis_coefficient (float, optional): Preemphasis coefficient for input waveform. Defaults to 0.97.
raw_energy (bool, optional): Whether to compute before preemphasis and windowing. Defaults to True.
remove_dc_offset (bool, optional): Whether to subtract mean from waveform on frames. Defaults to True.
round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input
to FFT. Defaults to True.
sr (int, optional): Sample rate of input waveform. Defaults to 16000.
snip_edges (bool, optional): Drop samples in the end of waveform that cann't fit a singal frame when it
is set True. Otherwise performs reflect padding to the end of waveform. Defaults to True.
subtract_mean (bool, optional): Whether to subtract mean of feature files. Defaults to False.
use_energy (bool, optional): Add an dimension with energy of spectrogram to the output. Defaults to False.
use_log_fbank (bool, optional): Return log fbank when it is set True. Defaults to True.
use_power (bool, optional): Whether to use power instead of magnitude. Defaults to True.
vtln_high (float, optional): High inflection point in piecewise linear VTLN warping function. Defaults to -500.0.
vtln_low (float, optional): Low inflection point in piecewise linear VTLN warping function. Defaults to 100.0.
vtln_warp (float, optional): Vtln warp factor. Defaults to 1.0.
window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY.
Returns:
Tensor:
[description]
Tensor:
A filter banks tensor with shape (m, n_mels).
"""
dtype
=
waveform
.
dtype
waveform
,
window_shift
,
window_size
,
padded_window_size
=
_get_waveform_and_window_properties
(
waveform
,
channel
,
sample_frequency
,
frame_shift
,
frame_length
,
round_to_power_of_two
,
preemphasis_coefficient
)
if
len
(
waveform
)
<
min_duration
*
sample_frequency
:
# signal is too short
return
paddle
.
empty
([
0
],
dtype
=
dtype
)
waveform
,
channel
,
sr
,
frame_shift
,
frame_length
,
round_to_power_of_two
,
preemphasis_coefficient
)
# strided_input, size (m, padded_window_size) and signal_log_energy, size (m)
strided_input
,
signal_log_energy
=
_get_window
(
waveform
,
padded_window_size
,
window_size
,
window_shift
,
window_type
,
blackman_coeff
,
snip_edges
,
raw_energy
,
energy_floor
,
dither
,
remove_dc_offset
,
preemphasis_coefficient
)
#
size
(m, padded_window_size // 2 + 1)
# (m, padded_window_size // 2 + 1)
spectrum
=
paddle
.
fft
.
rfft
(
strided_input
).
abs
()
if
use_power
:
spectrum
=
spectrum
.
pow
(
2.
)
# size (num_mel_bins, padded_window_size // 2)
mel_energies
,
_
=
_get_mel_banks
(
num_mel_bins
,
padded_window_size
,
sample_frequency
,
low_freq
,
high_freq
,
vtln_low
,
vtln_high
,
vtln_warp
)
# (n_mels, padded_window_size // 2)
mel_energies
,
_
=
_get_mel_banks
(
n_mels
,
padded_window_size
,
sr
,
low_freq
,
high_freq
,
vtln_low
,
vtln_high
,
vtln_warp
)
mel_energies
=
mel_energies
.
astype
(
dtype
)
#
pad right column with zeros and add dimension, size (num_mel_bin
s, padded_window_size // 2 + 1)
#
(n_mel
s, padded_window_size // 2 + 1)
mel_energies
=
paddle
.
nn
.
functional
.
pad
(
mel_energies
.
unsqueeze
(
0
),
(
0
,
1
),
data_format
=
'NCL'
,
mode
=
'constant'
,
value
=
0
).
squeeze
(
0
)
#
sum with mel fiterbanks over the power spectrum, size (m, num_mel_bin
s)
#
(m, n_mel
s)
mel_energies
=
paddle
.
mm
(
spectrum
,
mel_energies
.
T
)
if
use_log_fbank
:
# avoid log of zero (which should be prevented anyway by dithering)
mel_energies
=
paddle
.
maximum
(
mel_energies
,
_get_epsilon
(
dtype
)).
log
()
# if use_energy then add it as the last column for htk_compat == true else first column
if
use_energy
:
signal_log_energy
=
signal_log_energy
.
unsqueeze
(
1
)
# size (m, 1)
# returns size (m, num_mel_bins + 1)
signal_log_energy
=
signal_log_energy
.
unsqueeze
(
1
)
if
htk_compat
:
mel_energies
=
paddle
.
concat
(
(
mel_energies
,
signal_log_energy
),
axis
=
1
)
...
...
@@ -530,28 +494,20 @@ def fbank(waveform: Tensor,
mel_energies
=
paddle
.
concat
(
(
signal_log_energy
,
mel_energies
),
axis
=
1
)
# (m, n_mels + 1)
mel_energies
=
_subtract_column_mean
(
mel_energies
,
subtract_mean
)
return
mel_energies
def
_get_dct_matrix
(
num_ceps
:
int
,
num_mel_bins
:
int
)
->
Tensor
:
# returns a dct matrix of size (num_mel_bins, num_ceps)
# size (num_mel_bins, num_mel_bins)
dct_matrix
=
create_dct
(
num_mel_bins
,
num_mel_bins
,
'ortho'
)
# kaldi expects the first cepstral to be weighted sum of factor sqrt(1/num_mel_bins)
# this would be the first column in the dct_matrix for torchaudio as it expects a
# right multiply (which would be the first column of the kaldi's dct_matrix as kaldi
# expects a left multiply e.g. dct_matrix * vector).
dct_matrix
[:,
0
]
=
math
.
sqrt
(
1
/
float
(
num_mel_bins
))
dct_matrix
=
dct_matrix
[:,
:
num_ceps
]
def
_get_dct_matrix
(
n_mfcc
:
int
,
n_mels
:
int
)
->
Tensor
:
dct_matrix
=
create_dct
(
n_mels
,
n_mels
,
'ortho'
)
dct_matrix
[:,
0
]
=
math
.
sqrt
(
1
/
float
(
n_mels
))
dct_matrix
=
dct_matrix
[:,
:
n_mfcc
]
# (n_mels, n_mfcc)
return
dct_matrix
def
_get_lifter_coeffs
(
num_ceps
:
int
,
cepstral_lifter
:
float
)
->
Tensor
:
# returns size (num_ceps)
# Compute liftering coefficients (scaling on cepstral coeffs)
# coeffs are numbered slightly differently from HTK: the zeroth index is C0, which is not affected.
i
=
paddle
.
arange
(
num_ceps
)
def
_get_lifter_coeffs
(
n_mfcc
:
int
,
cepstral_lifter
:
float
)
->
Tensor
:
i
=
paddle
.
arange
(
n_mfcc
)
return
1.0
+
0.5
*
cepstral_lifter
*
paddle
.
sin
(
math
.
pi
*
i
/
cepstral_lifter
)
...
...
@@ -567,14 +523,13 @@ def mfcc(waveform: Tensor,
high_freq
:
float
=
0.0
,
htk_compat
:
bool
=
False
,
low_freq
:
float
=
20.0
,
num_ceps
:
int
=
13
,
min_duration
:
float
=
0.0
,
num_mel_bins
:
int
=
23
,
n_mfcc
:
int
=
13
,
n_mels
:
int
=
23
,
preemphasis_coefficient
:
float
=
0.97
,
raw_energy
:
bool
=
True
,
remove_dc_offset
:
bool
=
True
,
round_to_power_of_two
:
bool
=
True
,
s
ample_frequency
:
float
=
16000.
0
,
s
r
:
int
=
1600
0
,
snip_edges
:
bool
=
True
,
subtract_mean
:
bool
=
False
,
use_energy
:
bool
=
False
,
...
...
@@ -582,47 +537,47 @@ def mfcc(waveform: Tensor,
vtln_low
:
float
=
100.0
,
vtln_warp
:
float
=
1.0
,
window_type
:
str
=
POVEY
)
->
Tensor
:
"""[summary]
"""Compute and return mel frequency cepstral coefficients from a waveform. The output is
identical to Kaldi's.
Args:
waveform (Tensor): [description]
blackman_coeff (float, optional): [description]. Defaults to 0.42.
cepstral_lifter (float, optional): [description]. Defaults to 22.0.
channel (int, optional): [description]. Defaults to -1.
dither (float, optional): [description]. Defaults to 0.0.
energy_floor (float, optional): [description]. Defaults to 1.0.
frame_length (float, optional): [description]. Defaults to 25.0.
frame_shift (float, optional): [description]. Defaults to 10.0.
high_freq (float, optional): [description]. Defaults to 0.0.
htk_compat (bool, optional): [description]. Defaults to False.
low_freq (float, optional): [description]. Defaults to 20.0.
num_ceps (int, optional): [description]. Defaults to 13.
min_duration (float, optional): [description]. Defaults to 0.0.
num_mel_bins (int, optional): [description]. Defaults to 23.
preemphasis_coefficient (float, optional): [description]. Defaults to 0.97.
raw_energy (bool, optional): [description]. Defaults to True.
remove_dc_offset (bool, optional): [description]. Defaults to True.
round_to_power_of_two (bool, optional): [description]. Defaults to True.
sample_frequency (float, optional): [description]. Defaults to 16000.0.
snip_edges (bool, optional): [description]. Defaults to True.
subtract_mean (bool, optional): [description]. Defaults to False.
use_energy (bool, optional): [description]. Defaults to False.
vtln_high (float, optional): [description]. Defaults to -500.0.
vtln_low (float, optional): [description]. Defaults to 100.0.
vtln_warp (float, optional): [description]. Defaults to 1.0.
window_type (str, optional): [description]. Defaults to POVEY.
waveform (Tensor): A waveform tensor with shape [C, T].
blackman_coeff (float, optional): Coefficient for Blackman window.. Defaults to 0.42.
cepstral_lifter (float, optional): Scaling of output mfccs. Defaults to 22.0.
channel (int, optional): Select the channel of waveform. Defaults to -1.
dither (float, optional): Dithering constant . Defaults to 0.0.
energy_floor (float, optional): Floor on energy of the output Spectrogram. Defaults to 1.0.
frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0.
frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0.
high_freq (float, optional): The upper cut-off frequency. Defaults to 0.0.
htk_compat (bool, optional): Put energy to the last when it is set True. Defaults to False.
low_freq (float, optional): The lower cut-off frequency. Defaults to 20.0.
n_mfcc (int, optional): Number of cepstra in MFCC. Defaults to 13.
n_mels (int, optional): Number of output mel bins. Defaults to 23.
preemphasis_coefficient (float, optional): Preemphasis coefficient for input waveform. Defaults to 0.97.
raw_energy (bool, optional): Whether to compute before preemphasis and windowing. Defaults to True.
remove_dc_offset (bool, optional): Whether to subtract mean from waveform on frames. Defaults to True.
round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input
to FFT. Defaults to True.
sr (int, optional): Sample rate of input waveform. Defaults to 16000.
snip_edges (bool, optional): Drop samples in the end of waveform that cann't fit a singal frame when it
is set True. Otherwise performs reflect padding to the end of waveform. Defaults to True.
subtract_mean (bool, optional): Whether to subtract mean of feature files. Defaults to False.
use_energy (bool, optional): Add an dimension with energy of spectrogram to the output. Defaults to False.
vtln_high (float, optional): High inflection point in piecewise linear VTLN warping function. Defaults to -500.0.
vtln_low (float, optional): Low inflection point in piecewise linear VTLN warping function. Defaults to 100.0.
vtln_warp (float, optional): Vtln warp factor. Defaults to 1.0.
window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY.
Returns:
Tensor:
[description]
Tensor:
A mel frequency cepstral coefficients tensor with shape (m, n_mfcc).
"""
assert
n
um_ceps
<=
num_mel_bins
,
'num_ceps cannot be larger than num_mel_bin
s: %d vs %d'
%
(
n
um_ceps
,
num_mel_bin
s
)
assert
n
_mfcc
<=
n_mels
,
'n_mfcc cannot be larger than n_mel
s: %d vs %d'
%
(
n
_mfcc
,
n_mel
s
)
dtype
=
waveform
.
dtype
# The mel_energies should not be squared (use_power=True), not have mean subtracted
# (subtract_mean=False), and use log (use_log_fbank=True).
# size (m, num_mel_bins + use_energy)
# (m, n_mels + use_energy)
feature
=
fbank
(
waveform
=
waveform
,
blackman_coeff
=
blackman_coeff
,
...
...
@@ -634,13 +589,12 @@ def mfcc(waveform: Tensor,
high_freq
=
high_freq
,
htk_compat
=
htk_compat
,
low_freq
=
low_freq
,
min_duration
=
min_duration
,
num_mel_bins
=
num_mel_bins
,
n_mels
=
n_mels
,
preemphasis_coefficient
=
preemphasis_coefficient
,
raw_energy
=
raw_energy
,
remove_dc_offset
=
remove_dc_offset
,
round_to_power_of_two
=
round_to_power_of_two
,
s
ample_frequency
=
sample_frequency
,
s
r
=
sr
,
snip_edges
=
snip_edges
,
subtract_mean
=
False
,
use_energy
=
use_energy
,
...
...
@@ -652,34 +606,29 @@ def mfcc(waveform: Tensor,
window_type
=
window_type
)
if
use_energy
:
# size (m)
signal_log_energy
=
feature
[:,
num_mel_bins
if
htk_compat
else
0
]
# offset is 0 if htk_compat==True else 1
# (m)
signal_log_energy
=
feature
[:,
n_mels
if
htk_compat
else
0
]
mel_offset
=
int
(
not
htk_compat
)
feature
=
feature
[:,
mel_offset
:(
n
um_mel_bin
s
+
mel_offset
)]
feature
=
feature
[:,
mel_offset
:(
n
_mel
s
+
mel_offset
)]
#
size (num_mel_bins, num_ceps
)
dct_matrix
=
_get_dct_matrix
(
n
um_ceps
,
num_mel_bin
s
).
astype
(
dtype
=
dtype
)
#
(n_mels, n_mfcc
)
dct_matrix
=
_get_dct_matrix
(
n
_mfcc
,
n_mel
s
).
astype
(
dtype
=
dtype
)
#
size (m, num_ceps
)
#
(m, n_mfcc
)
feature
=
feature
.
matmul
(
dct_matrix
)
if
cepstral_lifter
!=
0.0
:
# size (1, num_ceps)
lifter_coeffs
=
_get_lifter_coeffs
(
num_ceps
,
cepstral_lifter
).
unsqueeze
(
0
)
# (1, n_mfcc)
lifter_coeffs
=
_get_lifter_coeffs
(
n_mfcc
,
cepstral_lifter
).
unsqueeze
(
0
)
feature
*=
lifter_coeffs
.
astype
(
dtype
=
dtype
)
# if use_energy then replace the last column for htk_compat == true else first column
if
use_energy
:
feature
[:,
0
]
=
signal_log_energy
if
htk_compat
:
energy
=
feature
[:,
0
].
unsqueeze
(
1
)
#
size
(m, 1)
feature
=
feature
[:,
1
:]
#
size (m, num_ceps
- 1)
energy
=
feature
[:,
0
].
unsqueeze
(
1
)
# (m, 1)
feature
=
feature
[:,
1
:]
#
(m, n_mfcc
- 1)
if
not
use_energy
:
# scale on C0 (actually removing a scale we previously added that's
# part of one common definition of the cosine transform.)
energy
*=
math
.
sqrt
(
2
)
feature
=
paddle
.
concat
((
feature
,
energy
),
axis
=
1
)
...
...
paddleaudio/paddleaudio/features/layers.py
浏览文件 @
959408ba
...
...
@@ -261,12 +261,18 @@ class MFCC(nn.Layer):
sr
:
int
=
22050
,
n_mfcc
:
int
=
40
,
norm
:
str
=
'ortho'
,
dtype
:
str
=
paddle
.
float32
,
**
kwargs
):
"""[summary]
"""Compute mel frequency cepstral coefficients(MFCCs) feature of given waveforms.
Parameters:
sr (int, optional): [description]. Defaults to 22050.
n_mfcc (int, optional): [description]. Defaults to 40.
norm (str, optional): [description]. Defaults to 'ortho'.
sr(int): the audio sample rate.
The default value is 22050.
n_mfcc (int, optional): Number of cepstra in MFCC. Defaults to 40.
norm(str|float): the normalization type in computing fbank matrix. Slaney-style is used by default.
You can specify norm=1.0/2.0 to use customized p-norm normalization.
dtype(str): the datatype of fbank matrix used in the transform. Use float64 to increase numerical
accuracy. Note that the final transform will be conducted in float32 regardless of dtype of fbank matrix.
"""
super
(
MFCC
,
self
).
__init__
()
self
.
_log_melspectrogram
=
LogMelSpectrogram
(
sr
=
sr
,
**
kwargs
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录