Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
25cb4bb0
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 1 年 前同步成功
通知
207
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
未验证
提交
25cb4bb0
编写于
3月 02, 2022
作者:
H
Hui Zhang
提交者:
GitHub
3月 02, 2022
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #1518 from KPatr1ck/audio
[audio]refactor audio arch
上级
39e5a81a
f4c72054
变更
15
展开全部
隐藏空白更改
内联
并排
Showing
15 changed file
with
1877 addition
and
1153 deletion
+1877
-1153
paddleaudio/paddleaudio/__init__.py
paddleaudio/paddleaudio/__init__.py
+2
-0
paddleaudio/paddleaudio/backends/__init__.py
paddleaudio/paddleaudio/backends/__init__.py
+6
-0
paddleaudio/paddleaudio/backends/soundfile_backend.py
paddleaudio/paddleaudio/backends/soundfile_backend.py
+252
-0
paddleaudio/paddleaudio/compliance/__init__.py
paddleaudio/paddleaudio/compliance/__init__.py
+0
-0
paddleaudio/paddleaudio/compliance/kaldi.py
paddleaudio/paddleaudio/compliance/kaldi.py
+688
-0
paddleaudio/paddleaudio/compliance/librosa.py
paddleaudio/paddleaudio/compliance/librosa.py
+728
-0
paddleaudio/paddleaudio/datasets/__init__.py
paddleaudio/paddleaudio/datasets/__init__.py
+0
-7
paddleaudio/paddleaudio/datasets/dataset.py
paddleaudio/paddleaudio/datasets/dataset.py
+2
-2
paddleaudio/paddleaudio/features/__init__.py
paddleaudio/paddleaudio/features/__init__.py
+4
-3
paddleaudio/paddleaudio/features/layers.py
paddleaudio/paddleaudio/features/layers.py
+30
-211
paddleaudio/paddleaudio/functional/__init__.py
paddleaudio/paddleaudio/functional/__init__.py
+7
-0
paddleaudio/paddleaudio/functional/functional.py
paddleaudio/paddleaudio/functional/functional.py
+156
-620
paddleaudio/paddleaudio/io/__init__.py
paddleaudio/paddleaudio/io/__init__.py
+1
-7
paddleaudio/paddleaudio/io/audio.py
paddleaudio/paddleaudio/io/audio.py
+0
-303
paddleaudio/paddleaudio/metric/mcd.py
paddleaudio/paddleaudio/metric/mcd.py
+1
-0
未找到文件。
paddleaudio/paddleaudio/__init__.py
浏览文件 @
25cb4bb0
...
...
@@ -11,3 +11,5 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
.backends
import
load
from
.backends
import
save
paddleaudio/paddleaudio/backends/__init__.py
浏览文件 @
25cb4bb0
...
...
@@ -11,3 +11,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
.soundfile_backend
import
depth_convert
from
.soundfile_backend
import
load
from
.soundfile_backend
import
normalize
from
.soundfile_backend
import
resample
from
.soundfile_backend
import
save
from
.soundfile_backend
import
to_mono
paddleaudio/paddleaudio/backends/soundfile_backend.py
浏览文件 @
25cb4bb0
...
...
@@ -11,3 +11,255 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
warnings
from
typing
import
Optional
from
typing
import
Tuple
from
typing
import
Union
import
numpy
as
np
import
resampy
import
soundfile
as
sf
from
numpy
import
ndarray
as
array
from
scipy.io
import
wavfile
from
..utils
import
ParameterError
__all__
=
[
'resample'
,
'to_mono'
,
'depth_convert'
,
'normalize'
,
'save'
,
'load'
,
]
# Normalisation schemes understood by ``normalize``.
NORMALMIZE_TYPES = [
    'linear',
    'gaussian',
]

# Channel-merging strategies understood by ``to_mono``.
MERGE_TYPES = [
    'ch0',
    'ch1',
    'random',
    'average',
]

# resampy filter names accepted by ``resample``.
RESAMPLE_MODES = [
    'kaiser_best',
    'kaiser_fast',
]

# Small positive constant used to keep divisions away from zero.
EPS = 1e-8
def resample(y: array, src_sr: int, target_sr: int,
             mode: str = 'kaiser_fast') -> array:
    """Resample a waveform from ``src_sr`` to ``target_sr``.

    This is a thin, argument-checking wrapper around ``resampy.resample``.

    Parameters:
        y: the waveform, as a numpy array.
        src_sr: sample rate of ``y``.
        target_sr: sample rate to convert to.
        mode: resampy filter name; must be one of ``RESAMPLE_MODES``.

    Returns:
        Whatever ``resampy.resample(y, src_sr, target_sr, filter=mode)`` returns.

    Raises:
        ParameterError: if ``y`` is not a numpy array or ``mode`` is not in
            ``RESAMPLE_MODES``.

    Notes:
        The default mode is kaiser_fast. For better audio quality, use
        mode = 'kaiser_best' (a warning is issued because it is slow).
    """
    if mode == 'kaiser_best':
        warnings.warn(
            f'Using resampy in kaiser_best to {src_sr}=>{target_sr}. '
            'This function is pretty slow, '
            'we recommend the mode kaiser_fast in large scale audio trainning')

    if not isinstance(y, np.ndarray):
        # The original literal lacked the ``f`` prefix, so the offending type
        # was never shown in the message.
        raise ParameterError(
            f'Only support numpy array, but received y in {type(y)}')

    if mode not in RESAMPLE_MODES:
        raise ParameterError(f'resample mode must in {RESAMPLE_MODES}')

    return resampy.resample(y, src_sr, target_sr, filter=mode)
def to_mono(y: array, merge_type: str = 'average') -> array:
    """Convert a stereo waveform to mono.

    Channels are taken along the first axis of ``y`` (``y[0]`` and ``y[1]``).

    Parameters:
        y: 1-D (already mono) or 2-D waveform.
        merge_type: one of ``MERGE_TYPES``:
            'ch0' / 'ch1' return that channel, 'random' returns one of the
            first two channels picked with ``np.random.randint``, and
            'average' returns the mean of the first two channels.

    Returns:
        ``y`` itself when it is 1-D, otherwise the merged 1-D channel.

    Raises:
        ParameterError: on an unknown ``merge_type``, on ``y.ndim > 2``, or
            when averaging a dtype other than float32/float64/int16/int8.
    """
    if merge_type not in MERGE_TYPES:
        raise ParameterError(
            f'Unsupported merge type {merge_type}, available types are {MERGE_TYPES}'
        )
    if y.ndim > 2:
        raise ParameterError(
            f'Unsupported audio array, y.ndim > 2, the shape is {y.shape}')
    if y.ndim == 1:  # nothing to merge
        return y

    if merge_type == 'ch0':
        return y[0]
    if merge_type == 'ch1':
        return y[1]
    if merge_type == 'random':
        return y[np.random.randint(0, 2)]

    # merge_type == 'average': the arithmetic depends on the sample dtype.
    if y.dtype in ('float32', 'float64'):
        # float64 used to be rejected although the float formula is the same.
        return (y[0] + y[1]) * 0.5

    # Integer samples are widened first so the sum of two channels cannot
    # overflow, then narrowed back to the input dtype.
    wider_dtype = {'int16': 'int32', 'int8': 'int16'}.get(y.dtype.name)
    if wider_dtype is None:
        raise ParameterError(f'Unsupported dtype: {y.dtype}')
    widened = y.astype(wider_dtype)
    averaged = (widened[0] + widened[1]) // 2
    limits = np.iinfo(y.dtype)
    return np.clip(averaged, limits.min, limits.max).astype(y.dtype)
def
_safe_cast
(
y
:
array
,
dtype
:
Union
[
type
,
str
])
->
array
:
""" data type casting in a safe way, i.e., prevent overflow or underflow
This function is used internally.
"""
return
np
.
clip
(
y
,
np
.
iinfo
(
dtype
).
min
,
np
.
iinfo
(
dtype
).
max
).
astype
(
dtype
)
def depth_convert(y: array, dtype: Union[type, str],
                  dithering: bool = True) -> array:
    """Convert an audio array to a target dtype while preserving its range.

    Conversions performed:
        * float -> float: plain cast.
        * float -> int:   multiply by the integer type's max, clip, cast.
        * int -> float:   cast, then divide by the source integer type's max.
        * int8 <-> int16: rescale by the ratio of the two integer maxima.

    Parameters:
        y: the waveform; dtype must be int16, int8, float32 or float64.
        dtype: the target dtype, given as one of those names or as the
            equivalent type object (e.g. ``np.int16``).
        dithering: accepted for interface compatibility; not used by the
            current implementation.

    Returns:
        ``y`` itself when it already has the target dtype, otherwise a new
        array of the target dtype.

    Raises:
        ParameterError: if the source or target dtype is unsupported.
    """
    SUPPORT_DTYPE = ['int16', 'int8', 'float32', 'float64']
    FLOAT_DTYPE = ('float32', 'float64')

    # The annotation allows type objects, but a type never compares equal to
    # the names in SUPPORT_DTYPE; normalise it to its dtype name first.
    if isinstance(dtype, (type, np.dtype)):
        try:
            dtype = np.dtype(dtype).name
        except TypeError:
            pass  # leave it as-is; rejected by the check below

    if y.dtype not in SUPPORT_DTYPE:
        raise ParameterError(
            'Unsupported audio dtype, '
            f'y.dtype is {y.dtype}, supported dtypes are {SUPPORT_DTYPE}')

    if dtype not in SUPPORT_DTYPE:
        raise ParameterError(
            'Unsupported audio dtype, '
            f'target dtype is {dtype}, supported dtypes are {SUPPORT_DTYPE}')

    if y.dtype == dtype:
        return y

    source_is_float = y.dtype in FLOAT_DTYPE

    if dtype in FLOAT_DTYPE:
        if source_is_float:
            # float32 <-> float64. The previous code routed this through an
            # np.iinfo-based helper, which raises for float types.
            return y.astype(dtype)
        # int -> float: scale by the source integer type's max.
        return y.astype(dtype) / np.iinfo(y.dtype).max

    # Target is int16 or int8 from here on.
    if source_is_float:
        limits = np.iinfo(dtype)
        return np.clip(y * limits.max, limits.min, limits.max).astype(dtype)

    if dtype == 'int16':
        # int8 -> int16; EPS keeps the scaled value strictly below the
        # exact ratio before truncation.
        factor = np.iinfo('int16').max / np.iinfo('int8').max - EPS
        return (y.astype('float32') * factor).astype('int16')

    # int16 -> int8
    scaled = y.astype('int32') * np.iinfo('int8').max / np.iinfo('int16').max
    return scaled.astype('int8')
def sound_file_load(file: str,
                    offset: Optional[float] = None,
                    dtype: str = 'int16',
                    duration: Optional[int] = None) -> Tuple[array, int]:
    """Read (part of) an audio file with the soundfile library (libsndfile).

    Parameters:
        file: path of the audio file, handed to ``sf.SoundFile``.
        offset: where to start reading, multiplied by the file's sample rate
            to obtain a frame index; a falsy value starts at the beginning.
        dtype: sample dtype requested from soundfile.
        duration: how much to read, multiplied by the file's sample rate to
            obtain a frame count; ``None`` requests ``frames=-1``.

    Returns:
        A tuple ``(y, sr)``: the transposed array returned by soundfile and
        the file's native sample rate.

    Reference:
        http://www.mega-nerd.com/libsndfile/#Features
    """
    with sf.SoundFile(file) as handle:
        native_sr = handle.samplerate

        if offset:
            # Position the read pointer at the requested start frame.
            handle.seek(int(offset * native_sr))

        n_frames = -1 if duration is None else int(duration * native_sr)

        samples = handle.read(frames=n_frames, dtype=dtype, always_2d=False)
        waveform = samples.T

    return waveform, handle.samplerate
def normalize(y: array, norm_type: str = 'linear',
              mul_factor: float = 1.0) -> array:
    """Normalize a waveform and scale the result by ``mul_factor``.

    Parameters:
        y: the waveform.
        norm_type: 'linear' divides by the peak absolute value (plus ``EPS``);
            'gaussian' subtracts the mean and divides by the standard
            deviation (floored at ``EPS``).
        mul_factor: multiplier applied to the normalized signal.

    Returns:
        The normalized waveform.

    Raises:
        NotImplementedError: if ``norm_type`` is neither of the above.
    """
    if norm_type == 'linear':
        peak = np.max(np.abs(y))
        scale = 1.0 / (peak + EPS)
        return y * scale * mul_factor

    if norm_type == 'gaussian':
        mean = np.mean(y)
        # Floor the deviation so a constant signal does not divide by zero.
        deviation = max(np.std(y), EPS)
        return mul_factor * (y - mean) / deviation

    raise NotImplementedError(
        f'norm_type should be in {NORMALMIZE_TYPES}')
def save(y: array, sr: int, file: str) -> None:
    """Save a waveform to disk as a 16-bit wav file.

    The data is written with ``scipy.io.wavfile.write``. Anything that is not
    already int16 is first converted with ``depth_convert(y, 'int16')``.

    Parameters:
        y: the waveform to save.
        sr: the sample rate; must be positive.
        file: destination path; must end with '.wav'.

    Raises:
        ParameterError: if ``file`` does not end with '.wav' or ``sr <= 0``.

    Notes:
        It only support raw wav format.
    """
    if not file.endswith('.wav'):
        raise ParameterError(
            f'only .wav file supported, but dst file name is: {file}')

    if sr <= 0:
        raise ParameterError(
            f'Sample rate should be larger than 0, recieved sr = {sr}')

    # int8 used to be written as-is, but 8-bit PCM in a wav file is unsigned
    # (scipy documents uint8 for it), so signed int8 samples would be stored
    # with the wrong interpretation. Convert everything except int16.
    if y.dtype != 'int16':
        warnings.warn(
            f'input data type is {y.dtype}, will convert data to int16 format before saving'
        )
        y_out = depth_convert(y, 'int16')
    else:
        y_out = y

    # NOTE(review): y is written in whatever layout it is given; confirm that
    # callers pass multi-channel audio in the layout wavfile.write expects.
    wavfile.write(file, sr, y_out)
def load(
        file: str,
        sr: Optional[int] = None,
        mono: bool = True,
        merge_type: str = 'average',  # ch0,ch1,random,average
        normal: bool = True,
        norm_type: str = 'linear',
        norm_mul_factor: float = 1.0,
        offset: float = 0.0,
        duration: Optional[int] = None,
        dtype: str = 'float32',
        resample_mode: str = 'kaiser_fast') -> Tuple[array, int]:
    """Load an audio file from disk through the soundfile backend.

    The steps are, in order: read with ``sound_file_load``, optionally merge
    to mono, optionally resample, normalize, and convert to ``dtype``.

    Parameters:
        file: path of the audio file.
        sr: target sample rate; ``None`` keeps the file's own rate.
        mono: if True, pass the data through ``to_mono``.
        merge_type: channel-merging strategy forwarded to ``to_mono``.
        normal: if True, apply ``normalize(y, norm_type, norm_mul_factor)``.
        norm_type: normalisation scheme used when ``normal`` is True.
        norm_mul_factor: multiplier used when ``normal`` is True.
        offset: start position forwarded to ``sound_file_load``.
        duration: length to read, forwarded to ``sound_file_load``.
        dtype: dtype requested when reading and of the returned waveform.
        resample_mode: filter name forwarded to ``resample``.

    Returns:
        A tuple ``(y, sr)`` of the waveform and its sample rate.

    Raises:
        ParameterError: if no samples could be read from the file.

    Notes:
        When ``normal`` is False and ``dtype`` is 'int8' or 'int16', a linear
        normalisation with factor 1.0 is still applied before the depth
        conversion.
    """
    waveform, rate = sound_file_load(
        file, offset=offset, dtype=dtype, duration=duration)

    has_samples = (waveform.ndim == 1 and len(waveform) > 0) or (
        waveform.ndim == 2 and len(waveform[0]) > 0)
    if not has_samples:
        raise ParameterError(f'audio file {file} looks empty')

    if mono:
        waveform = to_mono(waveform, merge_type)

    if sr is not None and sr != rate:
        waveform = resample(waveform, rate, sr, mode=resample_mode)
        rate = sr

    if normal:
        waveform = normalize(waveform, norm_type, norm_mul_factor)
    elif dtype in ['int8', 'int16']:
        # still need to do normalization, before depth conversion
        waveform = normalize(waveform, 'linear', 1.0)

    waveform = depth_convert(waveform, dtype)
    return waveform, rate
paddleaudio/paddleaudio/
kaldi
/__init__.py
→
paddleaudio/paddleaudio/
compliance
/__init__.py
浏览文件 @
25cb4bb0
文件已移动
paddleaudio/paddleaudio/compliance/kaldi.py
0 → 100644
浏览文件 @
25cb4bb0
此差异已折叠。
点击以展开。
paddleaudio/paddleaudio/compliance/librosa.py
0 → 100644
浏览文件 @
25cb4bb0
此差异已折叠。
点击以展开。
paddleaudio/paddleaudio/datasets/__init__.py
浏览文件 @
25cb4bb0
...
...
@@ -15,10 +15,3 @@ from .esc50 import ESC50
from
.gtzan
import
GTZAN
from
.tess
import
TESS
from
.urban_sound
import
UrbanSound8K
__all__
=
[
'ESC50'
,
'UrbanSound8K'
,
'GTZAN'
,
'TESS'
,
]
paddleaudio/paddleaudio/datasets/dataset.py
浏览文件 @
25cb4bb0
...
...
@@ -17,8 +17,8 @@ import numpy as np
import
paddle
from
..backends
import
load
as
load_audio
from
..
features
import
melspectrogram
from
..
features
import
mfcc
from
..
compliance.librosa
import
melspectrogram
from
..
compliance.librosa
import
mfcc
feat_funcs
=
{
'raw'
:
None
,
...
...
paddleaudio/paddleaudio/features/__init__.py
浏览文件 @
25cb4bb0
...
...
@@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
.librosa
import
LogMelSpectrogram
from
.librosa
import
MelSpectrogram
from
.librosa
import
Spectrogram
from
.layers
import
LogMelSpectrogram
from
.layers
import
MelSpectrogram
from
.layers
import
MFCC
from
.layers
import
Spectrogram
paddleaudio/paddleaudio/features/l
ibrosa
.py
→
paddleaudio/paddleaudio/features/l
ayers
.py
浏览文件 @
25cb4bb0
...
...
@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
math
from
functools
import
partial
from
typing
import
Optional
from
typing
import
Union
...
...
@@ -19,225 +18,19 @@ from typing import Union
import
paddle
import
paddle.nn
as
nn
from
..functional
import
compute_fbank_matrix
from
..functional
import
create_dct
from
..functional
import
power_to_db
from
..functional.window
import
get_window
__all__
=
[
'Spectrogram'
,
'MelSpectrogram'
,
'LogMelSpectrogram'
,
'MFCC'
,
]
def hz_to_mel(freq: Union[paddle.Tensor, float],
              htk: bool = False) -> Union[paddle.Tensor, float]:
    """Convert frequencies in Hz to the Mel scale.

    Parameters:
        freq: a tensor of arbitrary shape, or a single floating point number.
        htk: if True use the HTK formula ``2595 * log10(1 + f / 700)``;
            otherwise use a scale that is linear below 1000 Hz and
            logarithmic above. The default value is False.

    Returns:
        The frequencies represented in Mel-scale, as a tensor or a float
        matching the input.
    """
    is_tensor = isinstance(freq, paddle.Tensor)

    if htk:
        if is_tensor:
            return 2595.0 * paddle.log10(1.0 + freq / 700.0)
        return 2595.0 * math.log10(1.0 + freq / 700.0)

    # Linear region.
    f_min = 0.0
    f_sp = 200.0 / 3
    linear_mels = (freq - f_min) / f_sp

    # Log region.
    log_start_hz = 1000.0  # beginning of log region (Hz)
    log_start_mel = (log_start_hz - f_min) / f_sp  # same (Mels)
    log_step = math.log(6.4) / 27.0  # step size for log region

    if not is_tensor:
        if freq >= log_start_hz:
            return log_start_mel + math.log(
                freq / log_start_hz + 1e-10) / log_step
        return linear_mels

    # prevent nan with 1e-10
    log_mels = log_start_mel + paddle.log(
        freq / log_start_hz + 1e-10) / log_step
    # Blend the two regions with a 0/1 mask (stands in for a masked fill).
    in_log_region = (freq > log_start_hz).astype(freq.dtype)
    return log_mels * in_log_region + linear_mels * (1 - in_log_region)
def mel_to_hz(mel: Union[float, paddle.Tensor],
              htk: bool = False) -> Union[float, paddle.Tensor]:
    """Convert Mel-scale values to frequencies in Hz.

    Parameters:
        mel: the mel values as a tensor of arbitrary shape, or a floating
            point number.
        htk: if True use the HTK formula ``700 * (10 ** (mel / 2595) - 1)``;
            otherwise invert the linear-then-logarithmic scale.

    Returns:
        The frequencies represented in hz, as a tensor or a float matching
        the input.
    """
    if htk:
        return 700.0 * (10.0**(mel / 2595.0) - 1.0)

    # Linear region.
    f_min = 0.0
    f_sp = 200.0 / 3
    linear_hz = f_min + f_sp * mel

    # Log region.
    log_start_hz = 1000.0  # beginning of log region (Hz)
    log_start_mel = (log_start_hz - f_min) / f_sp  # same (Mels)
    log_step = math.log(6.4) / 27.0  # step size for log region

    if not isinstance(mel, paddle.Tensor):
        if mel >= log_start_mel:
            return log_start_hz * math.exp(log_step * (mel - log_start_mel))
        return linear_hz

    log_hz = log_start_hz * paddle.exp(log_step * (mel - log_start_mel))
    # Blend the two regions with a 0/1 mask (stands in for a masked fill).
    in_log_region = (mel > log_start_mel).astype(mel.dtype)
    return log_hz * in_log_region + linear_hz * (1 - in_log_region)
def mel_frequencies(n_mels: int = 64,
                    f_min: float = 0.0,
                    f_max: float = 11025.0,
                    htk: bool = False,
                    dtype: str = paddle.float32):
    """Compute frequencies that are evenly spaced on the Mel scale.

    Parameters:
        n_mels(int): number of points to generate.
        f_min(float): the lowest frequency, in Hz.
        f_max(float): the highest frequency, in Hz.
        htk(bool): whether to use the HTK formula for the conversions.
        dtype(str): the datatype of the returned frequencies.

    Returns:
        A tensor of ``n_mels`` values: ``f_min`` and ``f_max`` are mapped to
        Mel, interpolated linearly there, and mapped back with ``mel_to_hz``.
    """
    # 'Center freqs' of mel bands - uniformly spaced between limits
    lowest_mel = hz_to_mel(f_min, htk=htk)
    highest_mel = hz_to_mel(f_max, htk=htk)
    mel_points = paddle.linspace(lowest_mel, highest_mel, n_mels, dtype=dtype)
    return mel_to_hz(mel_points, htk=htk)
def fft_frequencies(sr: int, n_fft: int, dtype: str = paddle.float32):
    """Compute the frequency of each FFT bin.

    Parameters:
        sr(int): the audio sample rate.
        n_fft(int): the FFT size.
        dtype(str): the datatype of the returned frequencies.

    Returns:
        A tensor of ``1 + n_fft // 2`` values evenly spaced from 0 to
        ``sr / 2``, in hz.
    """
    highest_hz = float(sr) / 2
    n_bins = int(1 + n_fft // 2)
    return paddle.linspace(0, highest_hz, n_bins, dtype=dtype)
def compute_fbank_matrix(sr: int,
                         n_fft: int,
                         n_mels: int = 64,
                         f_min: float = 0.0,
                         f_max: Optional[float] = None,
                         htk: bool = False,
                         norm: Union[str, float] = 'slaney',
                         dtype: str = paddle.float32):
    """Compute a triangular Mel filter-bank matrix.

    Parameters:
        sr(int): the audio sample rate.
        n_fft(int): the FFT size.
        n_mels(int): the number of Mel bins.
        f_min(float): the lower cut-off frequency, below which the filter
            response is zero.
        f_max(float): the upper cut-off frequency, above which the filter
            response is zero. Defaults to ``sr / 2`` when None.
        htk(bool): whether to use the HTK formula for the Mel scale.
        norm: 'slaney' scales every filter by ``2 / (f[i+2] - f[i])``; an int
            or float applies ``paddle.nn.functional.normalize`` with that
            ``p`` along the last axis; any other value leaves the filters
            unscaled.
        dtype(str): the datatype of the returned fbank matrix.

    Returns:
        The fbank matrix of shape (n_mels, int(1+n_fft//2)).
    """
    if f_max is None:
        f_max = float(sr) / 2

    n_bins = int(1 + n_fft // 2)
    fbank = paddle.zeros((n_mels, n_bins), dtype=dtype)

    # Center freqs of each FFT bin
    bin_freqs = fft_frequencies(sr=sr, n_fft=n_fft, dtype=dtype)

    # 'Center freqs' of mel bands - uniformly spaced between limits
    band_edges = mel_frequencies(
        n_mels + 2, f_min=f_min, f_max=f_max, htk=htk, dtype=dtype)

    # Distance between neighbouring band edges (np.diff equivalent).
    edge_gaps = band_edges[1:] - band_edges[:-1]
    # Outer difference: band edge minus every FFT bin frequency.
    slopes = band_edges.unsqueeze(1) - bin_freqs.unsqueeze(0)

    for band in range(n_mels):
        # lower and upper slopes for all bins
        rising = -slopes[band] / edge_gaps[band]
        falling = slopes[band + 2] / edge_gaps[band + 1]

        # .. then intersect them with each other and zero
        fbank[band] = paddle.maximum(
            paddle.zeros_like(rising), paddle.minimum(rising, falling))

    if norm == 'slaney':
        # Slaney-style mel is scaled to be approx constant energy per channel
        band_scale = 2.0 / (band_edges[2:n_mels + 2] - band_edges[:n_mels])
        fbank *= band_scale.unsqueeze(1)
    elif isinstance(norm, (int, float)):
        fbank = paddle.nn.functional.normalize(fbank, p=norm, axis=-1)

    return fbank
def power_to_db(magnitude: paddle.Tensor,
                ref_value: float = 1.0,
                amin: float = 1e-10,
                top_db: Optional[float] = None) -> paddle.Tensor:
    """Convert a power spectrogram (amplitude squared) to decibel (dB) units.

    Computes ``10 * log10(max(magnitude, amin)) - 10 * log10(max(ref_value, amin))``.

    Parameters:
        magnitude(Tensor): the input tensor of any shape.
        ref_value(float): the reference value whose dB level is subtracted
            from the result; must be strictly positive.
        amin(float): the minimum value of input magnitude, below which the
            input is clipped (to amin); must be strictly positive.
        top_db(float): if given, values lower than ``max - top_db`` are raised
            to that level, where ``max`` is the largest value of the
            log-scaled result; must be non-negative.

    Returns:
        The spectrogram in log-scale, same shape as the input.

    Raises:
        Exception: if ``amin`` or ``ref_value`` is not strictly positive, or
            ``top_db`` is negative.
    """
    if amin <= 0:
        raise Exception("amin must be strictly positive")

    if ref_value <= 0:
        raise Exception("ref_value must be strictly positive")

    ones = paddle.ones_like(magnitude)
    floored = paddle.maximum(ones * amin, magnitude)
    db_spec = 10.0 * paddle.log10(floored)
    db_spec -= 10.0 * math.log10(max(ref_value, amin))

    if top_db is None:
        return db_spec

    if top_db < 0:
        raise Exception("top_db must be non-negative")
    lowest_allowed = ones * (db_spec.max() - top_db)
    return paddle.maximum(db_spec, lowest_allowed)
class
Spectrogram
(
nn
.
Layer
):
def
__init__
(
self
,
n_fft
:
int
=
512
,
...
...
@@ -459,3 +252,29 @@ class LogMelSpectrogram(nn.Layer):
amin
=
self
.
amin
,
top_db
=
self
.
top_db
)
return
log_mel_feature
class MFCC(nn.Layer):
    """Layer that multiplies a log-mel spectrogram by a DCT matrix."""

    def __init__(self,
                 sr: int = 22050,
                 n_mfcc: int = 40,
                 norm: str = 'ortho',
                 **kwargs):
        """Build the log-mel front end and the DCT matrix.

        Parameters:
            sr (int, optional): sample rate forwarded to
                ``LogMelSpectrogram``. Defaults to 22050.
            n_mfcc (int, optional): forwarded to ``create_dct`` as ``n_mfcc``.
                Defaults to 40.
            norm (str, optional): forwarded to ``create_dct`` as ``norm``.
                Defaults to 'ortho'.
            **kwargs: remaining options forwarded to ``LogMelSpectrogram``.
        """
        super().__init__()
        self._log_melspectrogram = LogMelSpectrogram(sr=sr, **kwargs)
        mel_bins = self._log_melspectrogram.n_mels
        self.dct_matrix = create_dct(
            n_mfcc=n_mfcc, n_mels=mel_bins, norm=norm)
        self.register_buffer('dct_matrix', self.dct_matrix)

    def forward(self, x):
        """Compute the log-mel feature of ``x`` and project it with the DCT.

        The last two axes of the log-mel feature are swapped before the
        matrix product and swapped back afterwards.
        """
        log_mel = self._log_melspectrogram(x)
        # NOTE(review): presumably (B, n_mels, L) -> (B, L, n_mels) here, so
        # the result is (B, n_mfcc, L) — confirm against create_dct's shape.
        time_major = log_mel.transpose((0, 2, 1))
        projected = paddle.matmul(time_major, self.dct_matrix)
        return projected.transpose((0, 2, 1))
paddleaudio/paddleaudio/functional/__init__.py
浏览文件 @
25cb4bb0
...
...
@@ -11,3 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
.functional
import
compute_fbank_matrix
from
.functional
import
create_dct
from
.functional
import
fft_frequencies
from
.functional
import
hz_to_mel
from
.functional
import
mel_frequencies
from
.functional
import
mel_to_hz
from
.functional
import
power_to_db
paddleaudio/paddleaudio/functional/functional.py
浏览文件 @
25cb4bb0
此差异已折叠。
点击以展开。
paddleaudio/paddleaudio/io/__init__.py
浏览文件 @
25cb4bb0
# Copyright (c) 202
2
PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 202
1
PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
...
...
@@ -11,9 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
.audio
import
depth_convert
from
.audio
import
load
from
.audio
import
normalize
from
.audio
import
resample
from
.audio
import
save_wav
from
.audio
import
to_mono
paddleaudio/paddleaudio/io/audio.py
已删除
100644 → 0
浏览文件 @
39e5a81a
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
warnings
from
typing
import
Optional
from
typing
import
Tuple
from
typing
import
Union
import
numpy
as
np
import
resampy
import
soundfile
as
sf
from
numpy
import
ndarray
as
array
from
scipy.io
import
wavfile
from
..utils
import
ParameterError
__all__
=
[
'resample'
,
'to_mono'
,
'depth_convert'
,
'normalize'
,
'save_wav'
,
'load'
,
]
# Normalisation schemes understood by ``normalize``.
NORMALMIZE_TYPES = [
    'linear',
    'gaussian',
]

# Channel-merging strategies understood by ``to_mono``.
MERGE_TYPES = [
    'ch0',
    'ch1',
    'random',
    'average',
]

# resampy filter names accepted by ``resample``.
RESAMPLE_MODES = [
    'kaiser_best',
    'kaiser_fast',
]

# Small positive constant used to keep divisions away from zero.
EPS = 1e-8
def resample(y: array, src_sr: int, target_sr: int,
             mode: str = 'kaiser_fast') -> array:
    """Resample a waveform from ``src_sr`` to ``target_sr``.

    This is a thin, argument-checking wrapper around ``resampy.resample``.

    Parameters:
        y: the waveform, as a numpy array.
        src_sr: sample rate of ``y``.
        target_sr: sample rate to convert to.
        mode: resampy filter name; must be one of ``RESAMPLE_MODES``.

    Returns:
        Whatever ``resampy.resample(y, src_sr, target_sr, filter=mode)`` returns.

    Raises:
        ParameterError: if ``y`` is not a numpy array or ``mode`` is not in
            ``RESAMPLE_MODES``.

    Notes:
        The default mode is kaiser_fast. For better audio quality, use
        mode = 'kaiser_best' (a warning is issued because it is slow).
    """
    if mode == 'kaiser_best':
        warnings.warn(
            f'Using resampy in kaiser_best to {src_sr}=>{target_sr}. '
            'This function is pretty slow, '
            'we recommend the mode kaiser_fast in large scale audio trainning')

    if not isinstance(y, np.ndarray):
        # The original literal lacked the ``f`` prefix, so the offending type
        # was never shown in the message.
        raise ParameterError(
            f'Only support numpy array, but received y in {type(y)}')

    if mode not in RESAMPLE_MODES:
        raise ParameterError(f'resample mode must in {RESAMPLE_MODES}')

    return resampy.resample(y, src_sr, target_sr, filter=mode)
def to_mono(y: array, merge_type: str = 'average') -> array:
    """Convert a stereo waveform to mono.

    Channels are taken along the first axis of ``y`` (``y[0]`` and ``y[1]``).

    Parameters:
        y: 1-D (already mono) or 2-D waveform.
        merge_type: one of ``MERGE_TYPES``:
            'ch0' / 'ch1' return that channel, 'random' returns one of the
            first two channels picked with ``np.random.randint``, and
            'average' returns the mean of the first two channels.

    Returns:
        ``y`` itself when it is 1-D, otherwise the merged 1-D channel.

    Raises:
        ParameterError: on an unknown ``merge_type``, on ``y.ndim > 2``, or
            when averaging a dtype other than float32/float64/int16/int8.
    """
    if merge_type not in MERGE_TYPES:
        raise ParameterError(
            f'Unsupported merge type {merge_type}, available types are {MERGE_TYPES}'
        )
    if y.ndim > 2:
        raise ParameterError(
            f'Unsupported audio array, y.ndim > 2, the shape is {y.shape}')
    if y.ndim == 1:  # nothing to merge
        return y

    if merge_type == 'ch0':
        return y[0]
    if merge_type == 'ch1':
        return y[1]
    if merge_type == 'random':
        return y[np.random.randint(0, 2)]

    # merge_type == 'average': the arithmetic depends on the sample dtype.
    if y.dtype in ('float32', 'float64'):
        # float64 used to be rejected although the float formula is the same.
        return (y[0] + y[1]) * 0.5

    # Integer samples are widened first so the sum of two channels cannot
    # overflow, then narrowed back to the input dtype.
    wider_dtype = {'int16': 'int32', 'int8': 'int16'}.get(y.dtype.name)
    if wider_dtype is None:
        raise ParameterError(f'Unsupported dtype: {y.dtype}')
    widened = y.astype(wider_dtype)
    averaged = (widened[0] + widened[1]) // 2
    limits = np.iinfo(y.dtype)
    return np.clip(averaged, limits.min, limits.max).astype(y.dtype)
def
_safe_cast
(
y
:
array
,
dtype
:
Union
[
type
,
str
])
->
array
:
""" data type casting in a safe way, i.e., prevent overflow or underflow
This function is used internally.
"""
return
np
.
clip
(
y
,
np
.
iinfo
(
dtype
).
min
,
np
.
iinfo
(
dtype
).
max
).
astype
(
dtype
)
def depth_convert(y: array, dtype: Union[type, str],
                  dithering: bool = True) -> array:
    """Convert an audio array to a target dtype while preserving its range.

    Conversions performed:
        * float -> float: plain cast.
        * float -> int:   multiply by the integer type's max, clip, cast.
        * int -> float:   cast, then divide by the source integer type's max.
        * int8 <-> int16: rescale by the ratio of the two integer maxima.

    Parameters:
        y: the waveform; dtype must be int16, int8, float32 or float64.
        dtype: the target dtype, given as one of those names or as the
            equivalent type object (e.g. ``np.int16``).
        dithering: accepted for interface compatibility; not used by the
            current implementation.

    Returns:
        ``y`` itself when it already has the target dtype, otherwise a new
        array of the target dtype.

    Raises:
        ParameterError: if the source or target dtype is unsupported.
    """
    SUPPORT_DTYPE = ['int16', 'int8', 'float32', 'float64']
    FLOAT_DTYPE = ('float32', 'float64')

    # The annotation allows type objects, but a type never compares equal to
    # the names in SUPPORT_DTYPE; normalise it to its dtype name first.
    if isinstance(dtype, (type, np.dtype)):
        try:
            dtype = np.dtype(dtype).name
        except TypeError:
            pass  # leave it as-is; rejected by the check below

    if y.dtype not in SUPPORT_DTYPE:
        raise ParameterError(
            'Unsupported audio dtype, '
            f'y.dtype is {y.dtype}, supported dtypes are {SUPPORT_DTYPE}')

    if dtype not in SUPPORT_DTYPE:
        raise ParameterError(
            'Unsupported audio dtype, '
            f'target dtype is {dtype}, supported dtypes are {SUPPORT_DTYPE}')

    if y.dtype == dtype:
        return y

    source_is_float = y.dtype in FLOAT_DTYPE

    if dtype in FLOAT_DTYPE:
        if source_is_float:
            # float32 <-> float64. The previous code routed this through an
            # np.iinfo-based helper, which raises for float types.
            return y.astype(dtype)
        # int -> float: scale by the source integer type's max.
        return y.astype(dtype) / np.iinfo(y.dtype).max

    # Target is int16 or int8 from here on.
    if source_is_float:
        limits = np.iinfo(dtype)
        return np.clip(y * limits.max, limits.min, limits.max).astype(dtype)

    if dtype == 'int16':
        # int8 -> int16; EPS keeps the scaled value strictly below the
        # exact ratio before truncation.
        factor = np.iinfo('int16').max / np.iinfo('int8').max - EPS
        return (y.astype('float32') * factor).astype('int16')

    # int16 -> int8
    scaled = y.astype('int32') * np.iinfo('int8').max / np.iinfo('int16').max
    return scaled.astype('int8')
def sound_file_load(file: str,
                    offset: Optional[float] = None,
                    dtype: str = 'int16',
                    duration: Optional[int] = None) -> Tuple[array, int]:
    """Read (part of) an audio file with the soundfile library (libsndfile).

    Parameters:
        file: path of the audio file, handed to ``sf.SoundFile``.
        offset: where to start reading, multiplied by the file's sample rate
            to obtain a frame index; a falsy value starts at the beginning.
        dtype: sample dtype requested from soundfile.
        duration: how much to read, multiplied by the file's sample rate to
            obtain a frame count; ``None`` requests ``frames=-1``.

    Returns:
        A tuple ``(y, sr)``: the transposed array returned by soundfile and
        the file's native sample rate.

    Reference:
        http://www.mega-nerd.com/libsndfile/#Features
    """
    with sf.SoundFile(file) as handle:
        native_sr = handle.samplerate

        if offset:
            # Position the read pointer at the requested start frame.
            handle.seek(int(offset * native_sr))

        n_frames = -1 if duration is None else int(duration * native_sr)

        samples = handle.read(frames=n_frames, dtype=dtype, always_2d=False)
        waveform = samples.T

    return waveform, handle.samplerate
def audio_file_load():
    """Placeholder for loading audio with the audiofile library.

    Not implemented: calling it always raises ``NotImplementedError``.

    Reference:
        https://audiofile.68k.org/
    """
    raise NotImplementedError
def sox_file_load():
    """Placeholder for loading audio with the sox library.

    Not implemented: calling it always raises ``NotImplementedError``.

    Reference:
        http://sox.sourceforge.net/
    """
    raise NotImplementedError
def normalize(y: array, norm_type: str = 'linear',
              mul_factor: float = 1.0) -> array:
    """Normalize a waveform and scale the result by ``mul_factor``.

    Parameters:
        y: the waveform.
        norm_type: 'linear' divides by the peak absolute value (plus ``EPS``);
            'gaussian' subtracts the mean and divides by the standard
            deviation (floored at ``EPS``).
        mul_factor: multiplier applied to the normalized signal.

    Returns:
        The normalized waveform.

    Raises:
        NotImplementedError: if ``norm_type`` is neither of the above.
    """
    if norm_type == 'linear':
        peak = np.max(np.abs(y))
        scale = 1.0 / (peak + EPS)
        return y * scale * mul_factor

    if norm_type == 'gaussian':
        mean = np.mean(y)
        # Floor the deviation so a constant signal does not divide by zero.
        deviation = max(np.std(y), EPS)
        return mul_factor * (y - mean) / deviation

    raise NotImplementedError(
        f'norm_type should be in {NORMALMIZE_TYPES}')
def save_wav(y: array, sr: int, file: str) -> None:
    """Save a waveform to disk as a 16-bit wav file.

    The data is written with ``scipy.io.wavfile.write``. Anything that is not
    already int16 is first converted with ``depth_convert(y, 'int16')``.

    Parameters:
        y: the waveform to save.
        sr: the sample rate; must be positive.
        file: destination path; must end with '.wav'.

    Raises:
        ParameterError: if ``file`` does not end with '.wav' or ``sr <= 0``.

    Notes:
        It only support raw wav format.
    """
    if not file.endswith('.wav'):
        raise ParameterError(
            f'only .wav file supported, but dst file name is: {file}')

    if sr <= 0:
        raise ParameterError(
            f'Sample rate should be larger than 0, recieved sr = {sr}')

    # int8 used to be written as-is, but 8-bit PCM in a wav file is unsigned
    # (scipy documents uint8 for it), so signed int8 samples would be stored
    # with the wrong interpretation. Convert everything except int16.
    if y.dtype != 'int16':
        warnings.warn(
            f'input data type is {y.dtype}, will convert data to int16 format before saving'
        )
        y_out = depth_convert(y, 'int16')
    else:
        y_out = y

    # NOTE(review): y is written in whatever layout it is given; confirm that
    # callers pass multi-channel audio in the layout wavfile.write expects.
    wavfile.write(file, sr, y_out)
def load(
        file: str,
        sr: Optional[int] = None,
        mono: bool = True,
        merge_type: str = 'average',  # ch0,ch1,random,average
        normal: bool = True,
        norm_type: str = 'linear',
        norm_mul_factor: float = 1.0,
        offset: float = 0.0,
        duration: Optional[int] = None,
        dtype: str = 'float32',
        resample_mode: str = 'kaiser_fast') -> Tuple[array, int]:
    """Load an audio file from disk through the soundfile backend.

    The steps are, in order: read with ``sound_file_load``, optionally merge
    to mono, optionally resample, normalize, and convert to ``dtype``.

    Parameters:
        file: path of the audio file.
        sr: target sample rate; ``None`` keeps the file's own rate.
        mono: if True, pass the data through ``to_mono``.
        merge_type: channel-merging strategy forwarded to ``to_mono``.
        normal: if True, apply ``normalize(y, norm_type, norm_mul_factor)``.
        norm_type: normalisation scheme used when ``normal`` is True.
        norm_mul_factor: multiplier used when ``normal`` is True.
        offset: start position forwarded to ``sound_file_load``.
        duration: length to read, forwarded to ``sound_file_load``.
        dtype: dtype requested when reading and of the returned waveform.
        resample_mode: filter name forwarded to ``resample``.

    Returns:
        A tuple ``(y, sr)`` of the waveform and its sample rate.

    Raises:
        ParameterError: if no samples could be read from the file.

    Notes:
        When ``normal`` is False and ``dtype`` is 'int8' or 'int16', a linear
        normalisation with factor 1.0 is still applied before the depth
        conversion.
    """
    waveform, rate = sound_file_load(
        file, offset=offset, dtype=dtype, duration=duration)

    has_samples = (waveform.ndim == 1 and len(waveform) > 0) or (
        waveform.ndim == 2 and len(waveform[0]) > 0)
    if not has_samples:
        raise ParameterError(f'audio file {file} looks empty')

    if mono:
        waveform = to_mono(waveform, merge_type)

    if sr is not None and sr != rate:
        waveform = resample(waveform, rate, sr, mode=resample_mode)
        rate = sr

    if normal:
        waveform = normalize(waveform, norm_type, norm_mul_factor)
    elif dtype in ['int8', 'int16']:
        # still need to do normalization, before depth conversion
        waveform = normalize(waveform, 'linear', 1.0)

    waveform = depth_convert(waveform, dtype)
    return waveform, rate
paddleaudio/paddleaudio/metric/mcd.py
浏览文件 @
25cb4bb0
...
...
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import
mcd.metrics_fast
as
mt
import
numpy
as
np
from
mcd
import
dtw
__all__
=
[
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录