Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
d0bca198
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 2 年 前同步成功
通知
210
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
d0bca198
编写于
3月 04, 2022
作者:
H
Hui Zhang
提交者:
GitHub
3月 04, 2022
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #1494 from PaddlePaddle/audio
[audio] refactor audio arch
上级
2886ab93
d70bcb8f
变更
34
展开全部
隐藏空白更改
内联
并排
Showing
34 changed file
with
1673 addition
and
254 deletion
+1673
-254
.gitignore
.gitignore
+2
-3
paddleaudio/CHANGELOG.md
paddleaudio/CHANGELOG.md
+4
-0
paddleaudio/features/augment.py
paddleaudio/features/augment.py
+0
-170
paddleaudio/paddleaudio/__init__.py
paddleaudio/paddleaudio/__init__.py
+22
-0
paddleaudio/paddleaudio/backends/__init__.py
paddleaudio/paddleaudio/backends/__init__.py
+19
-0
paddleaudio/paddleaudio/backends/soundfile_backend.py
paddleaudio/paddleaudio/backends/soundfile_backend.py
+3
-41
paddleaudio/paddleaudio/backends/sox_backend.py
paddleaudio/paddleaudio/backends/sox_backend.py
+13
-0
paddleaudio/paddleaudio/compliance/__init__.py
paddleaudio/paddleaudio/compliance/__init__.py
+1
-3
paddleaudio/paddleaudio/compliance/kaldi.py
paddleaudio/paddleaudio/compliance/kaldi.py
+638
-0
paddleaudio/paddleaudio/compliance/librosa.py
paddleaudio/paddleaudio/compliance/librosa.py
+152
-2
paddleaudio/paddleaudio/datasets/__init__.py
paddleaudio/paddleaudio/datasets/__init__.py
+0
-7
paddleaudio/paddleaudio/datasets/dataset.py
paddleaudio/paddleaudio/datasets/dataset.py
+2
-2
paddleaudio/paddleaudio/datasets/esc50.py
paddleaudio/paddleaudio/datasets/esc50.py
+0
-0
paddleaudio/paddleaudio/datasets/gtzan.py
paddleaudio/paddleaudio/datasets/gtzan.py
+0
-0
paddleaudio/paddleaudio/datasets/tess.py
paddleaudio/paddleaudio/datasets/tess.py
+0
-0
paddleaudio/paddleaudio/datasets/urban_sound.py
paddleaudio/paddleaudio/datasets/urban_sound.py
+0
-0
paddleaudio/paddleaudio/features/__init__.py
paddleaudio/paddleaudio/features/__init__.py
+4
-3
paddleaudio/paddleaudio/features/layers.py
paddleaudio/paddleaudio/features/layers.py
+344
-0
paddleaudio/paddleaudio/functional/__init__.py
paddleaudio/paddleaudio/functional/__init__.py
+20
-0
paddleaudio/paddleaudio/functional/functional.py
paddleaudio/paddleaudio/functional/functional.py
+265
-0
paddleaudio/paddleaudio/functional/window.py
paddleaudio/paddleaudio/functional/window.py
+29
-15
paddleaudio/paddleaudio/io/__init__.py
paddleaudio/paddleaudio/io/__init__.py
+0
-1
paddleaudio/paddleaudio/metric/__init__.py
paddleaudio/paddleaudio/metric/__init__.py
+15
-0
paddleaudio/paddleaudio/metric/dtw.py
paddleaudio/paddleaudio/metric/dtw.py
+42
-0
paddleaudio/paddleaudio/metric/mcd.py
paddleaudio/paddleaudio/metric/mcd.py
+48
-0
paddleaudio/paddleaudio/sox_effects/__init__.py
paddleaudio/paddleaudio/sox_effects/__init__.py
+13
-0
paddleaudio/paddleaudio/utils/__init__.py
paddleaudio/paddleaudio/utils/__init__.py
+12
-5
paddleaudio/paddleaudio/utils/download.py
paddleaudio/paddleaudio/utils/download.py
+6
-0
paddleaudio/paddleaudio/utils/env.py
paddleaudio/paddleaudio/utils/env.py
+7
-0
paddleaudio/paddleaudio/utils/error.py
paddleaudio/paddleaudio/utils/error.py
+0
-0
paddleaudio/paddleaudio/utils/log.py
paddleaudio/paddleaudio/utils/log.py
+4
-1
paddleaudio/paddleaudio/utils/time.py
paddleaudio/paddleaudio/utils/time.py
+5
-0
paddleaudio/setup.py
paddleaudio/setup.py
+3
-1
paddleaudio/tests/.gitkeep
paddleaudio/tests/.gitkeep
+0
-0
未找到文件。
.gitignore
浏览文件 @
d0bca198
...
@@ -14,6 +14,7 @@
...
@@ -14,6 +14,7 @@
*.whl
*.whl
*.egg-info
*.egg-info
build
build
*output/
docs/build/
docs/build/
docs/topic/ctc/warp-ctc/
docs/topic/ctc/warp-ctc/
...
@@ -33,6 +34,4 @@ tools/activate_python.sh
...
@@ -33,6 +34,4 @@ tools/activate_python.sh
tools/miniconda.sh
tools/miniconda.sh
tools/CRF++-0.58/
tools/CRF++-0.58/
speechx/fc_patch/
speechx/fc_patch/
\ No newline at end of file
*output/
paddleaudio/CHANGELOG.md
浏览文件 @
d0bca198
# Changelog
# Changelog
Date: 2022-2-25, Author: Hui Zhang.
-
Refactor architecture.
-
dtw distance and mcd style dtw
paddleaudio/features/augment.py
已删除
100644 → 0
浏览文件 @
2886ab93
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
typing
import
List
import
numpy
as
np
from
numpy
import
ndarray
as
array
from
..backends
import
depth_convert
from
..utils
import
ParameterError
# Names exported by this module on `from ... import *`.
__all__ = [
    'depth_augment',
    'spect_augment',
    'random_crop1d',
    'random_crop2d',
    'adaptive_spect_augment',
]
def randint(high: int) -> int:
    """Draw one random integer from the half-open range [0, high).

    Helper for the random data augmentation routines: wraps
    ``np.random.randint`` and converts the result to a built-in ``int``.
    """
    drawn = np.random.randint(0, high=high)
    return int(drawn)
def rand() -> float:
    """Return one random floating-point number in the range [0, 1).

    Helper for the random data augmentation routines.
    """
    # Draw a scalar instead of a 1-element array: calling float() on an array
    # with ndim > 0 is deprecated since NumPy 1.25.
    return float(np.random.rand())
def depth_augment(y: array,
                  choices: List = ['int8', 'int16'],
                  probs: List[float] = [0.5, 0.5]) -> array:
    """Audio bit-depth augmentation.

    One entry of ``choices`` is picked at random, weighted by ``probs``. The
    input is converted to that depth with ``depth_convert`` and then converted
    back to the dtype ``y`` had on entry, to simulate the distortion brought by
    quantization.

    ``choices`` and ``probs`` must have the same length.
    """
    n_choices, n_probs = len(choices), len(probs)
    assert n_probs == n_choices, \
        'number of choices {} must be equal to size of probs {}'.format(
            n_choices, n_probs)

    target_depth = np.random.choice(choices, p=probs)
    original_dtype = y.dtype
    # Round trip: down to the sampled depth, then back to the input dtype.
    quantized = depth_convert(y, target_depth)
    return depth_convert(quantized, original_dtype)
def adaptive_spect_augment(spect: array, tempo_axis: int = 0,
                           level: float = 0.1) -> array:
    """Adaptive spectrogram augmentation by zeroing time and frequency bands.

    The strength is governed by ``level``, which is meant to range from 0 to 1,
    with 0 meaning no augmentation. Each mask is ``int(size * level * 0.5)``
    wide along its axis and ``int(10 * level)`` masks are applied per axis.

    With ``tempo_axis == 0`` the input is read as (time, freq); any other value
    is read as (freq, time). Masked regions are set to 0 in ``spect`` itself,
    which is also the return value.
    """
    assert spect.ndim == 2., 'only supports 2d tensor or numpy array'

    first, second = spect.shape
    if tempo_axis == 0:
        time_dim, nt, nf = 0, first, second
    else:
        time_dim, nt, nf = 1, second, first
    freq_dim = 1 - time_dim

    time_mask_width = int(nt * level * 0.5)
    freq_mask_width = int(nf * level * 0.5)
    num_time_mask = int(10 * level)
    num_freq_mask = int(10 * level)

    # Time masks are applied before frequency masks, so the random draws happen
    # in the same order for either layout.
    plan = ((time_dim, nt, time_mask_width, num_time_mask),
            (freq_dim, nf, freq_mask_width, num_freq_mask))
    for dim, size, width, count in plan:
        for _ in range(count):
            start = randint(size - width)
            region = [slice(None), slice(None)]
            region[dim] = slice(start, start + width)
            spect[tuple(region)] = 0

    return spect
def spect_augment(spect: array,
                  tempo_axis: int = 0,
                  max_time_mask: int = 3,
                  max_freq_mask: int = 3,
                  max_time_mask_width: int = 30,
                  max_freq_mask_width: int = 20) -> array:
    """Do spectrogram augmentation along both the time and frequency axes.

    The number of masks per axis is drawn from [0, max_time_mask) and
    [0, max_freq_mask). One width per axis is drawn from
    [0, max_time_mask_width) and [0, max_freq_mask_width) and shared by all
    masks on that axis. Each mask start is drawn from [0, size - width).

    With ``tempo_axis == 0`` the input is read as (time, freq); any other value
    is read as (freq, time). Masked regions are set to 0 in ``spect`` itself,
    which is also the return value.

    A drawn width is clamped to one less than the axis length. Without the
    clamp, an input shorter than the drawn width makes the start-position draw
    fail, because ``np.random.randint`` rejects a non-positive upper bound.
    """
    assert spect.ndim == 2, 'only supports 2d tensor or numpy array'

    first, second = spect.shape
    if tempo_axis == 0:
        time_dim, nt, nf = 0, first, second
    else:
        time_dim, nt, nf = 1, second, first
    freq_dim = 1 - time_dim

    # Same draw order as before, so seeded runs on inputs larger than the mask
    # widths give the same result.
    num_time_mask = randint(max_time_mask)
    num_freq_mask = randint(max_freq_mask)
    time_mask_width = randint(max_time_mask_width)
    freq_mask_width = randint(max_freq_mask_width)

    # Keep at least one valid start position on short inputs.
    time_mask_width = min(time_mask_width, max(nt - 1, 0))
    freq_mask_width = min(freq_mask_width, max(nf - 1, 0))

    plan = ((time_dim, nt, time_mask_width, num_time_mask),
            (freq_dim, nf, freq_mask_width, num_freq_mask))
    for dim, size, width, count in plan:
        if size == 0:
            # Nothing to mask on an empty axis.
            continue
        for _ in range(count):
            start = randint(size - width)
            region = [slice(None), slice(None)]
            region[dim] = slice(start, start + width)
            spect[tuple(region)] = 0

    return spect
def random_crop1d(y: array, crop_len: int) -> array:
    """Randomly crop a window of ``crop_len`` samples from a 1d signal.

    The input is a 1d signal, typically a sound waveform.

    Raises:
        ParameterError: if ``y`` is not one-dimensional.
    """
    if y.ndim != 1:
        # The message used to be a bare string statement, so this check
        # silently did nothing.
        raise ParameterError('only accept 1d tensor or numpy array')
    n = len(y)
    # Valid start positions are 0 .. n - crop_len inclusive. The +1 makes the
    # last window reachable and lets crop_len == n return the whole signal.
    idx = randint(n - crop_len + 1)
    return y[idx:idx + crop_len]
def random_crop2d(s: array, crop_len: int, tempo_axis: int = 0) -> array:
    """Randomly crop an array along its temporal axis.

    Typically used on a spectrogram: a window of ``crop_len`` entries is taken
    along ``tempo_axis`` and every other axis is kept whole.

    Raises:
        ParameterError: if ``tempo_axis`` is not smaller than ``s.ndim``.
    """
    if tempo_axis >= s.ndim:
        raise ParameterError('axis out of range')

    n = s.shape[tempo_axis]
    # Valid start positions are 0 .. n - crop_len inclusive. The +1 makes the
    # last window reachable and lets crop_len == n return the full extent.
    idx = randint(high=n - crop_len + 1)
    sli = [slice(None)] * s.ndim
    sli[tempo_axis] = slice(idx, idx + crop_len)
    return s[tuple(sli)]
paddleaudio/paddleaudio/__init__.py
0 → 100644
浏览文件 @
d0bca198
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
.
import
compliance
from
.
import
datasets
from
.
import
features
from
.
import
functional
from
.
import
io
from
.
import
metric
from
.
import
sox_effects
from
.backends
import
load
from
.backends
import
save
paddleaudio/paddleaudio/backends/__init__.py
0 → 100644
浏览文件 @
d0bca198
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
.soundfile_backend
import
depth_convert
from
.soundfile_backend
import
load
from
.soundfile_backend
import
normalize
from
.soundfile_backend
import
resample
from
.soundfile_backend
import
save
from
.soundfile_backend
import
to_mono
paddleaudio/
backends/audio
.py
→
paddleaudio/
paddleaudio/backends/soundfile_backend
.py
浏览文件 @
d0bca198
# Copyright (c) 202
1
PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 202
2
PaddlePaddle Authors. All Rights Reserved.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# you may not use this file except in compliance with the License.
...
@@ -29,7 +29,7 @@ __all__ = [
...
@@ -29,7 +29,7 @@ __all__ = [
'to_mono'
,
'to_mono'
,
'depth_convert'
,
'depth_convert'
,
'normalize'
,
'normalize'
,
'save
_wav
'
,
'save'
,
'load'
,
'load'
,
]
]
NORMALMIZE_TYPES
=
[
'linear'
,
'gaussian'
]
NORMALMIZE_TYPES
=
[
'linear'
,
'gaussian'
]
...
@@ -41,12 +41,9 @@ EPS = 1e-8
...
@@ -41,12 +41,9 @@ EPS = 1e-8
def
resample
(
y
:
array
,
src_sr
:
int
,
target_sr
:
int
,
def
resample
(
y
:
array
,
src_sr
:
int
,
target_sr
:
int
,
mode
:
str
=
'kaiser_fast'
)
->
array
:
mode
:
str
=
'kaiser_fast'
)
->
array
:
""" Audio resampling
""" Audio resampling
This function is the same as using resampy.resample().
This function is the same as using resampy.resample().
Notes:
Notes:
The default mode is kaiser_fast. For better audio quality, use mode = 'kaiser_best'
The default mode is kaiser_fast. For better audio quality, use mode = 'kaiser_best'
"""
"""
if
mode
==
'kaiser_best'
:
if
mode
==
'kaiser_best'
:
...
@@ -106,7 +103,6 @@ def to_mono(y: array, merge_type: str='average') -> array:
...
@@ -106,7 +103,6 @@ def to_mono(y: array, merge_type: str='average') -> array:
def
_safe_cast
(
y
:
array
,
dtype
:
Union
[
type
,
str
])
->
array
:
def
_safe_cast
(
y
:
array
,
dtype
:
Union
[
type
,
str
])
->
array
:
""" data type casting in a safe way, i.e., prevent overflow or underflow
""" data type casting in a safe way, i.e., prevent overflow or underflow
This function is used internally.
This function is used internally.
"""
"""
return
np
.
clip
(
y
,
np
.
iinfo
(
dtype
).
min
,
np
.
iinfo
(
dtype
).
max
).
astype
(
dtype
)
return
np
.
clip
(
y
,
np
.
iinfo
(
dtype
).
min
,
np
.
iinfo
(
dtype
).
max
).
astype
(
dtype
)
...
@@ -115,10 +111,8 @@ def _safe_cast(y: array, dtype: Union[type, str]) -> array:
...
@@ -115,10 +111,8 @@ def _safe_cast(y: array, dtype: Union[type, str]) -> array:
def
depth_convert
(
y
:
array
,
dtype
:
Union
[
type
,
str
],
def
depth_convert
(
y
:
array
,
dtype
:
Union
[
type
,
str
],
dithering
:
bool
=
True
)
->
array
:
dithering
:
bool
=
True
)
->
array
:
"""Convert audio array to target dtype safely
"""Convert audio array to target dtype safely
This function converts an audio waveform to a target dtype, with additional steps of
This function converts an audio waveform to a target dtype, with additional steps of
preventing overflow/underflow and preserving audio range.
preventing overflow/underflow and preserving audio range.
"""
"""
SUPPORT_DTYPE
=
[
'int16'
,
'int8'
,
'float32'
,
'float64'
]
SUPPORT_DTYPE
=
[
'int16'
,
'int8'
,
'float32'
,
'float64'
]
...
@@ -168,12 +162,9 @@ def sound_file_load(file: str,
...
@@ -168,12 +162,9 @@ def sound_file_load(file: str,
dtype
:
str
=
'int16'
,
dtype
:
str
=
'int16'
,
duration
:
Optional
[
int
]
=
None
)
->
Tuple
[
array
,
int
]:
duration
:
Optional
[
int
]
=
None
)
->
Tuple
[
array
,
int
]:
"""Load audio using soundfile library
"""Load audio using soundfile library
This function load audio file using libsndfile.
This function load audio file using libsndfile.
Reference:
Reference:
http://www.mega-nerd.com/libsndfile/#Features
http://www.mega-nerd.com/libsndfile/#Features
"""
"""
with
sf
.
SoundFile
(
file
)
as
sf_desc
:
with
sf
.
SoundFile
(
file
)
as
sf_desc
:
sr_native
=
sf_desc
.
samplerate
sr_native
=
sf_desc
.
samplerate
...
@@ -188,33 +179,9 @@ def sound_file_load(file: str,
...
@@ -188,33 +179,9 @@ def sound_file_load(file: str,
return
y
,
sf_desc
.
samplerate
return
y
,
sf_desc
.
samplerate
def
audio_file_load
():
"""Load audio using audiofile library
This function load audio file using audiofile.
Reference:
https://audiofile.68k.org/
"""
raise
NotImplementedError
()
def
sox_file_load
():
"""Load audio using sox library
This function load audio file using sox.
Reference:
http://sox.sourceforge.net/
"""
raise
NotImplementedError
()
def
normalize
(
y
:
array
,
norm_type
:
str
=
'linear'
,
def
normalize
(
y
:
array
,
norm_type
:
str
=
'linear'
,
mul_factor
:
float
=
1.0
)
->
array
:
mul_factor
:
float
=
1.0
)
->
array
:
""" normalize an input audio with additional multiplier.
""" normalize an input audio with additional multiplier.
"""
"""
if
norm_type
==
'linear'
:
if
norm_type
==
'linear'
:
...
@@ -232,14 +199,12 @@ def normalize(y: array, norm_type: str='linear',
...
@@ -232,14 +199,12 @@ def normalize(y: array, norm_type: str='linear',
return
y
return
y
def
save
_wav
(
y
:
array
,
sr
:
int
,
file
:
str
)
->
None
:
def
save
(
y
:
array
,
sr
:
int
,
file
:
str
)
->
None
:
"""Save audio file to disk.
"""Save audio file to disk.
This function saves audio to disk using scipy.io.wavfile, with additional step
This function saves audio to disk using scipy.io.wavfile, with additional step
to convert input waveform to int16 unless it already is int16
to convert input waveform to int16 unless it already is int16
Notes:
Notes:
It only support raw wav format.
It only support raw wav format.
"""
"""
if
not
file
.
endswith
(
'.wav'
):
if
not
file
.
endswith
(
'.wav'
):
raise
ParameterError
(
raise
ParameterError
(
...
@@ -274,11 +239,8 @@ def load(
...
@@ -274,11 +239,8 @@ def load(
resample_mode
:
str
=
'kaiser_fast'
)
->
Tuple
[
array
,
int
]:
resample_mode
:
str
=
'kaiser_fast'
)
->
Tuple
[
array
,
int
]:
"""Load audio file from disk.
"""Load audio file from disk.
This function loads audio from disk using the audio backend.
This function loads audio from disk using the audio backend.
Parameters:
Parameters:
Notes:
Notes:
"""
"""
y
,
r
=
sound_file_load
(
file
,
offset
=
offset
,
dtype
=
dtype
,
duration
=
duration
)
y
,
r
=
sound_file_load
(
file
,
offset
=
offset
,
dtype
=
dtype
,
duration
=
duration
)
...
...
paddleaudio/paddleaudio/backends/sox_backend.py
0 → 100644
浏览文件 @
d0bca198
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
paddleaudio/__init__.py
→
paddleaudio/
paddleaudio/compliance/
__init__.py
浏览文件 @
d0bca198
# Copyright (c) 202
1
PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 202
2
PaddlePaddle Authors. All Rights Reserved.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# you may not use this file except in compliance with the License.
...
@@ -11,5 +11,3 @@
...
@@ -11,5 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# See the License for the specific language governing permissions and
# limitations under the License.
# limitations under the License.
from
.backends
import
*
from
.features
import
*
paddleaudio/paddleaudio/compliance/kaldi.py
0 → 100644
浏览文件 @
d0bca198
此差异已折叠。
点击以展开。
paddleaudio/
features/core
.py
→
paddleaudio/
paddleaudio/compliance/librosa
.py
浏览文件 @
d0bca198
...
@@ -21,11 +21,13 @@ import numpy as np
...
@@ -21,11 +21,13 @@ import numpy as np
import
scipy
import
scipy
from
numpy
import
ndarray
as
array
from
numpy
import
ndarray
as
array
from
numpy.lib.stride_tricks
import
as_strided
from
numpy.lib.stride_tricks
import
as_strided
from
scipy
.signal
import
get_window
from
scipy
import
signal
from
..backends
import
depth_convert
from
..utils
import
ParameterError
from
..utils
import
ParameterError
__all__
=
[
__all__
=
[
# dsp
'stft'
,
'stft'
,
'mfcc'
,
'mfcc'
,
'hz_to_mel'
,
'hz_to_mel'
,
...
@@ -38,6 +40,12 @@ __all__ = [
...
@@ -38,6 +40,12 @@ __all__ = [
'spectrogram'
,
'spectrogram'
,
'mu_encode'
,
'mu_encode'
,
'mu_decode'
,
'mu_decode'
,
# augmentation
'depth_augment'
,
'spect_augment'
,
'random_crop1d'
,
'random_crop2d'
,
'adaptive_spect_augment'
,
]
]
...
@@ -303,7 +311,7 @@ def stft(x: array,
...
@@ -303,7 +311,7 @@ def stft(x: array,
if
hop_length
is
None
:
if
hop_length
is
None
:
hop_length
=
int
(
win_length
//
4
)
hop_length
=
int
(
win_length
//
4
)
fft_window
=
get_window
(
window
,
win_length
,
fftbins
=
True
)
fft_window
=
signal
.
get_window
(
window
,
win_length
,
fftbins
=
True
)
# Pad the window out to n_fft size
# Pad the window out to n_fft size
fft_window
=
pad_center
(
fft_window
,
n_fft
)
fft_window
=
pad_center
(
fft_window
,
n_fft
)
...
@@ -576,3 +584,145 @@ def mu_decode(y: array, mu: int=255, quantized: bool=True) -> array:
...
@@ -576,3 +584,145 @@ def mu_decode(y: array, mu: int=255, quantized: bool=True) -> array:
y
=
y
*
2
/
mu
-
1
y
=
y
*
2
/
mu
-
1
x
=
np
.
sign
(
y
)
/
mu
*
((
1
+
mu
)
**
np
.
abs
(
y
)
-
1
)
x
=
np
.
sign
(
y
)
/
mu
*
((
1
+
mu
)
**
np
.
abs
(
y
)
-
1
)
return
x
return
x
def randint(high: int) -> int:
    """Draw one random integer from the half-open range [0, high).

    Helper for the random data augmentation routines: wraps
    ``np.random.randint`` and converts the result to a built-in ``int``.
    """
    drawn = np.random.randint(0, high=high)
    return int(drawn)
def rand() -> float:
    """Return one random floating-point number in the range [0, 1).

    Helper for the random data augmentation routines.
    """
    # Draw a scalar instead of a 1-element array: calling float() on an array
    # with ndim > 0 is deprecated since NumPy 1.25.
    return float(np.random.rand())
def depth_augment(y: array,
                  choices: List = ['int8', 'int16'],
                  probs: List[float] = [0.5, 0.5]) -> array:
    """Audio bit-depth augmentation.

    One entry of ``choices`` is picked at random, weighted by ``probs``. The
    input is converted to that depth with ``depth_convert`` and then converted
    back to the dtype ``y`` had on entry, to simulate the distortion brought by
    quantization.

    ``choices`` and ``probs`` must have the same length.
    """
    n_choices, n_probs = len(choices), len(probs)
    assert n_probs == n_choices, \
        'number of choices {} must be equal to size of probs {}'.format(
            n_choices, n_probs)

    target_depth = np.random.choice(choices, p=probs)
    original_dtype = y.dtype
    # Round trip: down to the sampled depth, then back to the input dtype.
    quantized = depth_convert(y, target_depth)
    return depth_convert(quantized, original_dtype)
def adaptive_spect_augment(spect: array, tempo_axis: int = 0,
                           level: float = 0.1) -> array:
    """Adaptive spectrogram augmentation by zeroing time and frequency bands.

    The strength is governed by ``level``, which is meant to range from 0 to 1,
    with 0 meaning no augmentation. Each mask is ``int(size * level * 0.5)``
    wide along its axis and ``int(10 * level)`` masks are applied per axis.

    With ``tempo_axis == 0`` the input is read as (time, freq); any other value
    is read as (freq, time). Masked regions are set to 0 in ``spect`` itself,
    which is also the return value.
    """
    assert spect.ndim == 2., 'only supports 2d tensor or numpy array'

    first, second = spect.shape
    if tempo_axis == 0:
        time_dim, nt, nf = 0, first, second
    else:
        time_dim, nt, nf = 1, second, first
    freq_dim = 1 - time_dim

    time_mask_width = int(nt * level * 0.5)
    freq_mask_width = int(nf * level * 0.5)
    num_time_mask = int(10 * level)
    num_freq_mask = int(10 * level)

    # Time masks are applied before frequency masks, so the random draws happen
    # in the same order for either layout.
    plan = ((time_dim, nt, time_mask_width, num_time_mask),
            (freq_dim, nf, freq_mask_width, num_freq_mask))
    for dim, size, width, count in plan:
        for _ in range(count):
            start = randint(size - width)
            region = [slice(None), slice(None)]
            region[dim] = slice(start, start + width)
            spect[tuple(region)] = 0

    return spect
def spect_augment(spect: array,
                  tempo_axis: int = 0,
                  max_time_mask: int = 3,
                  max_freq_mask: int = 3,
                  max_time_mask_width: int = 30,
                  max_freq_mask_width: int = 20) -> array:
    """Do spectrogram augmentation along both the time and frequency axes.

    The number of masks per axis is drawn from [0, max_time_mask) and
    [0, max_freq_mask). One width per axis is drawn from
    [0, max_time_mask_width) and [0, max_freq_mask_width) and shared by all
    masks on that axis. Each mask start is drawn from [0, size - width).

    With ``tempo_axis == 0`` the input is read as (time, freq); any other value
    is read as (freq, time). Masked regions are set to 0 in ``spect`` itself,
    which is also the return value.

    A drawn width is clamped to one less than the axis length. Without the
    clamp, an input shorter than the drawn width makes the start-position draw
    fail, because ``np.random.randint`` rejects a non-positive upper bound.
    """
    assert spect.ndim == 2, 'only supports 2d tensor or numpy array'

    first, second = spect.shape
    if tempo_axis == 0:
        time_dim, nt, nf = 0, first, second
    else:
        time_dim, nt, nf = 1, second, first
    freq_dim = 1 - time_dim

    # Same draw order as before, so seeded runs on inputs larger than the mask
    # widths give the same result.
    num_time_mask = randint(max_time_mask)
    num_freq_mask = randint(max_freq_mask)
    time_mask_width = randint(max_time_mask_width)
    freq_mask_width = randint(max_freq_mask_width)

    # Keep at least one valid start position on short inputs.
    time_mask_width = min(time_mask_width, max(nt - 1, 0))
    freq_mask_width = min(freq_mask_width, max(nf - 1, 0))

    plan = ((time_dim, nt, time_mask_width, num_time_mask),
            (freq_dim, nf, freq_mask_width, num_freq_mask))
    for dim, size, width, count in plan:
        if size == 0:
            # Nothing to mask on an empty axis.
            continue
        for _ in range(count):
            start = randint(size - width)
            region = [slice(None), slice(None)]
            region[dim] = slice(start, start + width)
            spect[tuple(region)] = 0

    return spect
def random_crop1d(y: array, crop_len: int) -> array:
    """Randomly crop a window of ``crop_len`` samples from a 1d signal.

    The input is a 1d signal, typically a sound waveform.

    Raises:
        ParameterError: if ``y`` is not one-dimensional.
    """
    if y.ndim != 1:
        # The message used to be a bare string statement, so this check
        # silently did nothing.
        raise ParameterError('only accept 1d tensor or numpy array')
    n = len(y)
    # Valid start positions are 0 .. n - crop_len inclusive. The +1 makes the
    # last window reachable and lets crop_len == n return the whole signal.
    idx = randint(n - crop_len + 1)
    return y[idx:idx + crop_len]
def random_crop2d(s: array, crop_len: int, tempo_axis: int = 0) -> array:
    """Randomly crop an array along its temporal axis.

    Typically used on a spectrogram: a window of ``crop_len`` entries is taken
    along ``tempo_axis`` and every other axis is kept whole.

    Raises:
        ParameterError: if ``tempo_axis`` is not smaller than ``s.ndim``.
    """
    if tempo_axis >= s.ndim:
        raise ParameterError('axis out of range')

    n = s.shape[tempo_axis]
    # Valid start positions are 0 .. n - crop_len inclusive. The +1 makes the
    # last window reachable and lets crop_len == n return the full extent.
    idx = randint(high=n - crop_len + 1)
    sli = [slice(None)] * s.ndim
    sli[tempo_axis] = slice(idx, idx + crop_len)
    return s[tuple(sli)]
paddleaudio/datasets/__init__.py
→
paddleaudio/
paddleaudio/
datasets/__init__.py
浏览文件 @
d0bca198
...
@@ -15,10 +15,3 @@ from .esc50 import ESC50
...
@@ -15,10 +15,3 @@ from .esc50 import ESC50
from
.gtzan
import
GTZAN
from
.gtzan
import
GTZAN
from
.tess
import
TESS
from
.tess
import
TESS
from
.urban_sound
import
UrbanSound8K
from
.urban_sound
import
UrbanSound8K
__all__
=
[
'ESC50'
,
'UrbanSound8K'
,
'GTZAN'
,
'TESS'
,
]
paddleaudio/datasets/dataset.py
→
paddleaudio/
paddleaudio/
datasets/dataset.py
浏览文件 @
d0bca198
...
@@ -17,8 +17,8 @@ import numpy as np
...
@@ -17,8 +17,8 @@ import numpy as np
import
paddle
import
paddle
from
..backends
import
load
as
load_audio
from
..backends
import
load
as
load_audio
from
..
features
import
melspectrogram
from
..
compliance.librosa
import
melspectrogram
from
..
features
import
mfcc
from
..
compliance.librosa
import
mfcc
feat_funcs
=
{
feat_funcs
=
{
'raw'
:
None
,
'raw'
:
None
,
...
...
paddleaudio/datasets/esc50.py
→
paddleaudio/
paddleaudio/
datasets/esc50.py
浏览文件 @
d0bca198
文件已移动
paddleaudio/datasets/gtzan.py
→
paddleaudio/
paddleaudio/
datasets/gtzan.py
浏览文件 @
d0bca198
文件已移动
paddleaudio/datasets/tess.py
→
paddleaudio/
paddleaudio/
datasets/tess.py
浏览文件 @
d0bca198
文件已移动
paddleaudio/datasets/urban_sound.py
→
paddleaudio/
paddleaudio/
datasets/urban_sound.py
浏览文件 @
d0bca198
文件已移动
paddleaudio/features/__init__.py
→
paddleaudio/
paddleaudio/
features/__init__.py
浏览文件 @
d0bca198
...
@@ -11,6 +11,7 @@
...
@@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# See the License for the specific language governing permissions and
# limitations under the License.
# limitations under the License.
from
.augment
import
*
from
.layers
import
LogMelSpectrogram
from
.core
import
*
from
.layers
import
MelSpectrogram
from
.spectrum
import
*
from
.layers
import
MFCC
from
.layers
import
Spectrogram
paddleaudio/
features/spectrum
.py
→
paddleaudio/
paddleaudio/features/layers
.py
浏览文件 @
d0bca198
此差异已折叠。
点击以展开。
paddleaudio/paddleaudio/functional/__init__.py
0 → 100644
浏览文件 @
d0bca198
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
.functional
import
compute_fbank_matrix
from
.functional
import
create_dct
from
.functional
import
fft_frequencies
from
.functional
import
hz_to_mel
from
.functional
import
mel_frequencies
from
.functional
import
mel_to_hz
from
.functional
import
power_to_db
paddleaudio/paddleaudio/functional/functional.py
0 → 100644
浏览文件 @
d0bca198
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from librosa(https://github.com/librosa/librosa)
import
math
from
typing
import
Optional
from
typing
import
Union
import
paddle
# Names exported by this module on `from ... import *`.
__all__ = [
    'hz_to_mel',
    'mel_to_hz',
    'mel_frequencies',
    'fft_frequencies',
    'compute_fbank_matrix',
    'power_to_db',
    'create_dct',
]
def hz_to_mel(freq: Union[paddle.Tensor, float],
              htk: bool = False) -> Union[paddle.Tensor, float]:
    """Convert frequencies in Hz to the Mel scale.

    Parameters:
        freq: a tensor of arbitrary shape, or a single floating point number.
        htk: use the HTK formula ``2595 * log10(1 + freq / 700)``.
            The default value is False, which uses a scale that is linear up
            to 1000 Hz and logarithmic above it.
    Returns:
        The frequencies represented in Mel-scale, as the same kind of value
        (tensor or float) as ``freq``.
    """
    is_tensor = isinstance(freq, paddle.Tensor)

    if htk:
        log10 = paddle.log10 if is_tensor else math.log10
        return 2595.0 * log10(1.0 + freq / 700.0)

    # Constants of the piecewise scale.
    f_min = 0.0
    f_sp = 200.0 / 3
    min_log_hz = 1000.0  # beginning of log region (Hz)
    min_log_mel = (min_log_hz - f_min) / f_sp  # same (Mels)
    logstep = math.log(6.4) / 27.0  # step size for log region

    linear_mels = (freq - f_min) / f_sp

    if not is_tensor:
        if freq >= min_log_hz:
            return min_log_mel + math.log(freq / min_log_hz + 1e-10) / logstep
        return linear_mels

    # The 1e-10 keeps the log finite for zero frequencies.
    log_mels = min_log_mel + paddle.log(freq / min_log_hz + 1e-10) / logstep
    # Blend the two branches with a 0/1 mask instead of boolean indexing.
    in_log_region = (freq > min_log_hz).astype(freq.dtype)
    return log_mels * in_log_region + linear_mels * (1 - in_log_region)
def mel_to_hz(mel: Union[float, paddle.Tensor],
              htk: bool = False) -> Union[float, paddle.Tensor]:
    """Convert Mel-scale values to frequencies in Hz.

    Parameters:
        mel: the mel frequency represented as a tensor of arbitrary shape, or
            a floating point number.
        htk: use the HTK formula ``700 * (10 ** (mel / 2595) - 1)``. When
            False, a scale that is linear below the 1000 Hz point and
            exponential above it is used.
    Returns:
        The frequencies represented in Hz, as the same kind of value (tensor
        or float) as ``mel``.
    """
    if htk:
        return 700.0 * (10.0**(mel / 2595.0) - 1.0)

    # Constants of the piecewise scale.
    f_min = 0.0
    f_sp = 200.0 / 3
    min_log_hz = 1000.0  # beginning of log region (Hz)
    min_log_mel = (min_log_hz - f_min) / f_sp  # same (Mels)
    logstep = math.log(6.4) / 27.0  # step size for log region

    linear_hz = f_min + f_sp * mel

    if not isinstance(mel, paddle.Tensor):
        if mel >= min_log_mel:
            return min_log_hz * math.exp(logstep * (mel - min_log_mel))
        return linear_hz

    log_hz = min_log_hz * paddle.exp(logstep * (mel - min_log_mel))
    # Blend the two branches with a 0/1 mask instead of boolean indexing.
    in_log_region = (mel > min_log_mel).astype(mel.dtype)
    return log_hz * in_log_region + linear_hz * (1 - in_log_region)
def mel_frequencies(n_mels: int=64,
                    f_min: float=0.0,
                    f_max: float=11025.0,
                    htk: bool=False,
                    dtype: str=paddle.float32):
    """Compute frequencies that are evenly spaced on the Mel scale.

    Parameters:
        n_mels(int): number of Mel bins.
        f_min(float): the lowest frequency, in Hz.
        f_max(float): the highest frequency, in Hz.
        htk(bool): whether to use the HTK formula for the conversions.
        dtype(str): the datatype of the returned frequencies.
    Returns:
        A tensor of ``n_mels`` values: points spaced uniformly between the Mel
        values of ``f_min`` and ``f_max``, converted back with ``mel_to_hz``.
    """
    mel_lo = hz_to_mel(f_min, htk=htk)
    mel_hi = hz_to_mel(f_max, htk=htk)
    # 'Center freqs' of mel bands - uniformly spaced between limits
    mel_points = paddle.linspace(mel_lo, mel_hi, n_mels, dtype=dtype)
    return mel_to_hz(mel_points, htk=htk)
def fft_frequencies(sr: int, n_fft: int, dtype: str=paddle.float32):
    """Compute the frequencies of the FFT bins.

    Parameters:
        sr(int): the audio sample rate.
        n_fft(int): the FFT size.
        dtype(str): the datatype of the returned frequencies.
    Returns:
        A tensor of ``1 + n_fft // 2`` values spaced linearly from 0 to
        ``sr / 2``, in Hz.
    """
    upper_hz = float(sr) / 2
    n_bins = int(1 + n_fft // 2)
    return paddle.linspace(0, upper_hz, n_bins, dtype=dtype)
def compute_fbank_matrix(sr: int,
                         n_fft: int,
                         n_mels: int=64,
                         f_min: float=0.0,
                         f_max: Optional[float]=None,
                         htk: bool=False,
                         norm: Union[str, float]='slaney',
                         dtype: str=paddle.float32):
    """Compute the mel filter bank (fbank) matrix.

    Each row is a triangular filter defined over the one-sided FFT bin
    frequencies; the triangle edges are taken from ``n_mels + 2`` mel-spaced
    frequencies between ``f_min`` and ``f_max``.

    Parameters:
        sr(int): the audio sample rate.
        n_fft(int): the FFT size.
        n_mels(int): the number of Mel bins.
        f_min(float): the lower cut-off frequency, below which the filter response is zero.
        f_max(float): the upper cut-off frequency, above which the filter response is zero.
            Defaults to ``sr / 2`` when None.
        htk(bool): whether to use the HTK formula for the Hz/mel conversion.
        norm(str|float): 'slaney' scales every filter by
            ``2 / (upper_edge - lower_edge)``; an int or float ``p`` applies
            ``paddle.nn.functional.normalize`` with that ``p`` along the
            frequency axis; any other value leaves the filters unscaled.
        dtype(str): the datatype of the returned fbank matrix.
    Returns:
        The fbank matrix of shape (n_mels, int(1+n_fft//2)).
    Shape:
        output: (n_mels, int(1+n_fft//2))
    """
    if f_max is None:
        f_max = float(sr) / 2

    # Center freqs of each FFT bin
    fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft, dtype=dtype)

    # 'Center freqs' of mel bands - uniformly spaced between limits
    mel_f = mel_frequencies(
        n_mels + 2, f_min=f_min, f_max=f_max, htk=htk, dtype=dtype)

    fdiff = mel_f[1:] - mel_f[:-1]  # np.diff(mel_f)
    # ramps[j, b] = mel_f[j] - fftfreqs[b], i.e. np.subtract.outer(mel_f, fftfreqs)
    ramps = mel_f.unsqueeze(1) - fftfreqs.unsqueeze(0)

    # Rising and falling slopes of all filters at once: row i uses band edges
    # i (left), i + 1 (center) and i + 2 (right), exactly as a per-filter loop
    # over range(n_mels) would.
    lower = -ramps[:n_mels] / fdiff[:n_mels].unsqueeze(1)
    upper = ramps[2:n_mels + 2] / fdiff[1:n_mels + 1].unsqueeze(1)

    # .. then intersect them with each other and zero
    weights = paddle.maximum(
        paddle.zeros_like(lower), paddle.minimum(lower, upper))

    # Slaney-style mel is scaled to be approx constant energy per channel
    if norm == 'slaney':
        enorm = 2.0 / (mel_f[2:n_mels + 2] - mel_f[:n_mels])
        weights *= enorm.unsqueeze(1)
    elif isinstance(norm, (int, float)):
        weights = paddle.nn.functional.normalize(weights, p=norm, axis=-1)

    return weights
def power_to_db(magnitude: paddle.Tensor,
                ref_value: float=1.0,
                amin: float=1e-10,
                top_db: Optional[float]=None) -> paddle.Tensor:
    """Convert a power spectrogram (amplitude squared) to decibel (dB) units.

    The function computes the scaling ``10 * log10(x / ref)`` in a numerically
    stable way.

    Parameters:
        magnitude(Tensor): the input magnitude tensor of any shape.
        ref_value(float): the reference value (floored at ``amin``). If smaller
            than 1.0, the db level of the signal will be pulled up accordingly.
            Otherwise, the db level is pushed down.
        amin(float): the minimum value of input magnitude, below which the input
            magnitude is clipped(to amin).
        top_db(float): the dynamic range of the result in dB. Values lower than
            ``log_spec.max() - top_db`` are raised to that floor. No clipping
            is applied when None.
    Returns:
        The spectrogram in log-scale.
    Raises:
        ValueError: if ``amin`` or ``ref_value`` is not strictly positive, or
            if ``top_db`` is negative.
    shape:
        input: any shape
        output: same as input
    """
    # Validate every argument before doing any tensor work.
    if amin <= 0:
        raise ValueError("amin must be strictly positive")
    if ref_value <= 0:
        raise ValueError("ref_value must be strictly positive")
    if top_db is not None and top_db < 0:
        raise ValueError("top_db must be non-negative")

    ones = paddle.ones_like(magnitude)
    # Clamp the input at amin so log10 never sees zero or negative values.
    log_spec = 10.0 * paddle.log10(paddle.maximum(ones * amin, magnitude))
    log_spec -= 10.0 * math.log10(max(ref_value, amin))

    if top_db is not None:
        # Limit the dynamic range to top_db below the peak value.
        log_spec = paddle.maximum(log_spec, ones * (log_spec.max() - top_db))

    return log_spec
def create_dct(n_mfcc: int,
               n_mels: int,
               norm: Optional[str]='ortho',
               dtype: Optional[str]=paddle.float32) -> paddle.Tensor:
    """Create a discrete cosine transform(DCT) matrix.

    Parameters:
        n_mfcc (int): Number of mel frequency cepstral coefficients.
        n_mels (int): Number of mel filterbanks.
        norm (str, optional): Normalization type, either None or 'ortho'.
            Defaults to 'ortho'.
        dtype (str, optional): Datatype of the returned matrix.
    Returns:
        Tensor: The DCT matrix with shape (n_mels, n_mfcc).
    """
    mel_index = paddle.arange(n_mels, dtype=dtype)
    coef_index = paddle.arange(n_mfcc, dtype=dtype).unsqueeze(1)
    # Cosine basis, size (n_mfcc, n_mels): one row per cepstral coefficient.
    basis = paddle.cos(
        math.pi / float(n_mels) * (mel_index + 0.5) * coef_index)

    if norm is None:
        basis *= 2.0
        return basis.T

    # NOTE(review): assert is skipped under `python -O`, in which case any
    # non-None norm is treated as 'ortho'.
    assert norm == "ortho"
    # The first row gets an extra 1/sqrt(2) on top of the common scale.
    basis[0] *= 1.0 / math.sqrt(2.0)
    basis *= math.sqrt(2.0 / float(n_mels))
    return basis.T
paddleaudio/
features
/window.py
→
paddleaudio/
paddleaudio/functional
/window.py
浏览文件 @
d0bca198
...
@@ -20,6 +20,19 @@ from paddle import Tensor
...
@@ -20,6 +20,19 @@ from paddle import Tensor
__all__
=
[
__all__
=
[
'get_window'
,
'get_window'
,
# windows
'taylor'
,
'hamming'
,
'hann'
,
'tukey'
,
'kaiser'
,
'gaussian'
,
'exponential'
,
'triang'
,
'bohman'
,
'blackman'
,
'cosine'
,
]
]
...
@@ -73,6 +86,21 @@ def general_gaussian(M: int, p, sig, sym: bool=True,
...
@@ -73,6 +86,21 @@ def general_gaussian(M: int, p, sig, sym: bool=True,
return
_truncate
(
w
,
needs_trunc
)
return
_truncate
(
w
,
needs_trunc
)
def general_cosine(M: int, a: float, sym: bool=True,
                   dtype: str='float64') -> Tensor:
    """Compute a generic weighted sum of cosine terms window.

    This function is consistent with scipy.signal.windows.general_cosine().

    ``a`` is used as a sequence of weights: its k-th entry multiplies
    ``cos(k * x)`` with ``x`` evenly spaced over ``[-pi, pi]``.
    """
    if _len_guards(M):
        return paddle.ones((M, ), dtype=dtype)

    M, needs_trunc = _extend(M, sym)
    phase = paddle.linspace(-math.pi, math.pi, M, dtype=dtype)
    window = paddle.zeros((M, ), dtype=dtype)
    # Accumulate one cosine term per weight.
    for order, weight in enumerate(a):
        window += weight * paddle.cos(order * phase)

    return _truncate(window, needs_trunc)
def
general_hamming
(
M
:
int
,
alpha
:
float
,
sym
:
bool
=
True
,
def
general_hamming
(
M
:
int
,
alpha
:
float
,
sym
:
bool
=
True
,
dtype
:
str
=
'float64'
)
->
Tensor
:
dtype
:
str
=
'float64'
)
->
Tensor
:
"""Compute a generalized Hamming window.
"""Compute a generalized Hamming window.
...
@@ -143,21 +171,6 @@ def taylor(M: int,
...
@@ -143,21 +171,6 @@ def taylor(M: int,
return
_truncate
(
w
,
needs_trunc
)
return
_truncate
(
w
,
needs_trunc
)
def general_cosine(M: int, a: float, sym: bool=True,
                   dtype: str='float64') -> Tensor:
    """Compute a generic weighted sum of cosine terms window.

    This function is consistent with scipy.signal.windows.general_cosine().

    ``a`` is used as a sequence of weights: its k-th entry multiplies
    ``cos(k * x)`` with ``x`` evenly spaced over ``[-pi, pi]``.
    """
    if _len_guards(M):
        return paddle.ones((M, ), dtype=dtype)

    M, needs_trunc = _extend(M, sym)
    phase = paddle.linspace(-math.pi, math.pi, M, dtype=dtype)
    window = paddle.zeros((M, ), dtype=dtype)
    # Accumulate one cosine term per weight.
    for order, weight in enumerate(a):
        window += weight * paddle.cos(order * phase)

    return _truncate(window, needs_trunc)
def
hamming
(
M
:
int
,
sym
:
bool
=
True
,
dtype
:
str
=
'float64'
)
->
Tensor
:
def
hamming
(
M
:
int
,
sym
:
bool
=
True
,
dtype
:
str
=
'float64'
)
->
Tensor
:
"""Compute a Hamming window.
"""Compute a Hamming window.
The Hamming window is a taper formed by using a raised cosine with
The Hamming window is a taper formed by using a raised cosine with
...
@@ -375,6 +388,7 @@ def cosine(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
...
@@ -375,6 +388,7 @@ def cosine(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
return
_truncate
(
w
,
needs_trunc
)
return
_truncate
(
w
,
needs_trunc
)
## factory function
def
get_window
(
window
:
Union
[
str
,
Tuple
[
str
,
float
]],
def
get_window
(
window
:
Union
[
str
,
Tuple
[
str
,
float
]],
win_length
:
int
,
win_length
:
int
,
fftbins
:
bool
=
True
,
fftbins
:
bool
=
True
,
...
...
paddleaudio/
backends
/__init__.py
→
paddleaudio/
paddleaudio/io
/__init__.py
浏览文件 @
d0bca198
...
@@ -11,4 +11,3 @@
...
@@ -11,4 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# See the License for the specific language governing permissions and
# limitations under the License.
# limitations under the License.
from
.audio
import
*
paddleaudio/paddleaudio/metric/__init__.py
0 → 100644
浏览文件 @
d0bca198
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
.dtw
import
dtw_distance
from
.mcd
import
mcd_distance
paddleaudio/paddleaudio/metric/dtw.py
0 → 100644
浏览文件 @
d0bca198
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
numpy
as
np
from
dtaidistance
import
dtw_ndim
__all__
=
[
'dtw_distance'
,
]
def dtw_distance(xs: np.ndarray, ys: np.ndarray) -> float:
    """Dynamic Time Warping (DTW) distance between two sequences.

    Thin wrapper around ``dtaidistance.dtw_ndim.distance``, which keeps a
    compact matrix rather than the full warping paths matrix.

    Uses dynamic programming to compute:

    wps[i, j] = (s1[i]-s2[j])**2 + min(
                    wps[i-1, j  ] + penalty,  // vertical   / insertion / expansion
                    wps[i  , j-1] + penalty,  // horizontal / deletion  / compression
                    wps[i-1, j-1])            // diagonal   / match
    dtw = sqrt(wps[-1, -1])

    Args:
        xs (np.ndarray): ref sequence, [T,D]
        ys (np.ndarray): hyp sequence, [T,D]

    Returns:
        float: dtw distance
    """
    distance = dtw_ndim.distance(xs, ys)
    return distance
paddleaudio/paddleaudio/metric/mcd.py
0 → 100644
浏览文件 @
d0bca198
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
mcd.metrics_fast
as
mt
import
numpy
as
np
from
mcd
import
dtw
__all__
=
[
'mcd_distance'
,
]
def mcd_distance(xs: np.ndarray, ys: np.ndarray, cost_fn=mt.logSpecDbDist):
    """Mel cepstral distortion (MCD) computed along a DTW alignment.

    Delegates to ``mcd.dtw.dtw(xs, ys, cost_fn)`` and returns only the cost
    part of its result; the alignment path is discarded.

    Dynamic programming recurrence (as documented for the DTW step):

    wps[i, j] = cost_fn(xs[i], ys[j]) + min(
                    wps[i-1, j  ],  // vertical   / insertion / expansion
                    wps[i  , j-1],  // horizontal / deletion  / compression
                    wps[i-1, j-1])  // diagonal   / match

    Default cost function (``mcd.metrics_fast.logSpecDbDist``):

    logSpecDbConst = 10.0 / math.log(10.0) * math.sqrt(2.0)

    def logSpecDbDist(x, y):
        diff = x - y
        return logSpecDbConst * math.sqrt(np.inner(diff, diff))

    Args:
        xs (np.ndarray): ref sequence, [T,D]
        ys (np.ndarray): hyp sequence, [T,D]
        cost_fn: frame-level cost function passed through to ``dtw.dtw``.

    Returns:
        The minimum cost reported by ``dtw.dtw`` (first element of its result).
    """
    # Only the cost is needed; the second element (the path) is ignored.
    min_cost, _path = dtw.dtw(xs, ys, cost_fn)
    return min_cost
paddleaudio/paddleaudio/sox_effects/__init__.py
0 → 100644
浏览文件 @
d0bca198
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
paddleaudio/utils/__init__.py
→
paddleaudio/
paddleaudio/
utils/__init__.py
浏览文件 @
d0bca198
...
@@ -11,8 +11,15 @@
...
@@ -11,8 +11,15 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# See the License for the specific language governing permissions and
# limitations under the License.
# limitations under the License.
from
.download
import
*
from
.download
import
decompress
from
.env
import
*
from
.download
import
download_and_decompress
from
.error
import
*
from
.download
import
load_state_dict_from_url
from
.log
import
*
from
.env
import
DATA_HOME
from
.time
import
*
from
.env
import
MODEL_HOME
from
.env
import
PPAUDIO_HOME
from
.env
import
USER_HOME
from
.error
import
ParameterError
from
.log
import
Logger
from
.log
import
logger
from
.time
import
seconds_to_hms
from
.time
import
Timer
paddleaudio/utils/download.py
→
paddleaudio/
paddleaudio/
utils/download.py
浏览文件 @
d0bca198
...
@@ -22,6 +22,12 @@ from .log import logger
...
@@ -22,6 +22,12 @@ from .log import logger
download
.
logger
=
logger
download
.
logger
=
logger
__all__
=
[
'decompress'
,
'download_and_decompress'
,
'load_state_dict_from_url'
,
]
def
decompress
(
file
:
str
):
def
decompress
(
file
:
str
):
"""
"""
...
...
paddleaudio/utils/env.py
→
paddleaudio/
paddleaudio/
utils/env.py
浏览文件 @
d0bca198
...
@@ -20,6 +20,13 @@ PPAUDIO_HOME --> the root directory for storing PaddleAudio related data. D
...
@@ -20,6 +20,13 @@ PPAUDIO_HOME --> the root directory for storing PaddleAudio related data. D
'''
'''
import
os
import
os
__all__
=
[
'USER_HOME'
,
'PPAUDIO_HOME'
,
'MODEL_HOME'
,
'DATA_HOME'
,
]
def
_get_user_home
():
def
_get_user_home
():
return
os
.
path
.
expanduser
(
'~'
)
return
os
.
path
.
expanduser
(
'~'
)
...
...
paddleaudio/utils/error.py
→
paddleaudio/
paddleaudio/
utils/error.py
浏览文件 @
d0bca198
文件已移动
paddleaudio/utils/log.py
→
paddleaudio/
paddleaudio/
utils/log.py
浏览文件 @
d0bca198
...
@@ -19,7 +19,10 @@ import time
...
@@ -19,7 +19,10 @@ import time
import
colorlog
import
colorlog
loggers
=
{}
__all__
=
[
'Logger'
,
'logger'
,
]
log_config
=
{
log_config
=
{
'DEBUG'
:
{
'DEBUG'
:
{
...
...
paddleaudio/utils/time.py
→
paddleaudio/
paddleaudio/
utils/time.py
浏览文件 @
d0bca198
...
@@ -14,6 +14,11 @@
...
@@ -14,6 +14,11 @@
import
math
import
math
import
time
import
time
__all__
=
[
'Timer'
,
'seconds_to_hms'
,
]
class
Timer
(
object
):
class
Timer
(
object
):
'''Calculate runing speed and estimated time of arrival(ETA)'''
'''Calculate runing speed and estimated time of arrival(ETA)'''
...
...
setup_audio
.py
→
paddleaudio/setup
.py
浏览文件 @
d0bca198
...
@@ -14,7 +14,7 @@
...
@@ -14,7 +14,7 @@
import
setuptools
import
setuptools
# set the version here
# set the version here
VERSION
=
'0.
1
.0'
VERSION
=
'0.
2
.0'
def
write_version_py
(
filename
=
'paddleaudio/__init__.py'
):
def
write_version_py
(
filename
=
'paddleaudio/__init__.py'
):
...
@@ -59,6 +59,8 @@ setuptools.setup(
...
@@ -59,6 +59,8 @@ setuptools.setup(
'resampy >= 0.2.2'
,
'resampy >= 0.2.2'
,
'soundfile >= 0.9.0'
,
'soundfile >= 0.9.0'
,
'colorlog'
,
'colorlog'
,
'dtaidistance >= 2.3.6'
,
'mcd >= 0.4'
,
],
)
],
)
remove_version_py
()
remove_version_py
()
paddleaudio/tests/.gitkeep
0 → 100644
浏览文件 @
d0bca198
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录