未验证 提交 d0bca198 编写于 作者: H Hui Zhang 提交者: GitHub

Merge pull request #1494 from PaddlePaddle/audio

[audio] refactor audio arch
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
*.whl *.whl
*.egg-info *.egg-info
build build
*output/
docs/build/ docs/build/
docs/topic/ctc/warp-ctc/ docs/topic/ctc/warp-ctc/
...@@ -33,6 +34,4 @@ tools/activate_python.sh ...@@ -33,6 +34,4 @@ tools/activate_python.sh
tools/miniconda.sh tools/miniconda.sh
tools/CRF++-0.58/ tools/CRF++-0.58/
speechx/fc_patch/ speechx/fc_patch/
\ No newline at end of file
*output/
# Changelog # Changelog
Date: 2022-2-25, Author: Hui Zhang.
- Refactor architecture.
- dtw distance and mcd style dtw
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
import numpy as np
from numpy import ndarray as array
from ..backends import depth_convert
from ..utils import ParameterError
__all__ = [
'depth_augment',
'spect_augment',
'random_crop1d',
'random_crop2d',
'adaptive_spect_augment',
]
def randint(high: int) -> int:
"""Generate one random integer in range [0 high)
This is a helper function for random data augmentaiton
"""
return int(np.random.randint(0, high=high))
def rand() -> float:
"""Generate one floating-point number in range [0 1)
This is a helper function for random data augmentaiton
"""
return float(np.random.rand(1))
def depth_augment(y: array,
choices: List=['int8', 'int16'],
probs: List[float]=[0.5, 0.5]) -> array:
""" Audio depth augmentation
Do audio depth augmentation to simulate the distortion brought by quantization.
"""
assert len(probs) == len(
choices
), 'number of choices {} must be equal to size of probs {}'.format(
len(choices), len(probs))
depth = np.random.choice(choices, p=probs)
src_depth = y.dtype
y1 = depth_convert(y, depth)
y2 = depth_convert(y1, src_depth)
return y2
def adaptive_spect_augment(spect: array, tempo_axis: int=0,
level: float=0.1) -> array:
"""Do adpative spectrogram augmentation
The level of the augmentation is gowern by the paramter level,
ranging from 0 to 1, with 0 represents no augmentation。
"""
assert spect.ndim == 2., 'only supports 2d tensor or numpy array'
if tempo_axis == 0:
nt, nf = spect.shape
else:
nf, nt = spect.shape
time_mask_width = int(nt * level * 0.5)
freq_mask_width = int(nf * level * 0.5)
num_time_mask = int(10 * level)
num_freq_mask = int(10 * level)
if tempo_axis == 0:
for _ in range(num_time_mask):
start = randint(nt - time_mask_width)
spect[start:start + time_mask_width, :] = 0
for _ in range(num_freq_mask):
start = randint(nf - freq_mask_width)
spect[:, start:start + freq_mask_width] = 0
else:
for _ in range(num_time_mask):
start = randint(nt - time_mask_width)
spect[:, start:start + time_mask_width] = 0
for _ in range(num_freq_mask):
start = randint(nf - freq_mask_width)
spect[start:start + freq_mask_width, :] = 0
return spect
def spect_augment(spect: array,
tempo_axis: int=0,
max_time_mask: int=3,
max_freq_mask: int=3,
max_time_mask_width: int=30,
max_freq_mask_width: int=20) -> array:
"""Do spectrogram augmentation in both time and freq axis
Reference:
"""
assert spect.ndim == 2., 'only supports 2d tensor or numpy array'
if tempo_axis == 0:
nt, nf = spect.shape
else:
nf, nt = spect.shape
num_time_mask = randint(max_time_mask)
num_freq_mask = randint(max_freq_mask)
time_mask_width = randint(max_time_mask_width)
freq_mask_width = randint(max_freq_mask_width)
if tempo_axis == 0:
for _ in range(num_time_mask):
start = randint(nt - time_mask_width)
spect[start:start + time_mask_width, :] = 0
for _ in range(num_freq_mask):
start = randint(nf - freq_mask_width)
spect[:, start:start + freq_mask_width] = 0
else:
for _ in range(num_time_mask):
start = randint(nt - time_mask_width)
spect[:, start:start + time_mask_width] = 0
for _ in range(num_freq_mask):
start = randint(nf - freq_mask_width)
spect[start:start + freq_mask_width, :] = 0
return spect
def random_crop1d(y: array, crop_len: int) -> array:
""" Do random cropping on 1d input signal
The input is a 1d signal, typically a sound waveform
"""
if y.ndim != 1:
'only accept 1d tensor or numpy array'
n = len(y)
idx = randint(n - crop_len)
return y[idx:idx + crop_len]
def random_crop2d(s: array, crop_len: int, tempo_axis: int=0) -> array:
""" Do random cropping for 2D array, typically a spectrogram.
The cropping is done in temporal direction on the time-freq input signal.
"""
if tempo_axis >= s.ndim:
raise ParameterError('axis out of range')
n = s.shape[tempo_axis]
idx = randint(high=n - crop_len)
sli = [slice(None) for i in range(s.ndim)]
sli[tempo_axis] = slice(idx, idx + crop_len)
out = s[tuple(sli)]
return out
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import compliance
from . import datasets
from . import features
from . import functional
from . import io
from . import metric
from . import sox_effects
from .backends import load
from .backends import save
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .soundfile_backend import depth_convert
from .soundfile_backend import load
from .soundfile_backend import normalize
from .soundfile_backend import resample
from .soundfile_backend import save
from .soundfile_backend import to_mono
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
...@@ -29,7 +29,7 @@ __all__ = [ ...@@ -29,7 +29,7 @@ __all__ = [
'to_mono', 'to_mono',
'depth_convert', 'depth_convert',
'normalize', 'normalize',
'save_wav', 'save',
'load', 'load',
] ]
NORMALMIZE_TYPES = ['linear', 'gaussian'] NORMALMIZE_TYPES = ['linear', 'gaussian']
...@@ -41,12 +41,9 @@ EPS = 1e-8 ...@@ -41,12 +41,9 @@ EPS = 1e-8
def resample(y: array, src_sr: int, target_sr: int, def resample(y: array, src_sr: int, target_sr: int,
mode: str='kaiser_fast') -> array: mode: str='kaiser_fast') -> array:
""" Audio resampling """ Audio resampling
This function is the same as using resampy.resample(). This function is the same as using resampy.resample().
Notes: Notes:
The default mode is kaiser_fast. For better audio quality, use mode = 'kaiser_fast' The default mode is kaiser_fast. For better audio quality, use mode = 'kaiser_fast'
""" """
if mode == 'kaiser_best': if mode == 'kaiser_best':
...@@ -106,7 +103,6 @@ def to_mono(y: array, merge_type: str='average') -> array: ...@@ -106,7 +103,6 @@ def to_mono(y: array, merge_type: str='average') -> array:
def _safe_cast(y: array, dtype: Union[type, str]) -> array: def _safe_cast(y: array, dtype: Union[type, str]) -> array:
""" data type casting in a safe way, i.e., prevent overflow or underflow """ data type casting in a safe way, i.e., prevent overflow or underflow
This function is used internally. This function is used internally.
""" """
return np.clip(y, np.iinfo(dtype).min, np.iinfo(dtype).max).astype(dtype) return np.clip(y, np.iinfo(dtype).min, np.iinfo(dtype).max).astype(dtype)
...@@ -115,10 +111,8 @@ def _safe_cast(y: array, dtype: Union[type, str]) -> array: ...@@ -115,10 +111,8 @@ def _safe_cast(y: array, dtype: Union[type, str]) -> array:
def depth_convert(y: array, dtype: Union[type, str], def depth_convert(y: array, dtype: Union[type, str],
dithering: bool=True) -> array: dithering: bool=True) -> array:
"""Convert audio array to target dtype safely """Convert audio array to target dtype safely
This function convert audio waveform to a target dtype, with addition steps of This function convert audio waveform to a target dtype, with addition steps of
preventing overflow/underflow and preserving audio range. preventing overflow/underflow and preserving audio range.
""" """
SUPPORT_DTYPE = ['int16', 'int8', 'float32', 'float64'] SUPPORT_DTYPE = ['int16', 'int8', 'float32', 'float64']
...@@ -168,12 +162,9 @@ def sound_file_load(file: str, ...@@ -168,12 +162,9 @@ def sound_file_load(file: str,
dtype: str='int16', dtype: str='int16',
duration: Optional[int]=None) -> Tuple[array, int]: duration: Optional[int]=None) -> Tuple[array, int]:
"""Load audio using soundfile library """Load audio using soundfile library
This function load audio file using libsndfile. This function load audio file using libsndfile.
Reference: Reference:
http://www.mega-nerd.com/libsndfile/#Features http://www.mega-nerd.com/libsndfile/#Features
""" """
with sf.SoundFile(file) as sf_desc: with sf.SoundFile(file) as sf_desc:
sr_native = sf_desc.samplerate sr_native = sf_desc.samplerate
...@@ -188,33 +179,9 @@ def sound_file_load(file: str, ...@@ -188,33 +179,9 @@ def sound_file_load(file: str,
return y, sf_desc.samplerate return y, sf_desc.samplerate
def audio_file_load():
"""Load audio using audiofile library
This function load audio file using audiofile.
Reference:
https://audiofile.68k.org/
"""
raise NotImplementedError()
def sox_file_load():
"""Load audio using sox library
This function load audio file using sox.
Reference:
http://sox.sourceforge.net/
"""
raise NotImplementedError()
def normalize(y: array, norm_type: str='linear', def normalize(y: array, norm_type: str='linear',
mul_factor: float=1.0) -> array: mul_factor: float=1.0) -> array:
""" normalize an input audio with additional multiplier. """ normalize an input audio with additional multiplier.
""" """
if norm_type == 'linear': if norm_type == 'linear':
...@@ -232,14 +199,12 @@ def normalize(y: array, norm_type: str='linear', ...@@ -232,14 +199,12 @@ def normalize(y: array, norm_type: str='linear',
return y return y
def save_wav(y: array, sr: int, file: str) -> None: def save(y: array, sr: int, file: str) -> None:
"""Save audio file to disk. """Save audio file to disk.
This function saves audio to disk using scipy.io.wavfile, with additional step This function saves audio to disk using scipy.io.wavfile, with additional step
to convert input waveform to int16 unless it already is int16 to convert input waveform to int16 unless it already is int16
Notes: Notes:
It only support raw wav format. It only support raw wav format.
""" """
if not file.endswith('.wav'): if not file.endswith('.wav'):
raise ParameterError( raise ParameterError(
...@@ -274,11 +239,8 @@ def load( ...@@ -274,11 +239,8 @@ def load(
resample_mode: str='kaiser_fast') -> Tuple[array, int]: resample_mode: str='kaiser_fast') -> Tuple[array, int]:
"""Load audio file from disk. """Load audio file from disk.
This function loads audio from disk using using audio beackend. This function loads audio from disk using using audio beackend.
Parameters: Parameters:
Notes: Notes:
""" """
y, r = sound_file_load(file, offset=offset, dtype=dtype, duration=duration) y, r = sound_file_load(file, offset=offset, dtype=dtype, duration=duration)
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
...@@ -11,5 +11,3 @@ ...@@ -11,5 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from .backends import *
from .features import *
此差异已折叠。
...@@ -21,11 +21,13 @@ import numpy as np ...@@ -21,11 +21,13 @@ import numpy as np
import scipy import scipy
from numpy import ndarray as array from numpy import ndarray as array
from numpy.lib.stride_tricks import as_strided from numpy.lib.stride_tricks import as_strided
from scipy.signal import get_window from scipy import signal
from ..backends import depth_convert
from ..utils import ParameterError from ..utils import ParameterError
__all__ = [ __all__ = [
# dsp
'stft', 'stft',
'mfcc', 'mfcc',
'hz_to_mel', 'hz_to_mel',
...@@ -38,6 +40,12 @@ __all__ = [ ...@@ -38,6 +40,12 @@ __all__ = [
'spectrogram', 'spectrogram',
'mu_encode', 'mu_encode',
'mu_decode', 'mu_decode',
# augmentation
'depth_augment',
'spect_augment',
'random_crop1d',
'random_crop2d',
'adaptive_spect_augment',
] ]
...@@ -303,7 +311,7 @@ def stft(x: array, ...@@ -303,7 +311,7 @@ def stft(x: array,
if hop_length is None: if hop_length is None:
hop_length = int(win_length // 4) hop_length = int(win_length // 4)
fft_window = get_window(window, win_length, fftbins=True) fft_window = signal.get_window(window, win_length, fftbins=True)
# Pad the window out to n_fft size # Pad the window out to n_fft size
fft_window = pad_center(fft_window, n_fft) fft_window = pad_center(fft_window, n_fft)
...@@ -576,3 +584,145 @@ def mu_decode(y: array, mu: int=255, quantized: bool=True) -> array: ...@@ -576,3 +584,145 @@ def mu_decode(y: array, mu: int=255, quantized: bool=True) -> array:
y = y * 2 / mu - 1 y = y * 2 / mu - 1
x = np.sign(y) / mu * ((1 + mu)**np.abs(y) - 1) x = np.sign(y) / mu * ((1 + mu)**np.abs(y) - 1)
return x return x
def randint(high: int) -> int:
"""Generate one random integer in range [0 high)
This is a helper function for random data augmentaiton
"""
return int(np.random.randint(0, high=high))
def rand() -> float:
"""Generate one floating-point number in range [0 1)
This is a helper function for random data augmentaiton
"""
return float(np.random.rand(1))
def depth_augment(y: array,
choices: List=['int8', 'int16'],
probs: List[float]=[0.5, 0.5]) -> array:
""" Audio depth augmentation
Do audio depth augmentation to simulate the distortion brought by quantization.
"""
assert len(probs) == len(
choices
), 'number of choices {} must be equal to size of probs {}'.format(
len(choices), len(probs))
depth = np.random.choice(choices, p=probs)
src_depth = y.dtype
y1 = depth_convert(y, depth)
y2 = depth_convert(y1, src_depth)
return y2
def adaptive_spect_augment(spect: array, tempo_axis: int=0,
level: float=0.1) -> array:
"""Do adpative spectrogram augmentation
The level of the augmentation is gowern by the paramter level,
ranging from 0 to 1, with 0 represents no augmentation。
"""
assert spect.ndim == 2., 'only supports 2d tensor or numpy array'
if tempo_axis == 0:
nt, nf = spect.shape
else:
nf, nt = spect.shape
time_mask_width = int(nt * level * 0.5)
freq_mask_width = int(nf * level * 0.5)
num_time_mask = int(10 * level)
num_freq_mask = int(10 * level)
if tempo_axis == 0:
for _ in range(num_time_mask):
start = randint(nt - time_mask_width)
spect[start:start + time_mask_width, :] = 0
for _ in range(num_freq_mask):
start = randint(nf - freq_mask_width)
spect[:, start:start + freq_mask_width] = 0
else:
for _ in range(num_time_mask):
start = randint(nt - time_mask_width)
spect[:, start:start + time_mask_width] = 0
for _ in range(num_freq_mask):
start = randint(nf - freq_mask_width)
spect[start:start + freq_mask_width, :] = 0
return spect
def spect_augment(spect: array,
tempo_axis: int=0,
max_time_mask: int=3,
max_freq_mask: int=3,
max_time_mask_width: int=30,
max_freq_mask_width: int=20) -> array:
"""Do spectrogram augmentation in both time and freq axis
Reference:
"""
assert spect.ndim == 2., 'only supports 2d tensor or numpy array'
if tempo_axis == 0:
nt, nf = spect.shape
else:
nf, nt = spect.shape
num_time_mask = randint(max_time_mask)
num_freq_mask = randint(max_freq_mask)
time_mask_width = randint(max_time_mask_width)
freq_mask_width = randint(max_freq_mask_width)
if tempo_axis == 0:
for _ in range(num_time_mask):
start = randint(nt - time_mask_width)
spect[start:start + time_mask_width, :] = 0
for _ in range(num_freq_mask):
start = randint(nf - freq_mask_width)
spect[:, start:start + freq_mask_width] = 0
else:
for _ in range(num_time_mask):
start = randint(nt - time_mask_width)
spect[:, start:start + time_mask_width] = 0
for _ in range(num_freq_mask):
start = randint(nf - freq_mask_width)
spect[start:start + freq_mask_width, :] = 0
return spect
def random_crop1d(y: array, crop_len: int) -> array:
""" Do random cropping on 1d input signal
The input is a 1d signal, typically a sound waveform
"""
if y.ndim != 1:
'only accept 1d tensor or numpy array'
n = len(y)
idx = randint(n - crop_len)
return y[idx:idx + crop_len]
def random_crop2d(s: array, crop_len: int, tempo_axis: int=0) -> array:
""" Do random cropping for 2D array, typically a spectrogram.
The cropping is done in temporal direction on the time-freq input signal.
"""
if tempo_axis >= s.ndim:
raise ParameterError('axis out of range')
n = s.shape[tempo_axis]
idx = randint(high=n - crop_len)
sli = [slice(None) for i in range(s.ndim)]
sli[tempo_axis] = slice(idx, idx + crop_len)
out = s[tuple(sli)]
return out
...@@ -15,10 +15,3 @@ from .esc50 import ESC50 ...@@ -15,10 +15,3 @@ from .esc50 import ESC50
from .gtzan import GTZAN from .gtzan import GTZAN
from .tess import TESS from .tess import TESS
from .urban_sound import UrbanSound8K from .urban_sound import UrbanSound8K
__all__ = [
'ESC50',
'UrbanSound8K',
'GTZAN',
'TESS',
]
...@@ -17,8 +17,8 @@ import numpy as np ...@@ -17,8 +17,8 @@ import numpy as np
import paddle import paddle
from ..backends import load as load_audio from ..backends import load as load_audio
from ..features import melspectrogram from ..compliance.librosa import melspectrogram
from ..features import mfcc from ..compliance.librosa import mfcc
feat_funcs = { feat_funcs = {
'raw': None, 'raw': None,
......
...@@ -11,6 +11,7 @@ ...@@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from .augment import * from .layers import LogMelSpectrogram
from .core import * from .layers import MelSpectrogram
from .spectrum import * from .layers import MFCC
from .layers import Spectrogram
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .functional import compute_fbank_matrix
from .functional import create_dct
from .functional import fft_frequencies
from .functional import hz_to_mel
from .functional import mel_frequencies
from .functional import mel_to_hz
from .functional import power_to_db
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from librosa(https://github.com/librosa/librosa)
import math
from typing import Optional
from typing import Union
import paddle
__all__ = [
'hz_to_mel',
'mel_to_hz',
'mel_frequencies',
'fft_frequencies',
'compute_fbank_matrix',
'power_to_db',
'create_dct',
]
def hz_to_mel(freq: Union[paddle.Tensor, float],
htk: bool=False) -> Union[paddle.Tensor, float]:
"""Convert Hz to Mels.
Parameters:
freq: the input tensor of arbitrary shape, or a single floating point number.
htk: use HTK formula to do the conversion.
The default value is False.
Returns:
The frequencies represented in Mel-scale.
"""
if htk:
if isinstance(freq, paddle.Tensor):
return 2595.0 * paddle.log10(1.0 + freq / 700.0)
else:
return 2595.0 * math.log10(1.0 + freq / 700.0)
# Fill in the linear part
f_min = 0.0
f_sp = 200.0 / 3
mels = (freq - f_min) / f_sp
# Fill in the log-scale part
min_log_hz = 1000.0 # beginning of log region (Hz)
min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels)
logstep = math.log(6.4) / 27.0 # step size for log region
if isinstance(freq, paddle.Tensor):
target = min_log_mel + paddle.log(
freq / min_log_hz + 1e-10) / logstep # prevent nan with 1e-10
mask = (freq > min_log_hz).astype(freq.dtype)
mels = target * mask + mels * (
1 - mask) # will replace by masked_fill OP in future
else:
if freq >= min_log_hz:
mels = min_log_mel + math.log(freq / min_log_hz + 1e-10) / logstep
return mels
def mel_to_hz(mel: Union[float, paddle.Tensor],
htk: bool=False) -> Union[float, paddle.Tensor]:
"""Convert mel bin numbers to frequencies.
Parameters:
mel: the mel frequency represented as a tensor of arbitrary shape, or a floating point number.
htk: use HTK formula to do the conversion.
Returns:
The frequencies represented in hz.
"""
if htk:
return 700.0 * (10.0**(mel / 2595.0) - 1.0)
f_min = 0.0
f_sp = 200.0 / 3
freqs = f_min + f_sp * mel
# And now the nonlinear scale
min_log_hz = 1000.0 # beginning of log region (Hz)
min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels)
logstep = math.log(6.4) / 27.0 # step size for log region
if isinstance(mel, paddle.Tensor):
target = min_log_hz * paddle.exp(logstep * (mel - min_log_mel))
mask = (mel > min_log_mel).astype(mel.dtype)
freqs = target * mask + freqs * (
1 - mask) # will replace by masked_fill OP in future
else:
if mel >= min_log_mel:
freqs = min_log_hz * math.exp(logstep * (mel - min_log_mel))
return freqs
def mel_frequencies(n_mels: int=64,
f_min: float=0.0,
f_max: float=11025.0,
htk: bool=False,
dtype: str=paddle.float32):
"""Compute mel frequencies.
Parameters:
n_mels(int): number of Mel bins.
f_min(float): the lower cut-off frequency, below which the filter response is zero.
f_max(float): the upper cut-off frequency, above which the filter response is zero.
htk(bool): whether to use htk formula.
dtype(str): the datatype of the return frequencies.
Returns:
The frequencies represented in Mel-scale
"""
# 'Center freqs' of mel bands - uniformly spaced between limits
min_mel = hz_to_mel(f_min, htk=htk)
max_mel = hz_to_mel(f_max, htk=htk)
mels = paddle.linspace(min_mel, max_mel, n_mels, dtype=dtype)
freqs = mel_to_hz(mels, htk=htk)
return freqs
def fft_frequencies(sr: int, n_fft: int, dtype: str=paddle.float32):
"""Compute fourier frequencies.
Parameters:
sr(int): the audio sample rate.
n_fft(float): the number of fft bins.
dtype(str): the datatype of the return frequencies.
Returns:
The frequencies represented in hz.
"""
return paddle.linspace(0, float(sr) / 2, int(1 + n_fft // 2), dtype=dtype)
def compute_fbank_matrix(sr: int,
n_fft: int,
n_mels: int=64,
f_min: float=0.0,
f_max: Optional[float]=None,
htk: bool=False,
norm: Union[str, float]='slaney',
dtype: str=paddle.float32):
"""Compute fbank matrix.
Parameters:
sr(int): the audio sample rate.
n_fft(int): the number of fft bins.
n_mels(int): the number of Mel bins.
f_min(float): the lower cut-off frequency, below which the filter response is zero.
f_max(float): the upper cut-off frequency, above which the filter response is zero.
htk: whether to use htk formula.
return_complex(bool): whether to return complex matrix. If True, the matrix will
be complex type. Otherwise, the real and image part will be stored in the last
axis of returned tensor.
dtype(str): the datatype of the returned fbank matrix.
Returns:
The fbank matrix of shape (n_mels, int(1+n_fft//2)).
Shape:
output: (n_mels, int(1+n_fft//2))
"""
if f_max is None:
f_max = float(sr) / 2
# Initialize the weights
weights = paddle.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype)
# Center freqs of each FFT bin
fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft, dtype=dtype)
# 'Center freqs' of mel bands - uniformly spaced between limits
mel_f = mel_frequencies(
n_mels + 2, f_min=f_min, f_max=f_max, htk=htk, dtype=dtype)
fdiff = mel_f[1:] - mel_f[:-1] #np.diff(mel_f)
ramps = mel_f.unsqueeze(1) - fftfreqs.unsqueeze(0)
#ramps = np.subtract.outer(mel_f, fftfreqs)
for i in range(n_mels):
# lower and upper slopes for all bins
lower = -ramps[i] / fdiff[i]
upper = ramps[i + 2] / fdiff[i + 1]
# .. then intersect them with each other and zero
weights[i] = paddle.maximum(
paddle.zeros_like(lower), paddle.minimum(lower, upper))
# Slaney-style mel is scaled to be approx constant energy per channel
if norm == 'slaney':
enorm = 2.0 / (mel_f[2:n_mels + 2] - mel_f[:n_mels])
weights *= enorm.unsqueeze(1)
elif isinstance(norm, int) or isinstance(norm, float):
weights = paddle.nn.functional.normalize(weights, p=norm, axis=-1)
return weights
def power_to_db(magnitude: paddle.Tensor,
ref_value: float=1.0,
amin: float=1e-10,
top_db: Optional[float]=None) -> paddle.Tensor:
"""Convert a power spectrogram (amplitude squared) to decibel (dB) units.
The function computes the scaling ``10 * log10(x / ref)`` in a numerically
stable way.
Parameters:
magnitude(Tensor): the input magnitude tensor of any shape.
ref_value(float): the reference value. If smaller than 1.0, the db level
of the signal will be pulled up accordingly. Otherwise, the db level
is pushed down.
amin(float): the minimum value of input magnitude, below which the input
magnitude is clipped(to amin).
top_db(float): the maximum db value of resulting spectrum, above which the
spectrum is clipped(to top_db).
Returns:
The spectrogram in log-scale.
shape:
input: any shape
output: same as input
"""
if amin <= 0:
raise Exception("amin must be strictly positive")
if ref_value <= 0:
raise Exception("ref_value must be strictly positive")
ones = paddle.ones_like(magnitude)
log_spec = 10.0 * paddle.log10(paddle.maximum(ones * amin, magnitude))
log_spec -= 10.0 * math.log10(max(ref_value, amin))
if top_db is not None:
if top_db < 0:
raise Exception("top_db must be non-negative")
log_spec = paddle.maximum(log_spec, ones * (log_spec.max() - top_db))
return log_spec
def create_dct(n_mfcc: int,
n_mels: int,
norm: Optional[str]='ortho',
dtype: Optional[str]=paddle.float32) -> paddle.Tensor:
"""Create a discrete cosine transform(DCT) matrix.
Parameters:
n_mfcc (int): Number of mel frequency cepstral coefficients.
n_mels (int): Number of mel filterbanks.
norm (str, optional): Normalizaiton type. Defaults to 'ortho'.
Returns:
Tensor: The DCT matrix with shape (n_mels, n_mfcc).
"""
n = paddle.arange(n_mels, dtype=dtype)
k = paddle.arange(n_mfcc, dtype=dtype).unsqueeze(1)
dct = paddle.cos(math.pi / float(n_mels) * (n + 0.5) *
k) # size (n_mfcc, n_mels)
if norm is None:
dct *= 2.0
else:
assert norm == "ortho"
dct[0] *= 1.0 / math.sqrt(2.0)
dct *= math.sqrt(2.0 / float(n_mels))
return dct.T
...@@ -20,6 +20,19 @@ from paddle import Tensor ...@@ -20,6 +20,19 @@ from paddle import Tensor
__all__ = [ __all__ = [
'get_window', 'get_window',
# windows
'taylor',
'hamming',
'hann',
'tukey',
'kaiser',
'gaussian',
'exponential',
'triang',
'bohman',
'blackman',
'cosine',
] ]
...@@ -73,6 +86,21 @@ def general_gaussian(M: int, p, sig, sym: bool=True, ...@@ -73,6 +86,21 @@ def general_gaussian(M: int, p, sig, sym: bool=True,
return _truncate(w, needs_trunc) return _truncate(w, needs_trunc)
def general_cosine(M: int, a: float, sym: bool=True,
dtype: str='float64') -> Tensor:
"""Compute a generic weighted sum of cosine terms window.
This function is consistent with scipy.signal.windows.general_cosine().
"""
if _len_guards(M):
return paddle.ones((M, ), dtype=dtype)
M, needs_trunc = _extend(M, sym)
fac = paddle.linspace(-math.pi, math.pi, M, dtype=dtype)
w = paddle.zeros((M, ), dtype=dtype)
for k in range(len(a)):
w += a[k] * paddle.cos(k * fac)
return _truncate(w, needs_trunc)
def general_hamming(M: int, alpha: float, sym: bool=True, def general_hamming(M: int, alpha: float, sym: bool=True,
dtype: str='float64') -> Tensor: dtype: str='float64') -> Tensor:
"""Compute a generalized Hamming window. """Compute a generalized Hamming window.
...@@ -143,21 +171,6 @@ def taylor(M: int, ...@@ -143,21 +171,6 @@ def taylor(M: int,
return _truncate(w, needs_trunc) return _truncate(w, needs_trunc)
def general_cosine(M: int, a: float, sym: bool=True,
dtype: str='float64') -> Tensor:
"""Compute a generic weighted sum of cosine terms window.
This function is consistent with scipy.signal.windows.general_cosine().
"""
if _len_guards(M):
return paddle.ones((M, ), dtype=dtype)
M, needs_trunc = _extend(M, sym)
fac = paddle.linspace(-math.pi, math.pi, M, dtype=dtype)
w = paddle.zeros((M, ), dtype=dtype)
for k in range(len(a)):
w += a[k] * paddle.cos(k * fac)
return _truncate(w, needs_trunc)
def hamming(M: int, sym: bool=True, dtype: str='float64') -> Tensor: def hamming(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
"""Compute a Hamming window. """Compute a Hamming window.
The Hamming window is a taper formed by using a raised cosine with The Hamming window is a taper formed by using a raised cosine with
...@@ -375,6 +388,7 @@ def cosine(M: int, sym: bool=True, dtype: str='float64') -> Tensor: ...@@ -375,6 +388,7 @@ def cosine(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
return _truncate(w, needs_trunc) return _truncate(w, needs_trunc)
## factory function
def get_window(window: Union[str, Tuple[str, float]], def get_window(window: Union[str, Tuple[str, float]],
win_length: int, win_length: int,
fftbins: bool=True, fftbins: bool=True,
......
...@@ -11,4 +11,3 @@ ...@@ -11,4 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from .audio import *
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .dtw import dtw_distance
from .mcd import mcd_distance
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from dtaidistance import dtw_ndim
__all__ = [
'dtw_distance',
]
def dtw_distance(xs: np.ndarray, ys: np.ndarray) -> float:
"""dtw distance
Dynamic Time Warping.
This function keeps a compact matrix, not the full warping paths matrix.
Uses dynamic programming to compute:
wps[i, j] = (s1[i]-s2[j])**2 + min(
wps[i-1, j ] + penalty, // vertical / insertion / expansion
wps[i , j-1] + penalty, // horizontal / deletion / compression
wps[i-1, j-1]) // diagonal / match
dtw = sqrt(wps[-1, -1])
Args:
xs (np.ndarray): ref sequence, [T,D]
ys (np.ndarray): hyp sequence, [T,D]
Returns:
float: dtw distance
"""
return dtw_ndim.distance(xs, ys)
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import mcd.metrics_fast as mt
import numpy as np
from mcd import dtw
__all__ = [
'mcd_distance',
]
def mcd_distance(xs: np.ndarray, ys: np.ndarray, cost_fn=mt.logSpecDbDist):
"""Mel cepstral distortion (MCD), dtw distance.
Dynamic Time Warping.
Uses dynamic programming to compute:
wps[i, j] = cost_fn(xs[i], ys[j]) + min(
wps[i-1, j ], // vertical / insertion / expansion
wps[i , j-1], // horizontal / deletion / compression
wps[i-1, j-1]) // diagonal / match
dtw = sqrt(wps[-1, -1])
Cost Function:
logSpecDbConst = 10.0 / math.log(10.0) * math.sqrt(2.0)
def logSpecDbDist(x, y):
diff = x - y
return logSpecDbConst * math.sqrt(np.inner(diff, diff))
Args:
xs (np.ndarray): ref sequence, [T,D]
ys (np.ndarray): hyp sequence, [T,D]
Returns:
float: dtw distance
"""
min_cost, path = dtw.dtw(xs, ys, cost_fn)
return min_cost
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
...@@ -11,8 +11,15 @@ ...@@ -11,8 +11,15 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from .download import * from .download import decompress
from .env import * from .download import download_and_decompress
from .error import * from .download import load_state_dict_from_url
from .log import * from .env import DATA_HOME
from .time import * from .env import MODEL_HOME
from .env import PPAUDIO_HOME
from .env import USER_HOME
from .error import ParameterError
from .log import Logger
from .log import logger
from .time import seconds_to_hms
from .time import Timer
...@@ -22,6 +22,12 @@ from .log import logger ...@@ -22,6 +22,12 @@ from .log import logger
download.logger = logger download.logger = logger
__all__ = [
'decompress',
'download_and_decompress',
'load_state_dict_from_url',
]
def decompress(file: str): def decompress(file: str):
""" """
......
...@@ -20,6 +20,13 @@ PPAUDIO_HOME --> the root directory for storing PaddleAudio related data. D ...@@ -20,6 +20,13 @@ PPAUDIO_HOME --> the root directory for storing PaddleAudio related data. D
''' '''
import os import os
__all__ = [
'USER_HOME',
'PPAUDIO_HOME',
'MODEL_HOME',
'DATA_HOME',
]
def _get_user_home(): def _get_user_home():
return os.path.expanduser('~') return os.path.expanduser('~')
......
...@@ -19,7 +19,10 @@ import time ...@@ -19,7 +19,10 @@ import time
import colorlog import colorlog
loggers = {} __all__ = [
'Logger',
'logger',
]
log_config = { log_config = {
'DEBUG': { 'DEBUG': {
......
...@@ -14,6 +14,11 @@ ...@@ -14,6 +14,11 @@
import math import math
import time import time
__all__ = [
'Timer',
'seconds_to_hms',
]
class Timer(object): class Timer(object):
'''Calculate runing speed and estimated time of arrival(ETA)''' '''Calculate runing speed and estimated time of arrival(ETA)'''
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
import setuptools import setuptools
# set the version here # set the version here
VERSION = '0.1.0' VERSION = '0.2.0'
def write_version_py(filename='paddleaudio/__init__.py'): def write_version_py(filename='paddleaudio/__init__.py'):
...@@ -59,6 +59,8 @@ setuptools.setup( ...@@ -59,6 +59,8 @@ setuptools.setup(
'resampy >= 0.2.2', 'resampy >= 0.2.2',
'soundfile >= 0.9.0', 'soundfile >= 0.9.0',
'colorlog', 'colorlog',
'dtaidistance >= 2.3.6',
'mcd >= 0.4',
], ) ], )
remove_version_py() remove_version_py()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册