提交 8dcaef9a 编写于 作者: K KP

Add paddleaudio doc.

上级 0b427057
...@@ -11,6 +11,7 @@ ...@@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import os
import warnings import warnings
from typing import Optional from typing import Optional
from typing import Tuple from typing import Tuple
...@@ -19,7 +20,6 @@ from typing import Union ...@@ -19,7 +20,6 @@ from typing import Union
import numpy as np import numpy as np
import resampy import resampy
import soundfile as sf import soundfile as sf
from numpy import ndarray as array
from scipy.io import wavfile from scipy.io import wavfile
from ..utils import ParameterError from ..utils import ParameterError
...@@ -38,13 +38,21 @@ RESAMPLE_MODES = ['kaiser_best', 'kaiser_fast'] ...@@ -38,13 +38,21 @@ RESAMPLE_MODES = ['kaiser_best', 'kaiser_fast']
EPS = 1e-8 EPS = 1e-8
def resample(y: array, src_sr: int, target_sr: int, def resample(y: np.ndarray,
mode: str='kaiser_fast') -> array: src_sr: int,
""" Audio resampling target_sr: int,
This function is the same as using resampy.resample(). mode: str='kaiser_fast') -> np.ndarray:
Notes: """Audio resampling.
The default mode is kaiser_fast. For better audio quality, use mode = 'kaiser_best'
""" Args:
y (np.ndarray): Input waveform array in 1D or 2D.
src_sr (int): Source sample rate.
target_sr (int): Target sample rate.
mode (str, optional): The resampling filter to use. Defaults to 'kaiser_fast'.
Returns:
np.ndarray: `y` resampled to `target_sr`
"""
if mode == 'kaiser_best': if mode == 'kaiser_best':
warnings.warn( warnings.warn(
...@@ -53,7 +61,7 @@ def resample(y: array, src_sr: int, target_sr: int, ...@@ -53,7 +61,7 @@ def resample(y: array, src_sr: int, target_sr: int,
if not isinstance(y, np.ndarray): if not isinstance(y, np.ndarray):
raise ParameterError( raise ParameterError(
'Only support numpy array, but received y in {type(y)}') 'Only support numpy np.ndarray, but received y in {type(y)}')
if mode not in RESAMPLE_MODES: if mode not in RESAMPLE_MODES:
raise ParameterError(f'resample mode must in {RESAMPLE_MODES}') raise ParameterError(f'resample mode must in {RESAMPLE_MODES}')
...@@ -61,9 +69,17 @@ def resample(y: array, src_sr: int, target_sr: int, ...@@ -61,9 +69,17 @@ def resample(y: array, src_sr: int, target_sr: int,
return resampy.resample(y, src_sr, target_sr, filter=mode) return resampy.resample(y, src_sr, target_sr, filter=mode)
def to_mono(y: array, merge_type: str='average') -> array: def to_mono(y: np.ndarray, merge_type: str='average') -> np.ndarray:
""" convert stereo audio to mono """Convert stereo audio to mono.
Args:
y (np.ndarray): Input waveform array in 1D or 2D.
merge_type (str, optional): Merge type to generate mono waveform. Defaults to 'average'.
Returns:
np.ndarray: `y` with mono channel.
""" """
if merge_type not in MERGE_TYPES: if merge_type not in MERGE_TYPES:
raise ParameterError( raise ParameterError(
f'Unsupported merge type {merge_type}, available types are {MERGE_TYPES}' f'Unsupported merge type {merge_type}, available types are {MERGE_TYPES}'
...@@ -101,18 +117,34 @@ def to_mono(y: array, merge_type: str='average') -> array: ...@@ -101,18 +117,34 @@ def to_mono(y: array, merge_type: str='average') -> array:
return y_out return y_out
def _safe_cast(y: array, dtype: Union[type, str]) -> array: def _safe_cast(y: np.ndarray, dtype: Union[type, str]) -> np.ndarray:
""" data type casting in a safe way, i.e., prevent overflow or underflow """Data type casting in a safe way, i.e., prevent overflow or underflow.
This function is used internally.
Args:
y (np.ndarray): Input waveform array in 1D or 2D.
dtype (Union[type, str]): Data type of waveform.
Returns:
np.ndarray: `y` after safe casting.
""" """
return np.clip(y, np.iinfo(dtype).min, np.iinfo(dtype).max).astype(dtype) if 'float' in str(y.dtype):
return np.clip(y, np.finfo(dtype).min,
np.finfo(dtype).max).astype(dtype)
else:
return np.clip(y, np.iinfo(dtype).min,
np.iinfo(dtype).max).astype(dtype)
def depth_convert(y: array, dtype: Union[type, str], def depth_convert(y: np.ndarray, dtype: Union[type, str]) -> np.ndarray:
dithering: bool=True) -> array: """Convert audio array to target dtype safely. This function converts an audio waveform to a target dtype, with additional steps of
"""Convert audio array to target dtype safely
This function converts an audio waveform to a target dtype, with additional steps of
preventing overflow/underflow and preserving audio range. preventing overflow/underflow and preserving audio range.
Args:
y (np.ndarray): Input waveform array in 1D or 2D.
dtype (Union[type, str]): Data type of waveform.
Returns:
np.ndarray: `y` after safe casting.
""" """
SUPPORT_DTYPE = ['int16', 'int8', 'float32', 'float64'] SUPPORT_DTYPE = ['int16', 'int8', 'float32', 'float64']
...@@ -157,14 +189,20 @@ def depth_convert(y: array, dtype: Union[type, str], ...@@ -157,14 +189,20 @@ def depth_convert(y: array, dtype: Union[type, str],
return y return y
def sound_file_load(file: str, def sound_file_load(file: os.PathLike,
offset: Optional[float]=None, offset: Optional[float]=None,
dtype: str='int16', dtype: str='int16',
duration: Optional[int]=None) -> Tuple[array, int]: duration: Optional[int]=None) -> Tuple[np.ndarray, int]:
"""Load audio using soundfile library """Load audio using soundfile library. This function loads an audio file using libsndfile.
This function loads an audio file using libsndfile.
Reference: Args:
http://www.mega-nerd.com/libsndfile/#Features file (os.PathLike): File of waveform.
offset (Optional[float], optional): Offset to the start of waveform. Defaults to None.
dtype (str, optional): Data type of waveform. Defaults to 'int16'.
duration (Optional[int], optional): Duration of waveform to read. Defaults to None.
Returns:
Tuple[np.ndarray, int]: Waveform in ndarray and its samplerate.
""" """
with sf.SoundFile(file) as sf_desc: with sf.SoundFile(file) as sf_desc:
sr_native = sf_desc.samplerate sr_native = sf_desc.samplerate
...@@ -179,9 +217,17 @@ def sound_file_load(file: str, ...@@ -179,9 +217,17 @@ def sound_file_load(file: str,
return y, sf_desc.samplerate return y, sf_desc.samplerate
def normalize(y: array, norm_type: str='linear', def normalize(y: np.ndarray, norm_type: str='linear',
mul_factor: float=1.0) -> array: mul_factor: float=1.0) -> np.ndarray:
""" normalize an input audio with additional multiplier. """Normalize an input audio with additional multiplier.
Args:
y (np.ndarray): Input waveform array in 1D or 2D.
norm_type (str, optional): Type of normalization. Defaults to 'linear'.
mul_factor (float, optional): Scaling factor. Defaults to 1.0.
Returns:
np.ndarray: `y` after normalization.
""" """
if norm_type == 'linear': if norm_type == 'linear':
...@@ -199,12 +245,13 @@ def normalize(y: array, norm_type: str='linear', ...@@ -199,12 +245,13 @@ def normalize(y: array, norm_type: str='linear',
return y return y
def save(y: array, sr: int, file: str) -> None: def save(y: np.ndarray, sr: int, file: os.PathLike) -> None:
"""Save audio file to disk. """Save audio file to disk. This function saves audio to disk using scipy.io.wavfile, with additional step to convert input waveform to int16.
This function saves audio to disk using scipy.io.wavfile, with additional step
to convert input waveform to int16 unless it already is int16 Args:
Notes: y (np.ndarray): Input waveform array in 1D or 2D.
It only supports raw wav format. sr (int): Sample rate.
file (os.PathLike): Path of audio file to save.
""" """
if not file.endswith('.wav'): if not file.endswith('.wav'):
raise ParameterError( raise ParameterError(
...@@ -226,7 +273,7 @@ def save(y: array, sr: int, file: str) -> None: ...@@ -226,7 +273,7 @@ def save(y: array, sr: int, file: str) -> None:
def load( def load(
file: str, file: os.PathLike,
sr: Optional[int]=None, sr: Optional[int]=None,
mono: bool=True, mono: bool=True,
merge_type: str='average', # ch0,ch1,random,average merge_type: str='average', # ch0,ch1,random,average
...@@ -236,11 +283,24 @@ def load( ...@@ -236,11 +283,24 @@ def load(
offset: float=0.0, offset: float=0.0,
duration: Optional[int]=None, duration: Optional[int]=None,
dtype: str='float32', dtype: str='float32',
resample_mode: str='kaiser_fast') -> Tuple[array, int]: resample_mode: str='kaiser_fast') -> Tuple[np.ndarray, int]:
"""Load audio file from disk. """Load audio file from disk. This function loads audio from disk using the audio backend.
This function loads audio from disk using the audio backend.
Parameters: Args:
Notes: file (os.PathLike): Path of audio file to load.
sr (Optional[int], optional): Sample rate of loaded waveform. Defaults to None.
mono (bool, optional): Return waveform with mono channel. Defaults to True.
merge_type (str, optional): Merge type of multi-channels waveform. Defaults to 'average'.
normal (bool, optional): Waveform normalization. Defaults to True.
norm_type (str, optional): Type of normalization. Defaults to 'linear'.
norm_mul_factor (float, optional): Scaling factor. Defaults to 1.0.
offset (float, optional): Offset to the start of waveform. Defaults to 0.0.
duration (Optional[int], optional): Duration of waveform to read. Defaults to None.
dtype (str, optional): Data type of waveform. Defaults to 'float32'.
resample_mode (str, optional): The resampling filter to use. Defaults to 'kaiser_fast'.
Returns:
Tuple[np.ndarray, int]: Waveform in ndarray and its samplerate.
""" """
y, r = sound_file_load(file, offset=offset, dtype=dtype, duration=duration) y, r = sound_file_load(file, offset=offset, dtype=dtype, duration=duration)
......
...@@ -220,7 +220,7 @@ def spectrogram(waveform: Tensor, ...@@ -220,7 +220,7 @@ def spectrogram(waveform: Tensor,
"""Compute and return a spectrogram from a waveform. The output is identical to Kaldi's. """Compute and return a spectrogram from a waveform. The output is identical to Kaldi's.
Args: Args:
waveform (Tensor): A waveform tensor with shape [C, T]. waveform (Tensor): A waveform tensor with shape `(C, T)`.
blackman_coeff (float, optional): Coefficient for Blackman window. Defaults to 0.42. blackman_coeff (float, optional): Coefficient for Blackman window. Defaults to 0.42.
channel (int, optional): Select the channel of waveform. Defaults to -1. channel (int, optional): Select the channel of waveform. Defaults to -1.
dither (float, optional): Dithering constant. Defaults to 0.0. dither (float, optional): Dithering constant. Defaults to 0.0.
...@@ -239,7 +239,7 @@ def spectrogram(waveform: Tensor, ...@@ -239,7 +239,7 @@ def spectrogram(waveform: Tensor,
window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY. window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY.
Returns: Returns:
Tensor: A spectrogram tensor with shape (m, padded_window_size // 2 + 1) where m is the number of frames Tensor: A spectrogram tensor with shape `(m, padded_window_size // 2 + 1)` where m is the number of frames
depends on frame_length and frame_shift. depends on frame_length and frame_shift.
""" """
dtype = waveform.dtype dtype = waveform.dtype
...@@ -422,7 +422,7 @@ def fbank(waveform: Tensor, ...@@ -422,7 +422,7 @@ def fbank(waveform: Tensor,
"""Compute and return filter banks from a waveform. The output is identical to Kaldi's. """Compute and return filter banks from a waveform. The output is identical to Kaldi's.
Args: Args:
waveform (Tensor): A waveform tensor with shape [C, T]. waveform (Tensor): A waveform tensor with shape `(C, T)`.
blackman_coeff (float, optional): Coefficient for Blackman window. Defaults to 0.42. blackman_coeff (float, optional): Coefficient for Blackman window. Defaults to 0.42.
channel (int, optional): Select the channel of waveform. Defaults to -1. channel (int, optional): Select the channel of waveform. Defaults to -1.
dither (float, optional): Dithering constant. Defaults to 0.0. dither (float, optional): Dithering constant. Defaults to 0.0.
...@@ -451,7 +451,7 @@ def fbank(waveform: Tensor, ...@@ -451,7 +451,7 @@ def fbank(waveform: Tensor,
window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY. window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY.
Returns: Returns:
Tensor: A filter banks tensor with shape (m, n_mels). Tensor: A filter banks tensor with shape `(m, n_mels)`.
""" """
dtype = waveform.dtype dtype = waveform.dtype
...@@ -542,7 +542,7 @@ def mfcc(waveform: Tensor, ...@@ -542,7 +542,7 @@ def mfcc(waveform: Tensor,
identical to Kaldi's. identical to Kaldi's.
Args: Args:
waveform (Tensor): A waveform tensor with shape [C, T]. waveform (Tensor): A waveform tensor with shape `(C, T)`.
blackman_coeff (float, optional): Coefficient for Blackman window. Defaults to 0.42. blackman_coeff (float, optional): Coefficient for Blackman window. Defaults to 0.42.
cepstral_lifter (float, optional): Scaling of output mfccs. Defaults to 22.0. cepstral_lifter (float, optional): Scaling of output mfccs. Defaults to 22.0.
channel (int, optional): Select the channel of waveform. Defaults to -1. channel (int, optional): Select the channel of waveform. Defaults to -1.
...@@ -571,7 +571,7 @@ def mfcc(waveform: Tensor, ...@@ -571,7 +571,7 @@ def mfcc(waveform: Tensor,
window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY. window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY.
Returns: Returns:
Tensor: A mel frequency cepstral coefficients tensor with shape (m, n_mfcc). Tensor: A mel frequency cepstral coefficients tensor with shape `(m, n_mfcc)`.
""" """
assert n_mfcc <= n_mels, 'n_mfcc cannot be larger than n_mels: %d vs %d' % ( assert n_mfcc <= n_mels, 'n_mfcc cannot be larger than n_mels: %d vs %d' % (
n_mfcc, n_mels) n_mfcc, n_mels)
......
...@@ -44,29 +44,16 @@ class Spectrogram(nn.Layer): ...@@ -44,29 +44,16 @@ class Spectrogram(nn.Layer):
"""Compute spectrogram of a given signal, typically an audio waveform. """Compute spectrogram of a given signal, typically an audio waveform.
The spectrogram is defined as the complex norm of the short-time The spectrogram is defined as the complex norm of the short-time
Fourier transformation. Fourier transformation.
Parameters:
n_fft (int): the number of frequency components of the discrete Fourier transform. Args:
The default value is 2048, n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512.
hop_length (int|None): the hop length of the short time FFT. If None, it is set to win_length//4. hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None.
The default value is None. win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None.
win_length: the window length of the short time FFT. If None, it is set to same as n_fft. window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
The default value is None. power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
window (str): the name of the window function applied to the signal before the Fourier transform. center (bool, optional): Whether to pad `x` so that the `t`-th frame is centered at :math:`t \times hop\_length`. Defaults to True.
The following window names are supported: 'hamming','hann','kaiser','gaussian', pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
'exponential','triang','bohman','blackman','cosine','tukey','taylor'. dtype (str, optional): Data type of input and window. Defaults to paddle.float32.
The default value is 'hann'
power (float): Exponent for the magnitude spectrogram. The default value is 2.0.
center (bool): if True, the signal is padded so that frame t is centered at x[t * hop_length].
If False, frame t begins at x[t * hop_length]
The default value is True
pad_mode (str): the mode to pad the signal if necessary. The supported modes are 'reflect'
and 'constant'. The default value is 'reflect'.
dtype (str): the data type of input and window.
Notes:
The Spectrogram transform relies on STFT transform to compute the spectrogram.
By default, the weights are not learnable. To fine-tune the Fourier coefficients,
set stop_gradient=False before training.
For more information, see STFT().
""" """
super(Spectrogram, self).__init__() super(Spectrogram, self).__init__()
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册