diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index b9801428201e6da9777563ef95f50f28e288211a..2cfcde0532a229fa118f135c79d1d8ab02619d33 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -80,6 +80,7 @@ import paddle.onnx # noqa: F401 import paddle.reader # noqa: F401 import paddle.static # noqa: F401 import paddle.vision # noqa: F401 +import paddle.audio # noqa: F401 import paddle.geometric # noqa: F401 from .tensor.attribute import is_complex # noqa: F401 diff --git a/python/paddle/audio/features/__init__.py b/python/paddle/audio/features/__init__.py index e6b005e501988cc1f466b6d3e0dc10383f0366aa..3c0bf499f1eff46f9f3b40f164c7cb66ecc39e75 100644 --- a/python/paddle/audio/features/__init__.py +++ b/python/paddle/audio/features/__init__.py @@ -11,7 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from .layers import LogMelSpectrogram -from .layers import MelSpectrogram -from .layers import MFCC -from .layers import Spectrogram +from .layers import LogMelSpectrogram # noqa: F401 +from .layers import MelSpectrogram # noqa: F401 +from .layers import MFCC # noqa: F401 +from .layers import Spectrogram # noqa: F401 + +__all__ = [ # noqa + 'LogMelSpectrogram', + 'MelSpectrogram', + 'MFCC', + 'Spectrogram', +] diff --git a/python/paddle/audio/features/layers.py b/python/paddle/audio/features/layers.py index cddb42635d6b034be1fe85e6e7e5f5a967e80ae5..d21a24d34241fec6e921f258b3ea26de8e124bfe 100644 --- a/python/paddle/audio/features/layers.py +++ b/python/paddle/audio/features/layers.py @@ -24,13 +24,6 @@ from ..functional import create_dct from ..functional import power_to_db from ..functional.window import get_window -__all__ = [ - 'Spectrogram', - 'MelSpectrogram', - 'LogMelSpectrogram', - 'MFCC', -] - class Spectrogram(nn.Layer): """Compute spectrogram of given signals, typically audio 
waveforms. @@ -45,6 +38,27 @@ class Spectrogram(nn.Layer): center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True. pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'. dtype (str, optional): Data type of input and window. Defaults to 'float32'. + + Returns: + :ref:`api_paddle_nn_Layer`. An instance of Spectrogram. + + + + Examples: + .. code-block:: python + + import paddle + from paddle.audio.features import Spectrogram + + sample_rate = 16000 + wav_duration = 0.5 + num_channels = 1 + num_frames = sample_rate * wav_duration + wav_data = paddle.linspace(-1.0, 1.0, num_frames) * 0.1 + waveform = wav_data.tile([num_channels, 1]) + + feature_extractor = Spectrogram(n_fft=512, window = 'hann', power = 1.0) + feats = feature_extractor(waveform) """ def __init__(self, @@ -108,6 +122,25 @@ class MelSpectrogram(nn.Layer): htk (bool, optional): Use HTK formula in computing fbank matrix. Defaults to False. norm (Union[str, float], optional): Type of normalization in computing fbank matrix. Slaney-style is used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization. Defaults to 'slaney'. dtype (str, optional): Data type of input and window. Defaults to 'float32'. + + Returns: + :ref:`api_paddle_nn_Layer`. An instance of MelSpectrogram. + + Examples: + .. code-block:: python + + import paddle + from paddle.audio.features import MelSpectrogram + + sample_rate = 16000 + wav_duration = 0.5 + num_channels = 1 + num_frames = sample_rate * wav_duration + wav_data = paddle.linspace(-1.0, 1.0, num_frames) * 0.1 + waveform = wav_data.tile([num_channels, 1]) + + feature_extractor = MelSpectrogram(sr=sample_rate, n_fft=512, window = 'hann', power = 1.0) + feats = feature_extractor(waveform) """ def __init__(self, @@ -186,6 +219,25 @@ class LogMelSpectrogram(nn.Layer): amin (float, optional): The minimum value of input magnitude. 
Defaults to 1e-10. top_db (Optional[float], optional): The maximum db value of spectrogram. Defaults to None. dtype (str, optional): Data type of input and window. Defaults to 'float32'. + + Returns: + :ref:`api_paddle_nn_Layer`. An instance of LogMelSpectrogram. + + Examples: + .. code-block:: python + + import paddle + from paddle.audio.features import LogMelSpectrogram + + sample_rate = 16000 + wav_duration = 0.5 + num_channels = 1 + num_frames = sample_rate * wav_duration + wav_data = paddle.linspace(-1.0, 1.0, num_frames) * 0.1 + waveform = wav_data.tile([num_channels, 1]) + + feature_extractor = LogMelSpectrogram(sr=sample_rate, n_fft=512, window = 'hann', power = 1.0) + feats = feature_extractor(waveform) """ def __init__(self, @@ -265,6 +317,25 @@ class MFCC(nn.Layer): amin (float, optional): The minimum value of input magnitude. Defaults to 1e-10. top_db (Optional[float], optional): The maximum db value of spectrogram. Defaults to None. dtype (str, optional): Data type of input and window. Defaults to 'float32'. + + Returns: + :ref:`api_paddle_nn_Layer`. An instance of MFCC. + + Examples: + .. code-block:: python + + import paddle + from paddle.audio.features import MFCC + + sample_rate = 16000 + wav_duration = 0.5 + num_channels = 1 + num_frames = sample_rate * wav_duration + wav_data = paddle.linspace(-1.0, 1.0, num_frames) * 0.1 + waveform = wav_data.tile([num_channels, 1]) + + feature_extractor = MFCC(sr=sample_rate, n_fft=512, window = 'hann') + feats = feature_extractor(waveform) """ def __init__(self, diff --git a/python/paddle/audio/functional/__init__.py b/python/paddle/audio/functional/__init__.py index 0216172db1400c0ca2403bbfa09672db2e5bc8d1..b7db53d6c22a6f1206e8f615252bc4ba73c4647b 100644 --- a/python/paddle/audio/functional/__init__.py +++ b/python/paddle/audio/functional/__init__.py @@ -11,11 +11,22 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -from .functional import compute_fbank_matrix -from .functional import create_dct -from .functional import fft_frequencies -from .functional import hz_to_mel -from .functional import mel_frequencies -from .functional import mel_to_hz -from .functional import power_to_db -from .window import get_window +from .functional import compute_fbank_matrix # noqa: F401 +from .functional import create_dct # noqa: F401 +from .functional import fft_frequencies # noqa: F401 +from .functional import hz_to_mel # noqa: F401 +from .functional import mel_frequencies # noqa: F401 +from .functional import mel_to_hz # noqa: F401 +from .functional import power_to_db # noqa: F401 +from .window import get_window # noqa: F401 + +__all__ = [ # noqa + 'compute_fbank_matrix', + 'create_dct', + 'fft_frequencies', + 'hz_to_mel', + 'mel_frequencies', + 'mel_to_hz', + 'power_to_db', + 'get_window', +] diff --git a/python/paddle/audio/functional/functional.py b/python/paddle/audio/functional/functional.py index 071fa6cac71ad9b0f81184c8a20e38c5055f0184..69feab436675795ba319a00bc1932db0ceeb68f9 100644 --- a/python/paddle/audio/functional/functional.py +++ b/python/paddle/audio/functional/functional.py @@ -19,16 +19,6 @@ from typing import Union import paddle from paddle import Tensor -__all__ = [ - 'hz_to_mel', - 'mel_to_hz', - 'mel_frequencies', - 'fft_frequencies', - 'compute_fbank_matrix', - 'power_to_db', - 'create_dct', -] - def hz_to_mel(freq: Union[Tensor, float], htk: bool = False) -> Union[Tensor, float]: @@ -40,6 +30,16 @@ def hz_to_mel(freq: Union[Tensor, float], Returns: Union[Tensor, float]: Frequency in mels. + + Examples: + .. 
code-block:: python + + import paddle + + val = 3.0 + htk_flag = True + mel_paddle_tensor = paddle.audio.functional.hz_to_mel( + paddle.to_tensor(val), htk_flag) """ if htk: @@ -83,6 +83,17 @@ def mel_to_hz(mel: Union[float, Tensor], Returns: Union[float, Tensor]: Frequencies in Hz. + + Examples: + .. code-block:: python + + import paddle + + val = 3.0 + htk_flag = True + mel_paddle_tensor = paddle.audio.functional.mel_to_hz( + paddle.to_tensor(val), htk_flag) + """ if htk: return 700.0 * (10.0**(mel / 2595.0) - 1.0) @@ -121,6 +132,19 @@ def mel_frequencies(n_mels: int = 64, Returns: Tensor: Tensor of n_mels frequencies in Hz with shape `(n_mels,)`. + + Examples: + .. code-block:: python + + import paddle + + n_mels = 64 + f_min = 0.5 + f_max = 10000 + htk_flag = True + + paddle_mel_freq = paddle.audio.functional.mel_frequencies( + n_mels, f_min, f_max, htk_flag, 'float64') """ # 'Center freqs' of mel bands - uniformly spaced between limits min_mel = hz_to_mel(f_min, htk=htk) @@ -140,6 +164,15 @@ def fft_frequencies(sr: int, n_fft: int, dtype: str = 'float32') -> Tensor: Returns: Tensor: FFT frequencies in Hz with shape `(n_fft//2 + 1,)`. + + Examples: + .. code-block:: python + + import paddle + + sr = 16000 + n_fft = 128 + fft_freq = paddle.audio.functional.fft_frequencies(sr, n_fft) """ return paddle.linspace(0, float(sr) / 2, int(1 + n_fft // 2), dtype=dtype) @@ -166,6 +199,15 @@ def compute_fbank_matrix(sr: int, Returns: Tensor: Mel transform matrix with shape `(n_mels, n_fft//2 + 1)`. + + Examples: + .. code-block:: python + + import paddle + + sr = 16000 + n_fft = 512 + fbank = paddle.audio.functional.compute_fbank_matrix(sr, n_fft) """ if f_max is None: @@ -221,6 +263,15 @@ def power_to_db(spect: Tensor, Returns: Tensor: Power spectrogram in db scale. + + Examples: + .. 
code-block:: python + + import paddle + + val = 3.0 + decibel_paddle = paddle.audio.functional.power_to_db( + paddle.to_tensor(val)) """ if amin <= 0: raise Exception("amin must be strictly positive") @@ -254,6 +305,14 @@ def create_dct(n_mfcc: int, Returns: Tensor: The DCT matrix with shape `(n_mels, n_mfcc)`. + + Examples: + .. code-block:: python + + import paddle + n_mfcc = 23 + n_mels = 257 + dct = paddle.audio.functional.create_dct(n_mfcc, n_mels) """ n = paddle.arange(n_mels, dtype=dtype) k = paddle.arange(n_mfcc, dtype=dtype).unsqueeze(1) diff --git a/python/paddle/audio/functional/window.py b/python/paddle/audio/functional/window.py index a4692dbc962df480eadb2a8db74a681159895304..17ccdce9ef663223ba778d26ebbc8df0a83fb02e 100644 --- a/python/paddle/audio/functional/window.py +++ b/python/paddle/audio/functional/window.py @@ -18,10 +18,6 @@ from typing import Union import paddle from paddle import Tensor -__all__ = [ - 'get_window', -] - def _cat(x: List[Tensor], data_type: str) -> Tensor: l = [paddle.to_tensor(_, data_type) for _ in x] @@ -323,6 +319,17 @@ def get_window(window: Union[str, Tuple[str, float]], Returns: Tensor: The window represented as a tensor. + + Examples: + .. 
code-block:: python + + import paddle + + n_fft = 512 + cosine_window = paddle.audio.functional.get_window('cosine', n_fft) + + std = 7 + gaussian_window = paddle.audio.functional.get_window(('gaussian', std), n_fft) """ sym = not fftbins diff --git a/python/paddle/tests/test_audio_functions.py b/python/paddle/tests/test_audio_functions.py index 200920e73123f2f24b170b1bcb947dfb251cbd2f..5766299f24307ab3bb1603a5d9f7ecfbf45bf8bb 100644 --- a/python/paddle/tests/test_audio_functions.py +++ b/python/paddle/tests/test_audio_functions.py @@ -136,7 +136,7 @@ class TestAudioFuncitons(unittest.TestCase): decimal=5) @parameterize([1, 512]) - def test_gussian_window_and_exception(self, n_fft: int): + def test_gaussian_window_and_exception(self, n_fft: int): window_scipy_gaussain = signal.windows.gaussian(n_fft, std=7) window_paddle_gaussian = paddle.audio.functional.get_window( ('gaussian', 7), n_fft, False)