未验证 提交 c7d2e82c 编写于 作者: Y YangZhou 提交者: GitHub

update audio api examples (#46938)

* update audio api examples

* fix format

* format

* fix

* test api

* fix format

* fix static check error

* fix doc error

* fix ci

* fix api error

* update api.spec

* fix ci

* fix typo in window gaussian
上级 5c0bfc18
...@@ -80,6 +80,7 @@ import paddle.onnx # noqa: F401 ...@@ -80,6 +80,7 @@ import paddle.onnx # noqa: F401
import paddle.reader # noqa: F401 import paddle.reader # noqa: F401
import paddle.static # noqa: F401 import paddle.static # noqa: F401
import paddle.vision # noqa: F401 import paddle.vision # noqa: F401
import paddle.audio # noqa: F401
import paddle.geometric # noqa: F401 import paddle.geometric # noqa: F401
from .tensor.attribute import is_complex # noqa: F401 from .tensor.attribute import is_complex # noqa: F401
......
...@@ -11,7 +11,14 @@ ...@@ -11,7 +11,14 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from .layers import LogMelSpectrogram from .layers import LogMelSpectrogram # noqa: F401
from .layers import MelSpectrogram from .layers import MelSpectrogram # noqa: F401
from .layers import MFCC from .layers import MFCC # noqa: F401
from .layers import Spectrogram from .layers import Spectrogram # noqa: F401
__all__ = [ # noqa
'LogMelSpectrogram',
'MelSpectrogram',
'MFCC',
'Spectrogram',
]
...@@ -24,13 +24,6 @@ from ..functional import create_dct ...@@ -24,13 +24,6 @@ from ..functional import create_dct
from ..functional import power_to_db from ..functional import power_to_db
from ..functional.window import get_window from ..functional.window import get_window
__all__ = [
'Spectrogram',
'MelSpectrogram',
'LogMelSpectrogram',
'MFCC',
]
class Spectrogram(nn.Layer): class Spectrogram(nn.Layer):
"""Compute spectrogram of given signals, typically audio waveforms. """Compute spectrogram of given signals, typically audio waveforms.
...@@ -45,6 +38,27 @@ class Spectrogram(nn.Layer): ...@@ -45,6 +38,27 @@ class Spectrogram(nn.Layer):
center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True. center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True.
pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'. pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
dtype (str, optional): Data type of input and window. Defaults to 'float32'. dtype (str, optional): Data type of input and window. Defaults to 'float32'.
Returns:
:ref:`api_paddle_nn_Layer`. An instance of Spectrogram.
Examples:
.. code-block:: python
import paddle
from paddle.audio.features import Spectrogram
sample_rate = 16000
wav_duration = 0.5
num_channels = 1
num_frames = sample_rate * wav_duration
wav_data = paddle.linspace(-1.0, 1.0, num_frames) * 0.1
waveform = wav_data.tile([num_channels, 1])
feature_extractor = Spectrogram(n_fft=512, window = 'hann', power = 1.0)
feats = feature_extractor(waveform)
""" """
def __init__(self, def __init__(self,
...@@ -108,6 +122,25 @@ class MelSpectrogram(nn.Layer): ...@@ -108,6 +122,25 @@ class MelSpectrogram(nn.Layer):
htk (bool, optional): Use HTK formula in computing fbank matrix. Defaults to False. htk (bool, optional): Use HTK formula in computing fbank matrix. Defaults to False.
norm (Union[str, float], optional): Type of normalization in computing fbank matrix. Slaney-style is used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization. Defaults to 'slaney'. norm (Union[str, float], optional): Type of normalization in computing fbank matrix. Slaney-style is used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization. Defaults to 'slaney'.
dtype (str, optional): Data type of input and window. Defaults to 'float32'. dtype (str, optional): Data type of input and window. Defaults to 'float32'.
Returns:
:ref:`api_paddle_nn_Layer`. An instance of MelSpectrogram.
Examples:
.. code-block:: python
import paddle
from paddle.audio.features import MelSpectrogram
sample_rate = 16000
wav_duration = 0.5
num_channels = 1
num_frames = sample_rate * wav_duration
wav_data = paddle.linspace(-1.0, 1.0, num_frames) * 0.1
waveform = wav_data.tile([num_channels, 1])
feature_extractor = MelSpectrogram(sr=sample_rate, n_fft=512, window = 'hann', power = 1.0)
feats = feature_extractor(waveform)
""" """
def __init__(self, def __init__(self,
...@@ -186,6 +219,25 @@ class LogMelSpectrogram(nn.Layer): ...@@ -186,6 +219,25 @@ class LogMelSpectrogram(nn.Layer):
amin (float, optional): The minimum value of input magnitude. Defaults to 1e-10. amin (float, optional): The minimum value of input magnitude. Defaults to 1e-10.
top_db (Optional[float], optional): The maximum db value of spectrogram. Defaults to None. top_db (Optional[float], optional): The maximum db value of spectrogram. Defaults to None.
dtype (str, optional): Data type of input and window. Defaults to 'float32'. dtype (str, optional): Data type of input and window. Defaults to 'float32'.
Returns:
:ref:`api_paddle_nn_Layer`. An instance of LogMelSpectrogram.
Examples:
.. code-block:: python
import paddle
from paddle.audio.features import LogMelSpectrogram
sample_rate = 16000
wav_duration = 0.5
num_channels = 1
num_frames = sample_rate * wav_duration
wav_data = paddle.linspace(-1.0, 1.0, num_frames) * 0.1
waveform = wav_data.tile([num_channels, 1])
feature_extractor = LogMelSpectrogram(sr=sample_rate, n_fft=512, window = 'hann', power = 1.0)
feats = feature_extractor(waveform)
""" """
def __init__(self, def __init__(self,
...@@ -265,6 +317,25 @@ class MFCC(nn.Layer): ...@@ -265,6 +317,25 @@ class MFCC(nn.Layer):
amin (float, optional): The minimum value of input magnitude. Defaults to 1e-10. amin (float, optional): The minimum value of input magnitude. Defaults to 1e-10.
top_db (Optional[float], optional): The maximum db value of spectrogram. Defaults to None. top_db (Optional[float], optional): The maximum db value of spectrogram. Defaults to None.
dtype (str, optional): Data type of input and window. Defaults to 'float32'. dtype (str, optional): Data type of input and window. Defaults to 'float32'.
Returns:
:ref:`api_paddle_nn_Layer`. An instance of MFCC.
Examples:
.. code-block:: python
import paddle
from paddle.audio.features import MFCC
sample_rate = 16000
wav_duration = 0.5
num_channels = 1
num_frames = sample_rate * wav_duration
wav_data = paddle.linspace(-1.0, 1.0, num_frames) * 0.1
waveform = wav_data.tile([num_channels, 1])
feature_extractor = MFCC(sr=sample_rate, n_fft=512, window = 'hann')
feats = feature_extractor(waveform)
""" """
def __init__(self, def __init__(self,
......
...@@ -11,11 +11,22 @@ ...@@ -11,11 +11,22 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from .functional import compute_fbank_matrix from .functional import compute_fbank_matrix # noqa: F401
from .functional import create_dct from .functional import create_dct # noqa: F401
from .functional import fft_frequencies from .functional import fft_frequencies # noqa: F401
from .functional import hz_to_mel from .functional import hz_to_mel # noqa: F401
from .functional import mel_frequencies from .functional import mel_frequencies # noqa: F401
from .functional import mel_to_hz from .functional import mel_to_hz # noqa: F401
from .functional import power_to_db from .functional import power_to_db # noqa: F401
from .window import get_window from .window import get_window # noqa: F401
__all__ = [ # noqa
'compute_fbank_matrix',
'create_dct',
'fft_frequencies',
'hz_to_mel',
'mel_frequencies',
'mel_to_hz',
'power_to_db',
'get_window',
]
...@@ -19,16 +19,6 @@ from typing import Union ...@@ -19,16 +19,6 @@ from typing import Union
import paddle import paddle
from paddle import Tensor from paddle import Tensor
__all__ = [
'hz_to_mel',
'mel_to_hz',
'mel_frequencies',
'fft_frequencies',
'compute_fbank_matrix',
'power_to_db',
'create_dct',
]
def hz_to_mel(freq: Union[Tensor, float], def hz_to_mel(freq: Union[Tensor, float],
htk: bool = False) -> Union[Tensor, float]: htk: bool = False) -> Union[Tensor, float]:
...@@ -40,6 +30,16 @@ def hz_to_mel(freq: Union[Tensor, float], ...@@ -40,6 +30,16 @@ def hz_to_mel(freq: Union[Tensor, float],
Returns: Returns:
Union[Tensor, float]: Frequency in mels. Union[Tensor, float]: Frequency in mels.
Examples:
.. code-block:: python
import paddle
val = 3.0
htk_flag = True
mel_paddle_tensor = paddle.audio.functional.hz_to_mel(
paddle.to_tensor(val), htk_flag)
""" """
if htk: if htk:
...@@ -83,6 +83,17 @@ def mel_to_hz(mel: Union[float, Tensor], ...@@ -83,6 +83,17 @@ def mel_to_hz(mel: Union[float, Tensor],
Returns: Returns:
Union[float, Tensor]: Frequencies in Hz. Union[float, Tensor]: Frequencies in Hz.
Examples:
.. code-block:: python
import paddle
val = 3.0
htk_flag = True
hz_paddle_tensor = paddle.audio.functional.mel_to_hz(
paddle.to_tensor(val), htk_flag)
""" """
if htk: if htk:
return 700.0 * (10.0**(mel / 2595.0) - 1.0) return 700.0 * (10.0**(mel / 2595.0) - 1.0)
...@@ -121,6 +132,19 @@ def mel_frequencies(n_mels: int = 64, ...@@ -121,6 +132,19 @@ def mel_frequencies(n_mels: int = 64,
Returns: Returns:
Tensor: Tensor of n_mels frequencies in Hz with shape `(n_mels,)`. Tensor: Tensor of n_mels frequencies in Hz with shape `(n_mels,)`.
Examples:
.. code-block:: python
import paddle
n_mels = 64
f_min = 0.5
f_max = 10000
htk_flag = True
paddle_mel_freq = paddle.audio.functional.mel_frequencies(
n_mels, f_min, f_max, htk_flag, 'float64')
""" """
# 'Center freqs' of mel bands - uniformly spaced between limits # 'Center freqs' of mel bands - uniformly spaced between limits
min_mel = hz_to_mel(f_min, htk=htk) min_mel = hz_to_mel(f_min, htk=htk)
...@@ -140,6 +164,15 @@ def fft_frequencies(sr: int, n_fft: int, dtype: str = 'float32') -> Tensor: ...@@ -140,6 +164,15 @@ def fft_frequencies(sr: int, n_fft: int, dtype: str = 'float32') -> Tensor:
Returns: Returns:
Tensor: FFT frequencies in Hz with shape `(n_fft//2 + 1,)`. Tensor: FFT frequencies in Hz with shape `(n_fft//2 + 1,)`.
Examples:
.. code-block:: python
import paddle
sr = 16000
n_fft = 128
fft_freq = paddle.audio.functional.fft_frequencies(sr, n_fft)
""" """
return paddle.linspace(0, float(sr) / 2, int(1 + n_fft // 2), dtype=dtype) return paddle.linspace(0, float(sr) / 2, int(1 + n_fft // 2), dtype=dtype)
...@@ -166,6 +199,15 @@ def compute_fbank_matrix(sr: int, ...@@ -166,6 +199,15 @@ def compute_fbank_matrix(sr: int,
Returns: Returns:
Tensor: Mel transform matrix with shape `(n_mels, n_fft//2 + 1)`. Tensor: Mel transform matrix with shape `(n_mels, n_fft//2 + 1)`.
Examples:
.. code-block:: python
import paddle
sr = 16000
n_fft = 512
fbank = paddle.audio.functional.compute_fbank_matrix(sr, n_fft)
""" """
if f_max is None: if f_max is None:
...@@ -221,6 +263,15 @@ def power_to_db(spect: Tensor, ...@@ -221,6 +263,15 @@ def power_to_db(spect: Tensor,
Returns: Returns:
Tensor: Power spectrogram in db scale. Tensor: Power spectrogram in db scale.
Examples:
.. code-block:: python
import paddle
val = 3.0
decibel_paddle = paddle.audio.functional.power_to_db(
paddle.to_tensor(val))
""" """
if amin <= 0: if amin <= 0:
raise Exception("amin must be strictly positive") raise Exception("amin must be strictly positive")
...@@ -254,6 +305,14 @@ def create_dct(n_mfcc: int, ...@@ -254,6 +305,14 @@ def create_dct(n_mfcc: int,
Returns: Returns:
Tensor: The DCT matrix with shape `(n_mels, n_mfcc)`. Tensor: The DCT matrix with shape `(n_mels, n_mfcc)`.
Examples:
.. code-block:: python
import paddle
n_mfcc = 23
n_mels = 257
dct = paddle.audio.functional.create_dct(n_mfcc, n_mels)
""" """
n = paddle.arange(n_mels, dtype=dtype) n = paddle.arange(n_mels, dtype=dtype)
k = paddle.arange(n_mfcc, dtype=dtype).unsqueeze(1) k = paddle.arange(n_mfcc, dtype=dtype).unsqueeze(1)
......
...@@ -18,10 +18,6 @@ from typing import Union ...@@ -18,10 +18,6 @@ from typing import Union
import paddle import paddle
from paddle import Tensor from paddle import Tensor
__all__ = [
'get_window',
]
def _cat(x: List[Tensor], data_type: str) -> Tensor: def _cat(x: List[Tensor], data_type: str) -> Tensor:
l = [paddle.to_tensor(_, data_type) for _ in x] l = [paddle.to_tensor(_, data_type) for _ in x]
...@@ -323,6 +319,17 @@ def get_window(window: Union[str, Tuple[str, float]], ...@@ -323,6 +319,17 @@ def get_window(window: Union[str, Tuple[str, float]],
Returns: Returns:
Tensor: The window represented as a tensor. Tensor: The window represented as a tensor.
Examples:
.. code-block:: python
import paddle
n_fft = 512
cosine_window = paddle.audio.functional.get_window('cosine', n_fft)
std = 7
gaussian_window = paddle.audio.functional.get_window(('gaussian', std), n_fft)
""" """
sym = not fftbins sym = not fftbins
......
...@@ -136,7 +136,7 @@ class TestAudioFuncitons(unittest.TestCase): ...@@ -136,7 +136,7 @@ class TestAudioFuncitons(unittest.TestCase):
decimal=5) decimal=5)
@parameterize([1, 512]) @parameterize([1, 512])
def test_gussian_window_and_exception(self, n_fft: int): def test_gaussian_window_and_exception(self, n_fft: int):
window_scipy_gaussain = signal.windows.gaussian(n_fft, std=7) window_scipy_gaussain = signal.windows.gaussian(n_fft, std=7)
window_paddle_gaussian = paddle.audio.functional.get_window( window_paddle_gaussian = paddle.audio.functional.get_window(
('gaussian', 7), n_fft, False) ('gaussian', 7), n_fft, False)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册