Merge pull request #667 from PaddlePaddle/feat

[WIP]nn audio feature

Merge pull request #667 from PaddlePaddle/feat
[WIP]nn audio feature
a13ce6d3 · Hui Zhang · GitHub · 5ef4a34e · 5fb06f42 · a13ce6d3
7 changed file
--- a/third_party/nnAudio/nnAudio/Spectrogram.py
+++ b/third_party/nnAudio/nnAudio/Spectrogram.py
--- a/third_party/nnAudio/nnAudio/__init__.py
+++ b/third_party/nnAudio/nnAudio/__init__.py
+__version__ = "0.2.2"
\ No newline at end of file
--- a/third_party/nnAudio/nnAudio/librosa_functions.py
+++ b/third_party/nnAudio/nnAudio/librosa_functions.py
+"""
+Module containing functions cloned from librosa
+
+To make sure nnAudio would not become broken when updating librosa 
+"""
+
+import numpy as np
+import warnings
+### ----------------Functions for generating kenral for Mel Spectrogram------------ ###
+# This code is equalvant to from librosa.filters import mel
+# By doing so, we can run nnAudio without installing librosa
+def fft2gammatonemx(sr=20000, n_fft=2048, n_bins=64, width=1.0, fmin=0.0,
+                    fmax=11025, maxlen=1024):
+    """
+    # Ellis' description in MATLAB:
+    # [wts,cfreqa] = fft2gammatonemx(nfft, sr, nfilts, width, minfreq, maxfreq, maxlen)
+    #      Generate a matrix of weights to combine FFT bins into
+    #      Gammatone bins.  nfft defines the source FFT size at
+    #      sampling rate sr.  Optional nfilts specifies the number of
+    #      output bands required (default 64), and width is the
+    #      constant width of each band in Bark (default 1).
+    #      minfreq, maxfreq specify range covered in Hz (100, sr/2).
+    #      While wts has nfft columns, the second half are all zero.
+    #      Hence, aud spectrum is
+    #      fft2gammatonemx(nfft,sr)*abs(fft(xincols,nfft));
+    #      maxlen truncates the rows to this many bins.
+    #      cfreqs returns the actual center frequencies of each
+    #      gammatone band in Hz.
+    #
+    # 2009/02/22 02:29:25 Dan Ellis dpwe@ee.columbia.edu  based on rastamat/audspec.m
+    # Sat May 27 15:37:50 2017 Maddie Cusimano, mcusi@mit.edu 27 May 2017: convert to python
+    """
+
+    wts = np.zeros([n_bins, n_fft], dtype=np.float32)
+
+    # after Slaney's MakeERBFilters
+    EarQ = 9.26449;
+    minBW = 24.7;
+    order = 1;
+
+    nFr = np.array(range(n_bins)) + 1
+    em = EarQ * minBW
+    cfreqs = (fmax + em) * np.exp(nFr * (-np.log(fmax + em) + np.log(fmin + em)) / n_bins) - em
+    cfreqs = cfreqs[::-1]
+
+    GTord = 4
+    ucircArray = np.array(range(int(n_fft / 2 + 1)))
+    ucirc = np.exp(1j * 2 * np.pi * ucircArray / n_fft);
+    # justpoles = 0 :taking out the 'if' corresponding to this.
+
+    ERB = width * np.power(np.power(cfreqs / EarQ, order) + np.power(minBW, order), 1 / order);
+    B = 1.019 * 2 * np.pi * ERB;
+    r = np.exp(-B / sr)
+    theta = 2 * np.pi * cfreqs / sr
+    pole = r * np.exp(1j * theta)
+    T = 1 / sr
+    ebt = np.exp(B * T);
+    cpt = 2 * cfreqs * np.pi * T;
+    ccpt = 2 * T * np.cos(cpt);
+    scpt = 2 * T * np.sin(cpt);
+    A11 = -np.divide(np.divide(ccpt, ebt) + np.divide(np.sqrt(3 + 2 ** 1.5) * scpt, ebt), 2);
+    A12 = -np.divide(np.divide(ccpt, ebt) - np.divide(np.sqrt(3 + 2 ** 1.5) * scpt, ebt), 2);
+    A13 = -np.divide(np.divide(ccpt, ebt) + np.divide(np.sqrt(3 - 2 ** 1.5) * scpt, ebt), 2);
+    A14 = -np.divide(np.divide(ccpt, ebt) - np.divide(np.sqrt(3 - 2 ** 1.5) * scpt, ebt), 2);
+    zros = -np.array([A11, A12, A13, A14]) / T;
+    wIdx = range(int(n_fft / 2 + 1))
+    gain = np.abs((-2 * np.exp(4 * 1j * cfreqs * np.pi * T) * T + 2 * np.exp(
+        -(B * T) + 2 * 1j * cfreqs * np.pi * T) * T * (
+                               np.cos(2 * cfreqs * np.pi * T) - np.sqrt(3 - 2 ** (3 / 2)) * np.sin(
+                           2 * cfreqs * np.pi * T))) * (-2 * np.exp(4 * 1j * cfreqs * np.pi * T) * T + 2 * np.exp(
+        -(B * T) + 2 * 1j * cfreqs * np.pi * T) * T * (np.cos(2 * cfreqs * np.pi * T) + np.sqrt(
+        3 - 2 ** (3 / 2)) * np.sin(2 * cfreqs * np.pi * T))) * (
+                              -2 * np.exp(4 * 1j * cfreqs * np.pi * T) * T + 2 * np.exp(
+                          -(B * T) + 2 * 1j * cfreqs * np.pi * T) * T * (
+                                          np.cos(2 * cfreqs * np.pi * T) - np.sqrt(3 + 2 ** (3 / 2)) * np.sin(
+                                      2 * cfreqs * np.pi * T))) * (
+                              -2 * np.exp(4 * 1j * cfreqs * np.pi * T) * T + 2 * np.exp(
+                          -(B * T) + 2 * 1j * cfreqs * np.pi * T) * T * (
+                                          np.cos(2 * cfreqs * np.pi * T) + np.sqrt(3 + 2 ** (3 / 2)) * np.sin(
+                                      2 * cfreqs * np.pi * T))) / (
+                              -2 / np.exp(2 * B * T) - 2 * np.exp(4 * 1j * cfreqs * np.pi * T) + 2 * (
+                                  1 + np.exp(4 * 1j * cfreqs * np.pi * T)) / np.exp(B * T)) ** 4);
+    # in MATLAB, there used to be 64 where here it says n_bins:
+    wts[:, wIdx] = ((T ** 4) / np.reshape(gain, (n_bins, 1))) * np.abs(
+        ucirc - np.reshape(zros[0], (n_bins, 1))) * np.abs(ucirc - np.reshape(zros[1], (n_bins, 1))) * np.abs(
+        ucirc - np.reshape(zros[2], (n_bins, 1))) * np.abs(ucirc - np.reshape(zros[3], (n_bins, 1))) * (np.abs(
+        np.power(np.multiply(np.reshape(pole, (n_bins, 1)) - ucirc, np.conj(np.reshape(pole, (n_bins, 1))) - ucirc),
+                 -GTord)));
+    wts = wts[:, range(maxlen)];
+
+    return wts, cfreqs
+
+def gammatone(sr, n_fft, n_bins=64, fmin=20.0, fmax=None, htk=False,
+        norm=1, dtype=np.float32):
+    """Create a Filterbank matrix to combine FFT bins into Gammatone bins
+    Parameters
+    ----------
+    sr        : number > 0 [scalar]
+        sampling rate of the incoming signal
+    n_fft     : int > 0 [scalar]
+        number of FFT components
+    n_bins    : int > 0 [scalar]
+        number of Mel bands to generate
+    fmin      : float >= 0 [scalar]
+        lowest frequency (in Hz)
+    fmax      : float >= 0 [scalar]
+        highest frequency (in Hz).
+        If `None`, use `fmax = sr / 2.0`
+    htk       : bool [scalar]
+        use HTK formula instead of Slaney
+    norm : {None, 1, np.inf} [scalar]
+        if 1, divide the triangular mel weights by the width of the mel band
+        (area normalization).  Otherwise, leave all the triangles aiming for
+        a peak value of 1.0
+    dtype : np.dtype
+        The data type of the output basis.
+        By default, uses 32-bit (single-precision) floating point.
+    Returns
+    -------
+    G         : np.ndarray [shape=(n_bins, 1 + n_fft/2)]
+        Gammatone transform matrix
+    """
+
+    if fmax is None:
+        fmax = float(sr) / 2
+    n_bins = int(n_bins)
+    
+    weights,_ = fft2gammatonemx(sr=sr, n_fft=n_fft, n_bins=n_bins, fmin=fmin, fmax=fmax, maxlen=int(n_fft//2+1))
+
+    return (1/n_fft)*weights
+
+def mel_to_hz(mels, htk=False):
+    """Convert mel bin numbers to frequencies
+    Examples
+    --------
+    >>> librosa.mel_to_hz(3)
+    200.
+    >>> librosa.mel_to_hz([1,2,3,4,5])
+    array([  66.667,  133.333,  200.   ,  266.667,  333.333])
+    Parameters
+    ----------
+    mels          : np.ndarray [shape=(n,)], float
+        mel bins to convert
+    htk           : bool
+        use HTK formula instead of Slaney
+    Returns
+    -------
+    frequencies   : np.ndarray [shape=(n,)]
+        input mels in Hz
+    See Also
+    --------
+    hz_to_mel
+    """
+
+    mels = np.asanyarray(mels)
+
+    if htk:
+        return 700.0 * (10.0**(mels / 2595.0) - 1.0)
+
+    # Fill in the linear scale
+    f_min = 0.0
+    f_sp = 200.0 / 3
+    freqs = f_min + f_sp * mels
+
+    # And now the nonlinear scale
+    min_log_hz = 1000.0                         # beginning of log region (Hz)
+    min_log_mel = (min_log_hz - f_min) / f_sp   # same (Mels)
+    logstep = np.log(6.4) / 27.0                # step size for log region
+
+    if mels.ndim:
+        # If we have vector data, vectorize
+        log_t = (mels >= min_log_mel)
+        freqs[log_t] = min_log_hz * np.exp(logstep * (mels[log_t] - min_log_mel))
+    elif mels >= min_log_mel:
+        # If we have scalar data, check directly
+        freqs = min_log_hz * np.exp(logstep * (mels - min_log_mel))
+
+    return freqs
+
+def hz_to_mel(frequencies, htk=False):
+    """Convert Hz to Mels
+    Examples
+    --------
+    >>> librosa.hz_to_mel(60)
+    0.9
+    >>> librosa.hz_to_mel([110, 220, 440])
+    array([ 1.65,  3.3 ,  6.6 ])
+    Parameters
+    ----------
+    frequencies   : number or np.ndarray [shape=(n,)] , float
+        scalar or array of frequencies
+    htk           : bool
+        use HTK formula instead of Slaney
+    Returns
+    -------
+    mels        : number or np.ndarray [shape=(n,)]
+        input frequencies in Mels
+    See Also
+    --------
+    mel_to_hz
+    """
+
+    frequencies = np.asanyarray(frequencies)
+
+    if htk:
+        return 2595.0 * np.log10(1.0 + frequencies / 700.0)
+
+    # Fill in the linear part
+    f_min = 0.0
+    f_sp = 200.0 / 3
+
+    mels = (frequencies - f_min) / f_sp
+
+    # Fill in the log-scale part
+
+    min_log_hz = 1000.0                         # beginning of log region (Hz)
+    min_log_mel = (min_log_hz - f_min) / f_sp   # same (Mels)
+    logstep = np.log(6.4) / 27.0                # step size for log region
+
+    if frequencies.ndim:
+        # If we have array data, vectorize
+        log_t = (frequencies >= min_log_hz)
+        mels[log_t] = min_log_mel + np.log(frequencies[log_t]/min_log_hz) / logstep
+    elif frequencies >= min_log_hz:
+        # If we have scalar data, heck directly
+        mels = min_log_mel + np.log(frequencies / min_log_hz) / logstep
+
+    return mels
+
+def fft_frequencies(sr=22050, n_fft=2048):
+    '''Alternative implementation of `np.fft.fftfreq`
+    Parameters
+    ----------
+    sr : number > 0 [scalar]
+        Audio sampling rate
+    n_fft : int > 0 [scalar]
+        FFT window size
+    Returns
+    -------
+    freqs : np.ndarray [shape=(1 + n_fft/2,)]
+        Frequencies `(0, sr/n_fft, 2*sr/n_fft, ..., sr/2)`
+    Examples
+    --------
+    >>> librosa.fft_frequencies(sr=22050, n_fft=16)
+    array([     0.   ,   1378.125,   2756.25 ,   4134.375,
+             5512.5  ,   6890.625,   8268.75 ,   9646.875,  11025.   ])
+    '''
+
+    return np.linspace(0,
+                       float(sr) / 2,
+                       int(1 + n_fft//2),
+                       endpoint=True)
+
+def mel_frequencies(n_mels=128, fmin=0.0, fmax=11025.0, htk=False):
+    """
+    This function is cloned from librosa 0.7.
+    Please refer to the original 
+    `documentation <https://librosa.org/doc/latest/generated/librosa.mel_frequencies.html?highlight=mel_frequencies#librosa.mel_frequencies>`__
+    for more info.
+    
+    Parameters
+    ----------
+    n_mels    : int > 0 [scalar]
+        Number of mel bins.
+
+    fmin      : float >= 0 [scalar]
+        Minimum frequency (Hz).
+
+    fmax      : float >= 0 [scalar]
+        Maximum frequency (Hz).
+
+    htk       : bool
+        If True, use HTK formula to convert Hz to mel.
+        Otherwise (False), use Slaney's Auditory Toolbox.
+
+    Returns
+    -------
+    bin_frequencies : ndarray [shape=(n_mels,)]
+        Vector of n_mels frequencies in Hz which are uniformly spaced on the Mel
+        axis.
+
+    Examples
+    --------
+    >>> librosa.mel_frequencies(n_mels=40)
+    array([     0.   ,     85.317,    170.635,    255.952,
+              341.269,    426.586,    511.904,    597.221,
+              682.538,    767.855,    853.173,    938.49 ,
+             1024.856,   1119.114,   1222.042,   1334.436,
+             1457.167,   1591.187,   1737.532,   1897.337,
+             2071.84 ,   2262.393,   2470.47 ,   2697.686,
+             2945.799,   3216.731,   3512.582,   3835.643,
+             4188.417,   4573.636,   4994.285,   5453.621,
+             5955.205,   6502.92 ,   7101.009,   7754.107,
+             8467.272,   9246.028,  10096.408,  11025.   ])
+    """
+
+    # 'Center freqs' of mel bands - uniformly spaced between limits
+    min_mel = hz_to_mel(fmin, htk=htk)
+    max_mel = hz_to_mel(fmax, htk=htk)
+
+    mels = np.linspace(min_mel, max_mel, n_mels)
+
+    return mel_to_hz(mels, htk=htk)
+
+def mel(sr, n_fft, n_mels=128, fmin=0.0, fmax=None, htk=False,
+        norm=1, dtype=np.float32):
+    """
+    This function is cloned from librosa 0.7.
+    Please refer to the original 
+    `documentation <https://librosa.org/doc/latest/generated/librosa.filters.mel.html>`__
+    for more info.    
+    Create a Filterbank matrix to combine FFT bins into Mel-frequency bins
+
+
+    Parameters
+    ----------
+    sr        : number > 0 [scalar]
+        sampling rate of the incoming signal
+    n_fft     : int > 0 [scalar]
+        number of FFT components
+    n_mels    : int > 0 [scalar]
+        number of Mel bands to generate
+    fmin      : float >= 0 [scalar]
+        lowest frequency (in Hz)
+    fmax      : float >= 0 [scalar]
+        highest frequency (in Hz).
+        If `None`, use `fmax = sr / 2.0`
+    htk       : bool [scalar]
+        use HTK formula instead of Slaney
+    norm : {None, 1, np.inf} [scalar]
+        if 1, divide the triangular mel weights by the width of the mel band
+        (area normalization).  Otherwise, leave all the triangles aiming for
+        a peak value of 1.0
+    dtype : np.dtype
+        The data type of the output basis.
+        By default, uses 32-bit (single-precision) floating point.
+
+    Returns
+    -------
+    M         : np.ndarray [shape=(n_mels, 1 + n_fft/2)]
+        Mel transform matrix
+
+    Notes
+    -----
+    This function caches at level 10.
+    
+    Examples
+    --------
+    >>> melfb = librosa.filters.mel(22050, 2048)
+    >>> melfb
+    array([[ 0.   ,  0.016, ...,  0.   ,  0.   ],
+           [ 0.   ,  0.   , ...,  0.   ,  0.   ],
+           ...,
+           [ 0.   ,  0.   , ...,  0.   ,  0.   ],
+           [ 0.   ,  0.   , ...,  0.   ,  0.   ]])
+    Clip the maximum frequency to 8KHz
+    >>> librosa.filters.mel(22050, 2048, fmax=8000)
+    array([[ 0.  ,  0.02, ...,  0.  ,  0.  ],
+           [ 0.  ,  0.  , ...,  0.  ,  0.  ],
+           ...,
+           [ 0.  ,  0.  , ...,  0.  ,  0.  ],
+           [ 0.  ,  0.  , ...,  0.  ,  0.  ]])
+    >>> import matplotlib.pyplot as plt
+    >>> plt.figure()
+    >>> librosa.display.specshow(melfb, x_axis='linear')
+    >>> plt.ylabel('Mel filter')
+    >>> plt.title('Mel filter bank')
+    >>> plt.colorbar()
+    >>> plt.tight_layout()
+    >>> plt.show()
+    """
+
+    if fmax is None:
+        fmax = float(sr) / 2
+
+    if norm is not None and norm != 1 and norm != np.inf:
+        raise ParameterError('Unsupported norm: {}'.format(repr(norm)))
+
+    # Initialize the weights
+    n_mels = int(n_mels)
+    weights = np.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype)
+
+    # Center freqs of each FFT bin
+    fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft)
+
+    # 'Center freqs' of mel bands - uniformly spaced between limits
+    mel_f = mel_frequencies(n_mels + 2, fmin=fmin, fmax=fmax, htk=htk)
+
+    fdiff = np.diff(mel_f)
+    ramps = np.subtract.outer(mel_f, fftfreqs)
+
+    for i in range(n_mels):
+        # lower and upper slopes for all bins
+        lower = -ramps[i] / fdiff[i]
+        upper = ramps[i+2] / fdiff[i+1]
+
+        # .. then intersect them with each other and zero
+        weights[i] = np.maximum(0, np.minimum(lower, upper))
+
+    if norm == 1:
+        # Slaney-style mel is scaled to be approx constant energy per channel
+        enorm = 2.0 / (mel_f[2:n_mels+2] - mel_f[:n_mels])
+        weights *= enorm[:, np.newaxis]
+
+    # Only check weights if f_mel[0] is positive
+    if not np.all((mel_f[:-2] == 0) | (weights.max(axis=1) > 0)):
+        # This means we have an empty channel somewhere
+        warnings.warn('Empty filters detected in mel frequency basis. '
+                      'Some channels will produce empty responses. '
+                      'Try increasing your sampling rate (and fmax) or '
+                      'reducing n_mels.')
+
+    return weights
+### ------------------End of Functions for generating kenral for Mel Spectrogram ----------------###
+
+
+### ------------------Functions for making STFT same as librosa ---------------------------------###
+def pad_center(data, size, axis=-1, **kwargs):
+    '''Wrapper for np.pad to automatically center an array prior to padding.
+    This is analogous to `str.center()`
+
+    Examples
+    --------
+    >>> # Generate a vector
+    >>> data = np.ones(5)
+    >>> librosa.util.pad_center(data, 10, mode='constant')
+    array([ 0.,  0.,  1.,  1.,  1.,  1.,  1.,  0.,  0.,  0.])
+
+    >>> # Pad a matrix along its first dimension
+    >>> data = np.ones((3, 5))
+    >>> librosa.util.pad_center(data, 7, axis=0)
+    array([[ 0.,  0.,  0.,  0.,  0.],
+           [ 0.,  0.,  0.,  0.,  0.],
+           [ 1.,  1.,  1.,  1.,  1.],
+           [ 1.,  1.,  1.,  1.,  1.],
+           [ 1.,  1.,  1.,  1.,  1.],
+           [ 0.,  0.,  0.,  0.,  0.],
+           [ 0.,  0.,  0.,  0.,  0.]])
+    >>> # Or its second dimension
+    >>> librosa.util.pad_center(data, 7, axis=1)
+    array([[ 0.,  1.,  1.,  1.,  1.,  1.,  0.],
+           [ 0.,  1.,  1.,  1.,  1.,  1.,  0.],
+           [ 0.,  1.,  1.,  1.,  1.,  1.,  0.]])
+
+    Parameters
+    ----------
+    data : np.ndarray
+        Vector to be padded and centered
+
+    size : int >= len(data) [scalar]
+        Length to pad `data`
+
+    axis : int
+        Axis along which to pad and center the data
+
+    kwargs : additional keyword arguments
+      arguments passed to `np.pad()`
+
+    Returns
+    -------
+    data_padded : np.ndarray
+        `data` centered and padded to length `size` along the
+        specified axis
+
+    Raises
+    ------
+    ParameterError
+        If `size < data.shape[axis]`
+
+    See Also
+    --------
+    numpy.pad
+    '''
+
+    kwargs.setdefault('mode', 'constant')
+
+    n = data.shape[axis]
+
+    lpad = int((size - n) // 2)
+
+    lengths = [(0, 0)] * data.ndim
+    lengths[axis] = (lpad, int(size - n - lpad))
+
+    if lpad < 0:
+        raise ParameterError(('Target size ({:d}) must be '
+                              'at least input size ({:d})').format(size, n))
+
+    return np.pad(data, lengths, **kwargs)
+
+### ------------------End of functions for making STFT same as librosa ---------------------------###
--- a/third_party/nnAudio/nnAudio/utils.py
+++ b/third_party/nnAudio/nnAudio/utils.py
--- a/third_party/nnAudio/setup.py
+++ b/third_party/nnAudio/setup.py
+import setuptools
+import codecs
+import os.path
+
+with open("README.md", "r") as fh:
+    long_description = fh.read()
+
+def read(rel_path):
+    here = os.path.abspath(os.path.dirname(__file__))
+    with codecs.open(os.path.join(here, rel_path), 'r') as fp:
+        return fp.read()    
+    
+def get_version(rel_path):
+    for line in read(rel_path).splitlines():
+        if line.startswith('__version__'):
+            delim = '"' if '"' in line else "'"
+            return line.split(delim)[1]
+    else:
+        raise RuntimeError("Unable to find version string.")    
+    
+setuptools.setup(
+    name="nnAudio", # Replace with your own username
+    version=get_version("nnAudio/__init__.py"),
+    author="KinWaiCheuk",
+    author_email="u3500684@connect.hku.hk",
+    description="A fast GPU audio processing toolbox with 1D convolutional neural network",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    url="https://github.com/KinWaiCheuk/nnAudio",
+    packages=setuptools.find_packages(),
+    classifiers=[
+        "Programming Language :: Python :: 3",
+        "License :: OSI Approved :: MIT License",
+        "Operating System :: OS Independent",
+    ],
+    python_requires='>=3.6',
+)
--- a/third_party/nnAudio/tests/parameters.py
+++ b/third_party/nnAudio/tests/parameters.py
+# Creating parameters for STFT test
+"""
+It is equivalent to 
+[(1024, 128, 'ones'),
+ (1024, 128, 'hann'),
+ (1024, 128, 'hamming'),
+ (2048, 128, 'ones'),
+ (2048, 512, 'ones'),
+ (2048, 128, 'hann'),
+ (2048, 512, 'hann'),
+ (2048, 128, 'hamming'),
+ (2048, 512, 'hamming'),
+ (None, None, None)]
+"""
+
+stft_parameters = []
+n_fft = [1024,2048]
+hop_length = {128,512,1024}
+window = ['ones', 'hann', 'hamming']
+for i in n_fft:
+    for k in window:
+        for j in hop_length:
+            if j < (i/2):
+                stft_parameters.append((i,j,k))
+stft_parameters.append((256, None, 'hann'))
+
+stft_with_win_parameters = []
+n_fft = [512,1024]
+win_length = [400, 900]
+hop_length = {128,256}
+for i in n_fft:
+    for j in win_length:
+        if j < i:     
+            for k in hop_length:
+                if k < (i/2):
+                    stft_with_win_parameters.append((i,j,k))
+
+mel_win_parameters = [(512,400), (1024, 1000)]
\ No newline at end of file
--- a/third_party/nnAudio/tests/test_spectrogram.py
+++ b/third_party/nnAudio/tests/test_spectrogram.py
+import pytest
+import librosa
+import torch
+import matplotlib.pyplot as plt
+from scipy.signal import chirp, sweep_poly
+from nnAudio.Spectrogram import *
+from parameters import *
+
+gpu_idx=0
+
+# librosa example audio for testing
+example_y, example_sr = librosa.load(librosa.util.example_audio_file())
+
+
+@pytest.mark.parametrize("n_fft, hop_length, window", stft_parameters)  
+@pytest.mark.parametrize("device", ['cpu', f'cuda:{gpu_idx}'])
+def test_inverse2(n_fft, hop_length, window, device):
+    x = torch.tensor(example_y,device=device)
+    stft = STFT(n_fft=n_fft, hop_length=hop_length, window=window).to(device)
+    istft = iSTFT(n_fft=n_fft, hop_length=hop_length, window=window).to(device)
+    X = stft(x.unsqueeze(0), output_format="Complex")
+    x_recon = istft(X, length=x.shape[0], onesided=True).squeeze()
+    assert np.allclose(x.cpu(), x_recon.cpu(), rtol=1e-5, atol=1e-3)    
+
+@pytest.mark.parametrize("n_fft, hop_length, window", stft_parameters)
+@pytest.mark.parametrize("device", ['cpu', f'cuda:{gpu_idx}'])
+def test_inverse(n_fft, hop_length, window, device):
+    x = torch.tensor(example_y, device=device)
+    stft = STFT(n_fft=n_fft, hop_length=hop_length, window=window, iSTFT=True).to(device)
+    X = stft(x.unsqueeze(0), output_format="Complex")
+    x_recon = stft.inverse(X, length=x.shape[0]).squeeze()
+    assert np.allclose(x.cpu(), x_recon.cpu(), rtol=1e-3, atol=1)
+    
+
+    
+# @pytest.mark.parametrize("n_fft, hop_length, window", stft_parameters)
+
+# def test_inverse_GPU(n_fft, hop_length, window):
+#     x = torch.tensor(example_y,device=f'cuda:{gpu_idx}')
+#     stft = STFT(n_fft=n_fft, hop_length=hop_length, window=window, device=f'cuda:{gpu_idx}')
+#     X = stft(x.unsqueeze(0), output_format="Complex")
+#     x_recon = stft.inverse(X, num_samples=x.shape[0]).squeeze()
+#     assert np.allclose(x.cpu(), x_recon.cpu(), rtol=1e-3, atol=1)
+
+
+@pytest.mark.parametrize("n_fft, hop_length, window", stft_parameters)
+@pytest.mark.parametrize("device", ['cpu', f'cuda:{gpu_idx}'])
+def test_stft_complex(n_fft, hop_length, window, device):
+    x = example_y
+    stft = STFT(n_fft=n_fft, hop_length=hop_length, window=window).to(device)
+    X = stft(torch.tensor(x, device=device).unsqueeze(0), output_format="Complex")
+    X_real, X_imag = X[:, :, :, 0].squeeze(), X[:, :, :, 1].squeeze()
+    X_librosa = librosa.stft(x, n_fft=n_fft, hop_length=hop_length, window=window)
+    real_diff, imag_diff = np.allclose(X_real.cpu(), X_librosa.real, rtol=1e-3, atol=1e-3), \
+                            np.allclose(X_imag.cpu(), X_librosa.imag, rtol=1e-3, atol=1e-3)
+    
+    assert real_diff and imag_diff 
+    
+# @pytest.mark.parametrize("n_fft, hop_length, window", stft_parameters)    
+# def test_stft_complex_GPU(n_fft, hop_length, window):
+#     x = example_y
+#     stft = STFT(n_fft=n_fft, hop_length=hop_length, window=window, device=f'cuda:{gpu_idx}')
+#     X = stft(torch.tensor(x,device=f'cuda:{gpu_idx}').unsqueeze(0), output_format="Complex")
+#     X_real, X_imag = X[:, :, :, 0].squeeze().detach().cpu(), X[:, :, :, 1].squeeze().detach().cpu()
+#     X_librosa = librosa.stft(x, n_fft=n_fft, hop_length=hop_length, window=window)
+#     real_diff, imag_diff = np.allclose(X_real, X_librosa.real, rtol=1e-3, atol=1e-3), \
+#                             np.allclose(X_imag, X_librosa.imag, rtol=1e-3, atol=1e-3)
+    
+#     assert real_diff and imag_diff        
+    
+@pytest.mark.parametrize("n_fft, win_length, hop_length", stft_with_win_parameters) 
+@pytest.mark.parametrize("device", ['cpu', f'cuda:{gpu_idx}'])
+def test_stft_complex_winlength(n_fft, win_length, hop_length, device):
+    x = example_y
+    stft = STFT(n_fft=n_fft, win_length=win_length, hop_length=hop_length).to(device)
+    X = stft(torch.tensor(x, device=device).unsqueeze(0), output_format="Complex")
+    X_real, X_imag = X[:, :, :, 0].squeeze(), X[:, :, :, 1].squeeze()
+    X_librosa = librosa.stft(x, n_fft=n_fft, win_length=win_length, hop_length=hop_length)
+    real_diff, imag_diff = np.allclose(X_real.cpu(), X_librosa.real, rtol=1e-3, atol=1e-3), \
+                            np.allclose(X_imag.cpu(), X_librosa.imag, rtol=1e-3, atol=1e-3)
+    assert real_diff and imag_diff    
+              
+@pytest.mark.parametrize("device", ['cpu', f'cuda:{gpu_idx}'])
+def test_stft_magnitude(device):
+    x = example_y
+    stft = STFT(n_fft=2048, hop_length=512).to(device)
+    X = stft(torch.tensor(x, device=device).unsqueeze(0), output_format="Magnitude").squeeze()
+    X_librosa, _ = librosa.core.magphase(librosa.stft(x, n_fft=2048, hop_length=512))
+    assert np.allclose(X.cpu(), X_librosa, rtol=1e-3, atol=1e-3)
+
+@pytest.mark.parametrize("device", ['cpu', f'cuda:{gpu_idx}'])
+def test_stft_phase(device):
+    x = example_y
+    stft = STFT(n_fft=2048, hop_length=512).to(device)
+    X = stft(torch.tensor(x, device=device).unsqueeze(0), output_format="Phase")
+    X_real, X_imag = torch.cos(X).squeeze(), torch.sin(X).squeeze()
+    _, X_librosa = librosa.core.magphase(librosa.stft(x, n_fft=2048, hop_length=512))
+
+    real_diff, imag_diff = np.mean(np.abs(X_real.cpu().numpy() - X_librosa.real)), \
+                            np.mean(np.abs(X_imag.cpu().numpy() - X_librosa.imag))
+
+    # I find that np.allclose is too strict for allowing phase to be similar to librosa.
+    # Hence for phase we use average element-wise distance as the test metric.
+    assert real_diff < 2e-4 and imag_diff < 2e-4
+
+@pytest.mark.parametrize("n_fft, win_length", mel_win_parameters)  
+@pytest.mark.parametrize("device", ['cpu', f'cuda:{gpu_idx}'])
+def test_mel_spectrogram(n_fft, win_length, device):
+    x = example_y
+    melspec = MelSpectrogram(n_fft=n_fft, win_length=win_length, hop_length=512).to(device)
+    X = melspec(torch.tensor(x, device=device).unsqueeze(0)).squeeze()
+    X_librosa = librosa.feature.melspectrogram(x, n_fft=n_fft, win_length=win_length, hop_length=512)
+    assert np.allclose(X.cpu(), X_librosa, rtol=1e-3, atol=1e-3)
+    
+    
+@pytest.mark.parametrize("device", ['cpu', f'cuda:{gpu_idx}'])
+def test_cqt_1992(device):
+    # Log sweep case
+    fs = 44100
+    t = 1
+    f0 = 55
+    f1 = 22050
+    s = np.linspace(0, t, fs*t)
+    x = chirp(s, f0, 1, f1, method='logarithmic')
+    x = x.astype(dtype=np.float32)
+
+    # Magnitude
+    stft = CQT1992(sr=fs, fmin=220, output_format="Magnitude",
+                     n_bins=80, bins_per_octave=24).to(device)
+    X = stft(torch.tensor(x, device=device).unsqueeze(0))
+
+
+    # Complex
+    stft = CQT1992(sr=fs, fmin=220, output_format="Complex",
+                     n_bins=80, bins_per_octave=24).to(device)
+    X = stft(torch.tensor(x, device=device).unsqueeze(0))
+
+    # Phase
+    stft = CQT1992(sr=fs, fmin=220, output_format="Phase",
+                     n_bins=160, bins_per_octave=24).to(device)
+    X = stft(torch.tensor(x, device=device).unsqueeze(0))
+    
+    assert True
+    
+@pytest.mark.parametrize("device", ['cpu', f'cuda:{gpu_idx}'])
+def test_cqt_2010(device):
+    # Log sweep case
+    fs = 44100
+    t = 1
+    f0 = 55
+    f1 = 22050
+    s = np.linspace(0, t, fs*t)
+    x = chirp(s, f0, 1, f1, method='logarithmic')
+    x = x.astype(dtype=np.float32)
+
+    # Magnitude
+    stft = CQT2010(sr=fs, fmin=110, output_format="Magnitude",
+                     n_bins=160, bins_per_octave=24).to(device)
+    X = stft(torch.tensor(x, device=device).unsqueeze(0))     
+
+    # Complex
+    stft = CQT2010(sr=fs, fmin=110, output_format="Complex",
+                     n_bins=160, bins_per_octave=24).to(device)
+    X = stft(torch.tensor(x, device=device).unsqueeze(0))
+    
+    # Phase
+    stft = CQT2010(sr=fs, fmin=110, output_format="Phase",
+                     n_bins=160, bins_per_octave=24).to(device)
+    X = stft(torch.tensor(x, device=device).unsqueeze(0))    
+    assert True   
+
+@pytest.mark.parametrize("device", ['cpu', f'cuda:{gpu_idx}'])
+def test_cqt_1992_v2_log(device):
+    # Log sweep case
+    fs = 44100
+    t = 1
+    f0 = 55
+    f1 = 22050
+    s = np.linspace(0, t, fs*t)
+    x = chirp(s, f0, 1, f1, method='logarithmic')
+    x = x.astype(dtype=np.float32)
+
+    # Magnitude
+    stft = CQT1992v2(sr=fs, fmin=55, output_format="Magnitude",
+                     n_bins=207, bins_per_octave=24).to(device)
+    X = stft(torch.tensor(x, device=device).unsqueeze(0))
+    ground_truth = np.load("tests/ground-truths/log-sweep-cqt-1992-mag-ground-truth.npy")
+    X = torch.log(X + 1e-5)
+    assert np.allclose(X.cpu(), ground_truth, rtol=1e-3, atol=1e-3)
+
+    # Complex
+    stft = CQT1992v2(sr=fs, fmin=55, output_format="Complex",
+                     n_bins=207, bins_per_octave=24).to(device)
+    X = stft(torch.tensor(x, device=device).unsqueeze(0))
+    ground_truth = np.load("tests/ground-truths/log-sweep-cqt-1992-complex-ground-truth.npy")
+    assert np.allclose(X.cpu(), ground_truth, rtol=1e-3, atol=1e-3)
+
+    # Phase
+    stft = CQT1992v2(sr=fs, fmin=55, output_format="Phase",
+                     n_bins=207, bins_per_octave=24).to(device)
+    X = stft(torch.tensor(x, device=device).unsqueeze(0))
+    ground_truth = np.load("tests/ground-truths/log-sweep-cqt-1992-phase-ground-truth.npy")
+    assert np.allclose(X.cpu(), ground_truth, rtol=1e-3, atol=1e-3)
+
+@pytest.mark.parametrize("device", ['cpu', f'cuda:{gpu_idx}'])
+def test_cqt_1992_v2_linear(device):
+    # Linear sweep case
+    fs = 44100
+    t = 1
+    f0 = 55
+    f1 = 22050
+    s = np.linspace(0, t, fs*t)
+    x = chirp(s, f0, 1, f1, method='linear')
+    x = x.astype(dtype=np.float32)
+
+    # Magnitude
+    stft = CQT1992v2(sr=fs, fmin=55, output_format="Magnitude",
+                     n_bins=207, bins_per_octave=24).to(device)
+    X = stft(torch.tensor(x, device=device).unsqueeze(0))
+    ground_truth = np.load("tests/ground-truths/linear-sweep-cqt-1992-mag-ground-truth.npy")
+    X = torch.log(X + 1e-5)
+    assert np.allclose(X.cpu(), ground_truth, rtol=1e-3, atol=1e-3)
+
+    # Complex
+    stft = CQT1992v2(sr=fs, fmin=55, output_format="Complex",
+                     n_bins=207, bins_per_octave=24).to(device)
+    X = stft(torch.tensor(x, device=device).unsqueeze(0))
+    ground_truth = np.load("tests/ground-truths/linear-sweep-cqt-1992-complex-ground-truth.npy")
+    assert np.allclose(X.cpu(), ground_truth, rtol=1e-3, atol=1e-3)
+
+    # Phase
+    stft = CQT1992v2(sr=fs, fmin=55, output_format="Phase",
+                     n_bins=207, bins_per_octave=24).to(device)
+    X = stft(torch.tensor(x, device=device).unsqueeze(0))
+    ground_truth = np.load("tests/ground-truths/linear-sweep-cqt-1992-phase-ground-truth.npy")
+    assert np.allclose(X.cpu(), ground_truth, rtol=1e-3, atol=1e-3)
+
+@pytest.mark.parametrize("device", ['cpu', f'cuda:{gpu_idx}'])
+def test_cqt_2010_v2_log(device):
+    # Log sweep case
+    fs = 44100
+    t = 1
+    f0 = 55
+    f1 = 22050
+    s = np.linspace(0, t, fs*t)
+    x = chirp(s, f0, 1, f1, method='logarithmic')
+    x = x.astype(dtype=np.float32)
+
+    # Magnitude
+    stft = CQT2010v2(sr=fs, fmin=55, output_format="Magnitude",
+                     n_bins=207, bins_per_octave=24).to(device)
+    X = stft(torch.tensor(x, device=device).unsqueeze(0))     
+    X = torch.log(X + 1e-2)
+#     np.save("tests/ground-truths/log-sweep-cqt-2010-mag-ground-truth", X.cpu())
+    ground_truth = np.load("tests/ground-truths/log-sweep-cqt-2010-mag-ground-truth.npy")
+    assert np.allclose(X.cpu(), ground_truth, rtol=1e-3, atol=1e-3)
+
+    # Complex
+    stft = CQT2010v2(sr=fs, fmin=55, output_format="Complex",
+                     n_bins=207, bins_per_octave=24).to(device)
+    X = stft(torch.tensor(x, device=device).unsqueeze(0))
+#     np.save("tests/ground-truths/log-sweep-cqt-2010-complex-ground-truth", X.cpu())         
+    ground_truth = np.load("tests/ground-truths/log-sweep-cqt-2010-complex-ground-truth.npy")
+    assert np.allclose(X.cpu(), ground_truth, rtol=1e-3, atol=1e-3)
+
+#     # Phase
+#     stft = CQT2010v2(sr=fs, fmin=55, device=device, output_format="Phase",
+#                      n_bins=207, bins_per_octave=24)
+#     X = stft(torch.tensor(x, device=device).unsqueeze(0))
+# #     np.save("tests/ground-truths/log-sweep-cqt-2010-phase-ground-truth", X.cpu())      
+#     ground_truth = np.load("tests/ground-truths/log-sweep-cqt-2010-phase-ground-truth.npy")
+#     assert np.allclose(X.cpu(), ground_truth, rtol=1e-3, atol=1e-3)
+
+@pytest.mark.parametrize("device", ['cpu', f'cuda:{gpu_idx}'])
+def test_cqt_2010_v2_linear(device):
+    # Linear sweep case
+    fs = 44100
+    t = 1
+    f0 = 55
+    f1 = 22050
+    s = np.linspace(0, t, fs*t)
+    x = chirp(s, f0, 1, f1, method='linear')
+    x = x.astype(dtype=np.float32)
+
+    # Magnitude
+    stft = CQT2010v2(sr=fs, fmin=55, output_format="Magnitude",
+                     n_bins=207, bins_per_octave=24).to(device)
+    X = stft(torch.tensor(x, device=device).unsqueeze(0))
+    X = torch.log(X + 1e-2)
+#     np.save("tests/ground-truths/linear-sweep-cqt-2010-mag-ground-truth", X.cpu()) 
+    ground_truth = np.load("tests/ground-truths/linear-sweep-cqt-2010-mag-ground-truth.npy")    
+    assert np.allclose(X.cpu(), ground_truth, rtol=1e-3, atol=1e-3)
+
+    # Complex
+    stft = CQT2010v2(sr=fs, fmin=55, output_format="Complex",
+                     n_bins=207, bins_per_octave=24).to(device)
+    X = stft(torch.tensor(x, device=device).unsqueeze(0))
+#     np.save("tests/ground-truths/linear-sweep-cqt-2010-complex-ground-truth", X.cpu())
+    ground_truth = np.load("tests/ground-truths/linear-sweep-cqt-2010-complex-ground-truth.npy")
+    assert np.allclose(X.cpu(), ground_truth, rtol=1e-3, atol=1e-3)
+
+    # Phase
+#     stft = CQT2010v2(sr=fs, fmin=55, device=device, output_format="Phase",
+#                      n_bins=207, bins_per_octave=24)
+#     X = stft(torch.tensor(x, device=device).unsqueeze(0))
+# #     np.save("tests/ground-truths/linear-sweep-cqt-2010-phase-ground-truth", X.cpu())
+#     ground_truth = np.load("tests/ground-truths/linear-sweep-cqt-2010-phase-ground-truth.npy")
+#     assert np.allclose(X.cpu(), ground_truth, rtol=1e-3, atol=1e-3)
+
+@pytest.mark.parametrize("device", ['cpu', f'cuda:{gpu_idx}'])
+def test_mfcc(device):
+    x = example_y
+    mfcc = MFCC(sr=example_sr).to(device)
+    X = mfcc(torch.tensor(x, device=device).unsqueeze(0)).squeeze()
+    X_librosa = librosa.feature.mfcc(x, sr=example_sr)
+    assert np.allclose(X.cpu(), X_librosa, rtol=1e-3, atol=1e-3)
+    
+
+x = torch.randn((4,44100)) # Create a batch of input for the following Data.Parallel test
+
+@pytest.mark.parametrize("device", [f'cuda:{gpu_idx}'])
+def test_STFT_Parallel(device):
+    spec_layer = STFT(hop_length=512, n_fft=2048, window='hann', 
+                                  freq_scale='no',
+                                  output_format='Complex').to(device)
+    inverse_spec_layer = iSTFT(hop_length=512, n_fft=2048, window='hann', 
+                                  freq_scale='no').to(device)    
+    
+    spec_layer_parallel = torch.nn.DataParallel(spec_layer)
+    inverse_spec_layer_parallel = torch.nn.DataParallel(inverse_spec_layer)
+    spec = spec_layer_parallel(x)
+    x_recon = inverse_spec_layer_parallel(spec, onesided=True, length=x.shape[-1])
+    
+    assert np.allclose(x_recon.detach().cpu(), x.detach().cpu(), rtol=1e-3, atol=1e-3)
+
+@pytest.mark.parametrize("device", [f'cuda:{gpu_idx}'])   
+def test_MelSpectrogram_Parallel(device):
+    spec_layer = MelSpectrogram(sr=22050, n_fft=2048, n_mels=128, hop_length=512,
+                                            window='hann', center=True, pad_mode='reflect', 
+                                            power=2.0, htk=False, fmin=0.0, fmax=None, norm=1, 
+                                            verbose=True).to(device)
+    spec_layer_parallel = torch.nn.DataParallel(spec_layer)
+    spec = spec_layer_parallel(x)
+
+@pytest.mark.parametrize("device", [f'cuda:{gpu_idx}'])    
+def test_MFCC_Parallel(device):
+    spec_layer = MFCC().to(device)
+    spec_layer_parallel = torch.nn.DataParallel(spec_layer)
+    spec = spec_layer_parallel(x)    
+
+@pytest.mark.parametrize("device", [f'cuda:{gpu_idx}'])
+def test_CQT1992_Parallel(device):
+    spec_layer = CQT1992(fmin=110, n_bins=60, bins_per_octave=12).to(device)
+    spec_layer_parallel = torch.nn.DataParallel(spec_layer)
+    spec = spec_layer_parallel(x) 
+
+@pytest.mark.parametrize("device", [f'cuda:{gpu_idx}'])    
+def test_CQT1992v2_Parallel(device):
+    spec_layer = CQT1992v2().to(device)
+    spec_layer_parallel = torch.nn.DataParallel(spec_layer)
+    spec = spec_layer_parallel(x)        
+
+@pytest.mark.parametrize("device", [f'cuda:{gpu_idx}'])    
+def test_CQT2010_Parallel(device):
+    spec_layer = CQT2010().to(device)
+    spec_layer_parallel = torch.nn.DataParallel(spec_layer)
+    spec = spec_layer_parallel(x)  
+    
+@pytest.mark.parametrize("device", [f'cuda:{gpu_idx}'])    
+def test_CQT2010v2_Parallel(device):
+    spec_layer = CQT2010v2().to(device)
+    spec_layer_parallel = torch.nn.DataParallel(spec_layer)
+    spec = spec_layer_parallel(x)       
\ No newline at end of file