test_mfcc_librosa_compat.py

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools

import librosa
import numpy as np
import paddle
import paddleaudio
import pytest
from utils import load_example_audio1

eps_float32 = 1e-3
eps_float64 = 2.2e-5
# Pre-loading to speed up the test
signal, _ = load_example_audio1()
signal_tensor = paddle.to_tensor(signal)


def generate_mfcc_test():
    sr = [16000]
    n_fft = [512]  #, 1024]
    hop_length = [160]  #, 400]
    win_length = [512]
    window = ['hann']  # 'hamming', ('gaussian', 50)]
    center = [True]  #, False]
    pad_mode = ['reflect', 'constant']
    power = [2.0]
    n_mels = [64]  #32]
    fmin = [0, 10]
    fmax = [8000, None]
    dtype = ['float32', 'float64']
    device = ['gpu', 'cpu']
    n_mfcc = [40, 20]
    htk = [True]
    args = [
        sr, n_fft, hop_length, win_length, window, center, pad_mode, power,
        n_mels, fmin, fmax, dtype, device, n_mfcc, htk
    ]
    return itertools.product(*args)


@pytest.mark.parametrize(
    'sr, n_fft, hop_length, win_length, window, center, pad_mode, power,\
        n_mels, fmin, fmax,dtype,device,n_mfcc,htk', generate_mfcc_test())
def test_mfcc_case(sr, n_fft, hop_length, win_length, window, center, pad_mode, power,\
        n_mels, fmin, fmax,dtype,device,n_mfcc,htk):
    # paddle.set_device(device)
    # hop_length = 160
    # win_length = 512
    # window = 'hann'
    # pad_mode = 'constant'
    # power = 2.0
    # sample_rate = 16000
    # center = True
    # f_min = 0.0

    # for librosa, the norm is default to 'slaney'
    expected = librosa.feature.mfcc(signal,
                                    sr=sr,
                                    n_mfcc=n_mfcc,
                                    n_fft=win_length,
                                    hop_length=hop_length,
                                    win_length=win_length,
                                    window=window,
                                    center=center,
                                    n_mels=n_mels,
                                    pad_mode=pad_mode,
                                    fmin=fmin,
                                    fmax=fmax,
                                    htk=htk,
                                    power=2.0)

    paddle_mfcc = paddleaudio.functional.mfcc(signal_tensor,
                                              sr=sr,
                                              n_mfcc=n_mfcc,
                                              n_fft=win_length,
                                              hop_length=hop_length,
                                              win_length=win_length,
                                              window=window,
                                              center=center,
                                              n_mels=n_mels,
                                              pad_mode=pad_mode,
                                              f_min=fmin,
                                              f_max=fmax,
                                              htk=htk,
                                              norm='slaney',
                                              dtype=dtype)

    paddle_librosa_diff = np.mean(np.abs(expected - paddle_mfcc.numpy()))
    if dtype == 'float64':
        assert paddle_librosa_diff < eps_float64
    else:
        assert paddle_librosa_diff < eps_float32

    try:  # if we have torchaudio installed
        import torch
        import torchaudio
        kwargs = {
            'n_fft': win_length,
            'hop_length': hop_length,
            'win_length': win_length,
            # 'window':window,
            'center': center,
            'n_mels': n_mels,
            'pad_mode': pad_mode,
            'f_min': fmin,
            'f_max': fmax,
            'mel_scale': 'htk',
            'norm': 'slaney',
            'power': 2.0
        }
        torch_mfcc_transform = torchaudio.transforms.MFCC(n_mfcc=20,
                                                          log_mels=False,
                                                          melkwargs=kwargs)
        torch_mfcc = torch_mfcc_transform(torch.tensor(signal))
        paddle_torch_mfcc_diff = np.mean(
            np.abs(paddle_mfcc.numpy() - torch_mfcc.numpy()))
        assert paddle_torch_mfcc_diff < 5e-5
        torch_librosa_mfcc_diff = np.mean(np.abs(torch_mfcc.numpy() - expected))
        assert torch_librosa_mfcc_diff < 5e-5
    except:
        pass


#test_mfcc_case(512, 40, 20, True, 8000, 'cpu','float64',eps_float64)