# test_audio_functions.py
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest

import librosa
import numpy as np
import os
import paddle

import paddle.audio
from scipy import signal
import itertools
from parameterized import parameterized


def parameterize(*params):
    """Build a ``parameterized.expand`` decorator over a Cartesian product.

    Each positional argument is an iterable of candidate values for one test
    parameter; the decorated test is expanded once per combination.
    """
    combinations = itertools.product(*params)
    return parameterized.expand(list(combinations))


class TestAudioFuncitons(unittest.TestCase):
    """Compare ``paddle.audio`` / ``paddle.signal`` with reference results.

    References come from librosa, SciPy, or a NumPy formula written inside
    the test itself.
    """

    def setUp(self):
        """Rebuild the shared parameters and waveform before each test."""
        self.initParmas()

    def initParmas(self):
        """Set default feature parameters and synthesize ``self.waveform``."""

        def get_wav_data(dtype: str, num_channels: int, num_frames: int):
            """Return a linear ramp scaled by 0.1, tiled once per channel."""
            dtype_ = getattr(paddle, dtype)
            base = paddle.linspace(-1.0, 1.0, num_frames, dtype=dtype_) * 0.1
            return base.tile([num_channels, 1])

        self.n_fft = 512
        self.hop_length = 128
        self.n_mels = 40
        self.n_mfcc = 20
        self.fmin = 0.0
        self.window_str = 'hann'
        self.pad_mode = 'reflect'
        self.top_db = 80.0
        self.duration = 0.5
        self.num_channels = 1
        self.sr = 16000
        self.dtype = "float32"
        self.window_size = 1024

        # duration * sr is a float product (0.5 * 16000 == 8000.0); convert
        # it so the helper receives the integer frame count it declares.
        num_frames = int(self.duration * self.sr)
        waveform_tensor = get_wav_data(
            self.dtype, self.num_channels, num_frames=num_frames
        )
        self.waveform = waveform_tensor.numpy()

    @parameterize([1.0, 3.0, 9.0, 25.0], [True, False])
    def test_audio_function(self, val: float, htk_flag: bool):
        """Check hz<->mel and power->dB conversions against librosa."""
        # librosa options are passed by keyword: releases >= 0.10 make them
        # keyword-only, and older releases accept the same names.
        mel_paddle = paddle.audio.functional.hz_to_mel(val, htk_flag)
        mel_paddle_tensor = paddle.audio.functional.hz_to_mel(
            paddle.to_tensor(val), htk_flag
        )
        mel_librosa = librosa.hz_to_mel(val, htk=htk_flag)
        np.testing.assert_almost_equal(mel_paddle, mel_librosa, decimal=5)
        np.testing.assert_almost_equal(
            mel_paddle_tensor.numpy(), mel_librosa, decimal=4
        )

        hz_paddle = paddle.audio.functional.mel_to_hz(val, htk_flag)
        hz_paddle_tensor = paddle.audio.functional.mel_to_hz(
            paddle.to_tensor(val), htk_flag
        )
        hz_librosa = librosa.mel_to_hz(val, htk=htk_flag)
        np.testing.assert_almost_equal(hz_paddle, hz_librosa, decimal=4)
        np.testing.assert_almost_equal(
            hz_paddle_tensor.numpy(), hz_librosa, decimal=4
        )

        decibel_paddle = paddle.audio.functional.power_to_db(
            paddle.to_tensor(val)
        )
        decibel_librosa = librosa.power_to_db(val)
        # Compare against the librosa reference; checking the paddle result
        # against itself could never fail.
        np.testing.assert_almost_equal(
            decibel_paddle.numpy(), decibel_librosa, decimal=5
        )

    @parameterize(
        [64, 128, 256], [0.0, 0.5, 1.0], [10000, 11025], [False, True]
    )
    def test_audio_function_mel(
        self, n_mels: int, f_min: float, f_max: float, htk_flag: bool
    ):
        """Check ``mel_frequencies`` against librosa."""
        librosa_mel_freq = librosa.mel_frequencies(
            n_mels=n_mels, fmin=f_min, fmax=f_max, htk=htk_flag
        )
        paddle_mel_freq = paddle.audio.functional.mel_frequencies(
            n_mels, f_min, f_max, htk_flag, 'float64'
        )
        np.testing.assert_almost_equal(
            paddle_mel_freq, librosa_mel_freq, decimal=3
        )

    @parameterize([8000, 16000], [64, 128, 256])
    def test_audio_function_fft(self, sr: int, n_fft: int):
        """Check ``fft_frequencies`` against librosa."""
        librosa_fft = librosa.fft_frequencies(sr=sr, n_fft=n_fft)
        paddle_fft = paddle.audio.functional.fft_frequencies(sr, n_fft)
        np.testing.assert_almost_equal(paddle_fft, librosa_fft, decimal=5)

    @parameterize([1.0, 3.0, 9.0])
    def test_audio_function_exception(self, spect: float):
        """Assert ``power_to_db`` raises for amin=0, ref_value=0, top_db=-1."""
        spect_tensor = paddle.to_tensor([spect])
        invalid_kwargs = ({"amin": 0}, {"ref_value": 0}, {"top_db": -1})
        for kwargs in invalid_kwargs:
            with self.subTest(**kwargs):
                # assertRaises (not try/except/pass) so the test fails when
                # the invalid argument is silently accepted. The broad
                # Exception type mirrors what the handler used to catch.
                with self.assertRaises(Exception):
                    paddle.audio.functional.power_to_db(
                        spect_tensor, **kwargs
                    )

    @parameterize(
        [
            "hamming",
            "hann",
            "triang",
            "bohman",
            "blackman",
            "cosine",
            "tukey",
            "taylor",
        ],
        [1, 512],
    )
    def test_window(self, window_type: str, n_fft: int):
        """Check named windows against ``scipy.signal.get_window``."""
        window_scipy = signal.get_window(window_type, n_fft)
        window_paddle = paddle.audio.functional.get_window(window_type, n_fft)
        np.testing.assert_array_almost_equal(
            window_scipy, window_paddle.numpy(), decimal=5
        )

    @parameterize([1, 512])
    def test_gaussian_window_and_exception(self, n_fft: int):
        """Check tuple-specified windows and invalid ``get_window`` calls."""
        window_scipy_gaussian = signal.windows.gaussian(n_fft, std=7)
        window_paddle_gaussian = paddle.audio.functional.get_window(
            ('gaussian', 7), n_fft, False
        )
        np.testing.assert_array_almost_equal(
            window_scipy_gaussian, window_paddle_gaussian.numpy(), decimal=5
        )

        window_scipy_general_gaussian = signal.windows.general_gaussian(
            n_fft, 1, 7
        )
        window_paddle_general_gaussian = paddle.audio.functional.get_window(
            ('general_gaussian', 1, 7), n_fft, False
        )
        # Compare the general-gaussian pair; re-checking the plain gaussian
        # pair here would leave this window untested.
        np.testing.assert_array_almost_equal(
            window_scipy_general_gaussian,
            window_paddle_general_gaussian.numpy(),
            decimal=5,
        )

        window_scipy_exp = signal.windows.exponential(n_fft)
        window_paddle_exp = paddle.audio.functional.get_window(
            ('exponential', None, 1), n_fft, False
        )
        np.testing.assert_array_almost_equal(
            window_scipy_exp, window_paddle_exp.numpy(), decimal=5
        )

        # Invalid requests must raise ValueError; assertRaises makes the
        # test fail if one of them is silently accepted.
        with self.assertRaises(ValueError):
            paddle.audio.functional.get_window("hann", -1)

        with self.assertRaises(ValueError):
            paddle.audio.functional.get_window("fake_window", self.n_fft)

        with self.assertRaises(ValueError):
            paddle.audio.functional.get_window(1043, self.n_fft)

    @parameterize([5, 13, 23], [257, 513, 1025])
    def test_create_dct(self, n_mfcc: int, n_mels: int):
        """Check ``create_dct`` against a NumPy cosine-basis reference."""

        def dct(n_filters, n_input):
            """Build the reference basis, returned as (n_input, n_filters)."""
            basis = np.empty((n_filters, n_input))
            basis[0, :] = 1.0 / np.sqrt(n_input)
            samples = np.arange(1, 2 * n_input, 2) * np.pi / (2.0 * n_input)

            for i in range(1, n_filters):
                basis[i, :] = np.cos(i * samples) * np.sqrt(2.0 / n_input)
            return basis.T

        expected_dct = dct(n_mfcc, n_mels)
        paddle_dct = paddle.audio.functional.create_dct(n_mfcc, n_mels)
        np.testing.assert_array_almost_equal(
            expected_dct, paddle_dct, decimal=5
        )

    @parameterize(
        [128, 256, 512], ["hamming", "hann", "triang", "bohman"], [True, False]
    )
    def test_stft_and_spect(
        self, n_fft: int, window_str: str, center_flag: bool
    ):
        """Check ``paddle.signal.stft`` and ``Spectrogram`` against librosa."""
        hop_length = int(n_fft / 4)
        # Work on a local so the fixture built in setUp is left untouched.
        waveform = self.waveform
        if len(waveform.shape) == 2:  # (C, T)
            waveform = waveform.squeeze(0)  # 1D input for librosa.core.stft
        feature_librosa = librosa.core.stft(
            y=waveform,
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=None,
            window=window_str,
            center=center_flag,
            dtype=None,
            pad_mode=self.pad_mode,
        )
        x = paddle.to_tensor(waveform).unsqueeze(0)
        window = paddle.audio.functional.get_window(
            window_str, n_fft, dtype=x.dtype
        )
        feature_paddle = paddle.signal.stft(
            x=x,
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=None,
            window=window,
            center=center_flag,
            pad_mode=self.pad_mode,
            normalized=False,
            onesided=True,
        ).squeeze(0)
        np.testing.assert_array_almost_equal(
            feature_librosa, feature_paddle, decimal=5
        )

        # Power spectrogram reference: squared magnitude of the librosa STFT.
        feature_bg = np.power(np.abs(feature_librosa), 2.0)
        feature_extractor = paddle.audio.features.Spectrogram(
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=None,
            window=window_str,
            power=2.0,
            center=center_flag,
            pad_mode=self.pad_mode,
        )
        feature_layer = feature_extractor(x).squeeze(0)
        np.testing.assert_array_almost_equal(
            feature_layer, feature_bg, decimal=3
        )

    @parameterize(
        [128, 256, 512], [64, 82], ["hamming", "hann", "triang", "bohman"]
    )
    def test_istft(self, n_fft: int, hop_length: int, window_str: str):
        """Check ``paddle.signal.istft`` against ``librosa.core.istft``."""
        # Work on a local so the fixture built in setUp is left untouched.
        waveform = self.waveform
        if len(waveform.shape) == 2:  # (C, T)
            waveform = waveform.squeeze(0)  # 1D input for librosa.core.stft
        # Both inverse transforms start from the same librosa STFT matrix.
        stft_matrix = librosa.core.stft(
            y=waveform,
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=None,
            window=window_str,
            center=True,
            pad_mode=self.pad_mode,
        )
        feature_librosa = librosa.core.istft(
            stft_matrix=stft_matrix,
            hop_length=hop_length,
            win_length=None,
            window=window_str,
            center=True,
            dtype=None,
            length=None,
        )
        x = paddle.to_tensor(stft_matrix).unsqueeze(0)
        window = paddle.audio.functional.get_window(
            window_str, n_fft, dtype=paddle.to_tensor(waveform).dtype
        )
        feature_paddle = paddle.signal.istft(
            x=x,
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=None,
            window=window,
            center=True,
            normalized=False,
            onesided=True,
            length=None,
            return_complex=False,
        ).squeeze(0)

        np.testing.assert_array_almost_equal(
            feature_librosa, feature_paddle, decimal=5
        )


# Run the unittest runner when this module is executed as a script.
if __name__ == '__main__':
    unittest.main()