diff --git a/paddleaudio/tests/benchmark/README.md b/paddleaudio/tests/benchmark/README.md index 9655632df42f636c5f21ecbfd2808bb1ca69fffa..b391788b9e8a605cb29eaad1c3848efdef0928ad 100644 --- a/paddleaudio/tests/benchmark/README.md +++ b/paddleaudio/tests/benchmark/README.md @@ -17,24 +17,31 @@ platform linux -- Python 3.7.7, pytest-7.0.1, pluggy-1.0.0 benchmark: 3.4.1 (defaults: timer=time.perf_counter disable_gc=False min_rounds=5 min_time=0.000005 max_time=1.0 calibration_precision=10 warmup=False warmup_iterations=100000) rootdir: /ssd3/chenxiaojie06/PaddleSpeech/DeepSpeech/paddleaudio plugins: typeguard-2.12.1, benchmark-3.4.1, anyio-3.5.0 -collected 6 items +collected 12 items -features.py ...... [100%] +features.py ............ [100%] -------------------------------------------------------------------------------------------------- benchmark: 6 tests ------------------------------------------------------------------------------------------------ -Name (time in us) Min Max Mean StdDev Median IQR Outliers OPS Rounds Iterations ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -test_melspect_gpu 632.2041 (1.0) 898.7449 (1.0) 709.3824 (1.0) 109.7022 (6.91) 676.1923 (1.0) 115.2642 (22.19) 1;0 1,409.6768 (1.0) 5 1 -test_log_melspect_gpu 912.9159 (1.44) 1,222.0535 (1.36) 931.2489 (1.31) 34.4270 (2.17) 924.9896 (1.37) 5.1949 (1.0) 4;13 1,073.8268 (0.76) 82 1 -test_mfcc_gpu 1,244.8374 (1.97) 1,321.3232 (1.47) 1,262.1319 (1.78) 15.8698 (1.0) 1,258.3155 (1.86) 14.1086 (2.72) 17;9 792.3102 (0.56) 91 1 -test_melspect_cpu 19,106.5744 (30.22) 46,194.2125 (51.40) 27,458.7850 (38.71) 9,786.1071 (616.65) 23,830.0692 (35.24) 14,344.4724 (>1000.0) 3;0 36.4182 (0.03) 14 1 -test_log_melspect_cpu 19,513.7132 (30.87) 20,367.2443 (22.66) 19,765.4018 (27.86) 167.1289 (10.53) 19,750.2729 (29.21) 188.9346 (36.37) 16;1 50.5935 (0.04) 49 1 -test_mfcc_cpu 19,881.3528 (31.45) 20,427.2158 (22.73) 20,104.6574 (28.34) 129.5621 (8.16) 20,075.8977 (29.69) 150.9022 (29.05) 12;2 49.7397 (0.04) 48 1 ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +---------------------------------------------------------------------------------------------------- benchmark: 12 tests ---------------------------------------------------------------------------------------------------- +Name (time in us) Min Max Mean StdDev Median IQR Outliers OPS Rounds Iterations +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +test_melspect_gpu_torchaudio 210.7229 (1.0) 338.5879 (1.0) 217.4949 (1.0) 11.3591 (1.02) 214.0319 (1.0) 8.3707 (1.0) 6;5 4,597.8093 (1.0) 186 1 +test_log_melspect_gpu_torchaudio 375.4422 (1.78) 1,024.8050 (3.03) 387.3589 (1.78) 18.7080 (1.69) 385.2872 (1.80) 9.4259 (1.13) 31;31 2,581.5853 (0.56) 1420 1 +test_mfcc_gpu_torchaudio 422.4107 (2.00) 700.7364 (2.07) 454.9903 (2.09) 47.3926 (4.27) 436.6031 (2.04) 15.4376 (1.84) 159;193 2,197.8493 (0.48) 1078 1 +test_melspect_gpu 819.3776 (3.89) 1,161.9311 (3.43) 900.9168 (4.14) 147.0245 (13.26) 830.7453 (3.88) 115.4500 (13.79) 1;1 1,109.9805 (0.24) 5 1 +test_log_melspect_gpu 1,197.9323 (5.68) 1,280.0004 (3.78) 1,214.0182 (5.58) 11.0918 (1.0) 1,211.6358 (5.66) 10.0820 (1.20) 84;31 823.7109 (0.18) 533 1 +test_mfcc_gpu 1,337.0719 (6.35) 1,601.5675 (4.73) 1,355.4527 (6.23) 26.4458 (2.38) 1,348.6911 (6.30) 13.1410 (1.57) 16;17 737.7609 (0.16) 193 1 +test_melspect_cpu_torchaudio 1,374.8817 (6.52) 3,937.5033 (11.63) 1,574.8930 (7.24) 355.4223 (32.04) 1,409.1432 (6.58) 193.7435 (23.15) 36;49 634.9638 (0.14) 291 1 +test_log_melspect_cpu_torchaudio 1,390.2634 (6.60) 2,121.2976 (6.27) 1,559.3045 (7.17) 220.3090 (19.86) 1,409.4356 (6.59) 349.1524 (41.71) 106;0 641.3116 (0.14) 445 1 +test_mfcc_cpu_torchaudio 1,445.6678 (6.86) 3,801.8432 (11.23) 1,680.8559 (7.73) 395.5443 (35.66) 1,469.8748 (6.87) 305.6149 (36.51) 38;35 594.9350 (0.13) 469 1 +test_melspect_cpu 20,620.2641 (97.85) 20,984.0760 (61.98) 20,721.4942 (95.27) 70.2757 (6.34) 20,717.8025 (96.80) 57.8668 (6.91) 6;2 48.2591 (0.01) 30 1 +test_log_melspect_cpu 21,025.3932 (99.78) 48,894.0198 (144.41) 23,057.7049 (106.01) 5,440.3207 (490.48) 21,190.5045 (99.01) 190.0699 (22.71) 4;9 43.3695 (0.01) 44 1 +test_mfcc_cpu 21,127.2798 (100.26) 45,811.5358 (135.30) 23,176.4022 (106.56) 5,041.0751 (454.49) 21,319.1714 (99.61) 149.0396 (17.80) 5;9 43.1473 (0.01) 44 1 +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- Legend: Outliers: 1 Standard Deviation from Mean; 1.5 IQR (InterQuartile Range) from 1st Quartile and 3rd Quartile. OPS: Operations Per Second, computed as 1 / Mean - ========================================================================== 6 passed in 20.51s =========================================================================== + ========================================================================== 12 passed in 26.81s ========================================================================== + ``` diff --git a/paddleaudio/tests/benchmark/features.py b/paddleaudio/tests/benchmark/features.py index 67bec9e66e07e1d0a266c22710bdaf8609aef3e1..30ef6f9947473768521b98d7b47026bc49df6b60 100644 --- a/paddleaudio/tests/benchmark/features.py +++ b/paddleaudio/tests/benchmark/features.py @@ -11,15 +11,28 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import os +import urllib.request + import librosa import numpy as np import paddle +import torch +import torchaudio import paddleaudio +wav_url = 'https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav' +if not os.path.isfile(os.path.basename(wav_url)): + urllib.request.urlretrieve(wav_url, os.path.basename(wav_url)) + +waveform, sr = paddleaudio.load(os.path.abspath(os.path.basename(wav_url))) +waveform_tensor = paddle.to_tensor(waveform).unsqueeze(0) +waveform_tensor_torch = torch.from_numpy(waveform).unsqueeze(0) + # Feature conf mel_conf = { - 'sr': 16000, + 'sr': sr, 'n_fft': 512, 'hop_length': 128, 'n_mels': 40, @@ -30,9 +43,18 @@ mfcc_conf = { } mfcc_conf.update(mel_conf) -input_shape = (48000) -waveform = np.random.random(size=input_shape) -waveform_tensor = paddle.to_tensor(waveform).unsqueeze(0) +mel_conf_torchaudio = { + 'sample_rate': sr, + 'n_fft': 512, + 'hop_length': 128, + 'n_mels': 40, + 'norm': 'slaney', + 'mel_scale': 'slaney', +} +mfcc_conf_torchaudio = { + 'sample_rate': sr, + 'n_mfcc': 20, +} def enable_cpu_device(): @@ -56,7 +78,7 @@ def test_melspect_cpu(benchmark): feature_paddleaudio = benchmark(melspectrogram) feature_librosa = librosa.feature.melspectrogram(waveform, **mel_conf) np.testing.assert_array_almost_equal( - feature_librosa, feature_paddleaudio, decimal=4) + feature_librosa, feature_paddleaudio, decimal=3) def test_melspect_gpu(benchmark): @@ -64,11 +86,39 @@ def test_melspect_gpu(benchmark): feature_paddleaudio = benchmark(melspectrogram) feature_librosa = librosa.feature.melspectrogram(waveform, **mel_conf) np.testing.assert_array_almost_equal( - feature_librosa, feature_paddleaudio, decimal=4) + feature_librosa, feature_paddleaudio, decimal=3) + + +mel_extractor_torchaudio = torchaudio.transforms.MelSpectrogram( + **mel_conf_torchaudio, f_min=0.0) + + +def melspectrogram_torchaudio(): + return mel_extractor_torchaudio(waveform_tensor_torch).squeeze(0) + + +def test_melspect_cpu_torchaudio(benchmark): + global waveform_tensor_torch, mel_extractor_torchaudio + mel_extractor_torchaudio = mel_extractor_torchaudio.to('cpu') + waveform_tensor_torch = waveform_tensor_torch.to('cpu') + feature_paddleaudio = benchmark(melspectrogram_torchaudio) + feature_librosa = librosa.feature.melspectrogram(waveform, **mel_conf) + np.testing.assert_array_almost_equal( + feature_librosa, feature_paddleaudio, decimal=3) + + +def test_melspect_gpu_torchaudio(benchmark): + global waveform_tensor_torch, mel_extractor_torchaudio + mel_extractor_torchaudio = mel_extractor_torchaudio.to('cuda') + waveform_tensor_torch = waveform_tensor_torch.to('cuda') + feature_torchaudio = benchmark(melspectrogram_torchaudio) + feature_librosa = librosa.feature.melspectrogram(waveform, **mel_conf) + np.testing.assert_array_almost_equal( + feature_librosa, feature_torchaudio.cpu(), decimal=3) log_mel_extractor = paddleaudio.features.LogMelSpectrogram( - **mel_conf, f_min=0.0, dtype=waveform_tensor.dtype) + **mel_conf, f_min=0.0, top_db=80.0, dtype=waveform_tensor.dtype) def log_melspectrogram(): @@ -79,18 +129,54 @@ def test_log_melspect_cpu(benchmark): enable_cpu_device() feature_paddleaudio = benchmark(log_melspectrogram) feature_librosa = librosa.feature.melspectrogram(waveform, **mel_conf) - feature_librosa = librosa.power_to_db(feature_librosa, top_db=None) + feature_librosa = librosa.power_to_db(feature_librosa, top_db=80.0) np.testing.assert_array_almost_equal( - feature_librosa, feature_paddleaudio, decimal=4) + feature_librosa, feature_paddleaudio, decimal=3) def test_log_melspect_gpu(benchmark): enable_gpu_device() feature_paddleaudio = benchmark(log_melspectrogram) feature_librosa = librosa.feature.melspectrogram(waveform, **mel_conf) - feature_librosa = librosa.power_to_db(feature_librosa, top_db=None) + feature_librosa = librosa.power_to_db(feature_librosa, top_db=80.0) + np.testing.assert_array_almost_equal( + feature_librosa, feature_paddleaudio, decimal=2) + + +amplitude_to_DB = torchaudio.transforms.AmplitudeToDB('power', top_db=80.0) + + +def log_melspectrogram_torchaudio(): + mel_specgram = mel_extractor_torchaudio(waveform_tensor_torch) + return amplitude_to_DB(mel_specgram).squeeze(0) + + +def test_log_melspect_cpu_torchaudio(benchmark): + global waveform_tensor_torch, mel_extractor_torchaudio, amplitude_to_DB + + mel_extractor_torchaudio = mel_extractor_torchaudio.to('cpu') + waveform_tensor_torch = waveform_tensor_torch.to('cpu') + amplitude_to_DB = amplitude_to_DB.to('cpu') + + feature_paddleaudio = benchmark(log_melspectrogram_torchaudio) + feature_librosa = librosa.feature.melspectrogram(waveform, **mel_conf) + feature_librosa = librosa.power_to_db(feature_librosa, top_db=80.0) + np.testing.assert_array_almost_equal( + feature_librosa, feature_paddleaudio, decimal=3) + + +def test_log_melspect_gpu_torchaudio(benchmark): + global waveform_tensor_torch, mel_extractor_torchaudio, amplitude_to_DB + + mel_extractor_torchaudio = mel_extractor_torchaudio.to('cuda') + waveform_tensor_torch = waveform_tensor_torch.to('cuda') + amplitude_to_DB = amplitude_to_DB.to('cuda') + + feature_torchaudio = benchmark(log_melspectrogram_torchaudio) + feature_librosa = librosa.feature.melspectrogram(waveform, **mel_conf) + feature_librosa = librosa.power_to_db(feature_librosa, top_db=80.0) np.testing.assert_array_almost_equal( - feature_librosa, feature_paddleaudio, decimal=4) + feature_librosa, feature_torchaudio.cpu(), decimal=2) mfcc_extractor = paddleaudio.features.MFCC( @@ -106,7 +192,7 @@ def test_mfcc_cpu(benchmark): feature_paddleaudio = benchmark(mfcc) feature_librosa = librosa.feature.mfcc(waveform, **mel_conf) np.testing.assert_array_almost_equal( - feature_librosa, feature_paddleaudio, decimal=4) + feature_librosa, feature_paddleaudio, decimal=3) def test_mfcc_gpu(benchmark): @@ -114,4 +200,37 @@ def test_mfcc_gpu(benchmark): feature_paddleaudio = benchmark(mfcc) feature_librosa = librosa.feature.mfcc(waveform, **mel_conf) np.testing.assert_array_almost_equal( - feature_librosa, feature_paddleaudio, decimal=4) + feature_librosa, feature_paddleaudio, decimal=3) + + +del mel_conf_torchaudio['sample_rate'] +mfcc_extractor_torchaudio = torchaudio.transforms.MFCC( + **mfcc_conf_torchaudio, melkwargs=mel_conf_torchaudio) + + +def mfcc_torchaudio(): + return mfcc_extractor_torchaudio(waveform_tensor_torch).squeeze(0) + + +def test_mfcc_cpu_torchaudio(benchmark): + global waveform_tensor_torch, mfcc_extractor_torchaudio + + mel_extractor_torchaudio = mfcc_extractor_torchaudio.to('cpu') + waveform_tensor_torch = waveform_tensor_torch.to('cpu') + + feature_paddleaudio = benchmark(mfcc_torchaudio) + feature_librosa = librosa.feature.mfcc(waveform, **mel_conf) + np.testing.assert_array_almost_equal( + feature_librosa, feature_paddleaudio, decimal=3) + + +def test_mfcc_gpu_torchaudio(benchmark): + global waveform_tensor_torch, mfcc_extractor_torchaudio + + mel_extractor_torchaudio = mfcc_extractor_torchaudio.to('cuda') + waveform_tensor_torch = waveform_tensor_torch.to('cuda') + + feature_torchaudio = benchmark(mfcc_torchaudio) + feature_librosa = librosa.feature.mfcc(waveform, **mel_conf) + np.testing.assert_array_almost_equal( + feature_librosa, feature_torchaudio.cpu(), decimal=3)