From 0ffe1f91143b0489fd38be90747afcbb5e61fedc Mon Sep 17 00:00:00 2001
From: huangyuxin
Date: Mon, 28 Mar 2022 03:35:55 +0000
Subject: [PATCH] replace kaidi_fbank with paddleaudio

---
 examples/aishell/asr1/conf/preprocess.yaml   |  9 +--
 paddlespeech/s2t/transform/spectrogram.py    | 65 ++++++++++++++++++++
 paddlespeech/s2t/transform/transformation.py |  1 +
 3 files changed, 69 insertions(+), 6 deletions(-)

diff --git a/examples/aishell/asr1/conf/preprocess.yaml b/examples/aishell/asr1/conf/preprocess.yaml
index f7f4c58d..a20ff2ab 100644
--- a/examples/aishell/asr1/conf/preprocess.yaml
+++ b/examples/aishell/asr1/conf/preprocess.yaml
@@ -3,8 +3,9 @@ process:
   - type: fbank_kaldi
     fs: 16000
     n_mels: 80
-    n_shift: 160
-    win_length: 400
+    n_frame_length: 25
+    n_frame_shift: 10
+    energy_floor: 0.0
     dither: 0.1
   - type: cmvn_json
     cmvn_path: data/mean_std.json
@@ -23,7 +24,3 @@ process:
     n_mask: 2
     inplace: true
     replace_with_zero: false
-
-
-
-
diff --git a/paddlespeech/s2t/transform/spectrogram.py b/paddlespeech/s2t/transform/spectrogram.py
index 889cd349..f779b07d 100644
--- a/paddlespeech/s2t/transform/spectrogram.py
+++ b/paddlespeech/s2t/transform/spectrogram.py
@@ -14,8 +14,11 @@
 # Modified from espnet(https://github.com/espnet/espnet)
 import librosa
 import numpy as np
+import paddle
 from python_speech_features import logfbank
 
+import paddleaudio.compliance.kaldi as kaldi
+
 
 def stft(x,
          n_fft,
@@ -309,6 +312,68 @@ class IStft():
 
 
 class LogMelSpectrogramKaldi():
+    """Kaldi-compatible log-mel filterbank (fbank) extractor.
+
+    Features are computed by ``paddleaudio.compliance.kaldi.fbank``.
+
+    Args:
+        fs: sample rate of the input waveform, passed on as ``sr``.
+        n_mels: number of mel bins.
+        n_frame_length: passed on as ``frame_length`` (milliseconds in
+            the paddleaudio API; confirm against the installed version).
+        n_frame_shift: passed on as ``frame_shift`` (same unit).
+        energy_floor: passed on as ``energy_floor``.
+        dither: dither constant, applied only when ``train`` is true.
+    """
+
+    def __init__(self,
+                 fs=16000,
+                 n_mels=80,
+                 n_frame_length=25,
+                 n_frame_shift=10,
+                 energy_floor=0.0,
+                 dither=0.1):
+        self.fs = fs
+        self.n_mels = n_mels
+        self.n_frame_length = n_frame_length
+        self.n_frame_shift = n_frame_shift
+        self.energy_floor = energy_floor
+        self.dither = dither
+
+    def __repr__(self):
+        return (
+            "{name}(fs={fs}, n_mels={n_mels}, "
+            "n_frame_shift={n_frame_shift}, n_frame_length={n_frame_length}, "
+            "energy_floor={energy_floor}, dither={dither})".format(
+                name=self.__class__.__name__,
+                fs=self.fs,
+                n_mels=self.n_mels,
+                n_frame_shift=self.n_frame_shift,
+                n_frame_length=self.n_frame_length,
+                energy_floor=self.energy_floor,
+                dither=self.dither, ))
+
+    def __call__(self, x, train):
+        """Return the fbank features of waveform ``x`` as a numpy array.
+
+        Dither is switched off (0.0) whenever ``train`` is false.
+        """
+        dither = self.dither if train else 0.0
+        # kaldi.fbank is fed a float32 tensor with a leading axis added.
+        waveform = paddle.to_tensor(np.expand_dims(x, 0), dtype=paddle.float32)
+        mat = kaldi.fbank(
+            waveform,
+            n_mels=self.n_mels,
+            frame_length=self.n_frame_length,
+            frame_shift=self.n_frame_shift,
+            dither=dither,
+            energy_floor=self.energy_floor,
+            sr=self.fs)
+        mat = np.squeeze(mat.numpy())
+        return mat
+
+
+class LogMelSpectrogramKaldi_decay():
     def __init__(
             self,
             fs=16000,
diff --git a/paddlespeech/s2t/transform/transformation.py b/paddlespeech/s2t/transform/transformation.py
index 381b0cdc..3b433cb0 100644
--- a/paddlespeech/s2t/transform/transformation.py
+++ b/paddlespeech/s2t/transform/transformation.py
@@ -31,6 +31,7 @@ import_alias = dict(
     freq_mask="paddlespeech.s2t.transform.spec_augment:FreqMask",
     spec_augment="paddlespeech.s2t.transform.spec_augment:SpecAugment",
     speed_perturbation="paddlespeech.s2t.transform.perturb:SpeedPerturbation",
+    speed_perturbation_sox="paddlespeech.s2t.transform.perturb:SpeedPerturbationSox",
     volume_perturbation="paddlespeech.s2t.transform.perturb:VolumePerturbation",
     noise_injection="paddlespeech.s2t.transform.perturb:NoiseInjection",
     bandpass_perturbation="paddlespeech.s2t.transform.perturb:BandpassPerturbation",
-- 
GitLab