Merge pull request #1432 from Jackwaterveg/fix

[Bug fix] fix resample

Merge pull request #1432 from Jackwaterveg/fix
[Bug fix] fix resample
70ebbfd8 · 小湉湉 · GitHub · e0280ff9 · 9a55783a · 70ebbfd8
4 changed files
--- a/paddlespeech/cli/asr/infer.py
+++ b/paddlespeech/cli/asr/infer.py
@@ -311,8 +311,10 @@ class ASRExecutor(BaseExecutor):
                    audio = audio[:, 0]
                # pcm16 -> pcm 32
                audio = self._pcm16to32(audio)
-                audio = librosa.resample(audio, audio_sample_rate,
-                                         self.sample_rate)
+                audio = librosa.resample(
+                    audio,
+                    orig_sr=audio_sample_rate,
+                    target_sr=self.sample_rate)
                audio_sample_rate = self.sample_rate
                # pcm32 -> pcm 16
                audio = self._pcm32to16(audio)

--- a/paddlespeech/s2t/transform/perturb.py
+++ b/paddlespeech/s2t/transform/perturb.py
@@ -90,7 +90,8 @@ class SpeedPerturbation():

        # Note1: resample requires the sampling-rate of input and output,
        #        but actually only the ratio is used.
-        y = librosa.resample(x, ratio, 1, res_type=self.res_type)
+        y = librosa.resample(
+            x, orig_sr=ratio, target_sr=1, res_type=self.res_type)

        if self.keep_length:
            diff = abs(len(x) - len(y))

--- a/paddlespeech/s2t/transform/spectrogram.py
+++ b/paddlespeech/s2t/transform/spectrogram.py
@@ -38,7 +38,7 @@ def stft(x,
    x = np.stack(
        [
            librosa.stft(
-                x[:, ch],
+                y=x[:, ch],
                n_fft=n_fft,
                hop_length=n_shift,
                win_length=win_length,
@@ -67,7 +67,7 @@ def istft(x, n_shift, win_length=None, window="hann", center=True):
    x = np.stack(
        [
            librosa.istft(
-                x[:, ch].T,  # [Time, Freq] -> [Freq, Time]
+                y=x[:, ch].T,  # [Time, Freq] -> [Freq, Time]
                hop_length=n_shift,
                win_length=win_length,
                window=window,
@@ -95,7 +95,8 @@ def stft2logmelspectrogram(x_stft,
    # spc: (Time, Channel, Freq) or (Time, Freq)
    spc = np.abs(x_stft)
    # mel_basis: (Mel_freq, Freq)
-    mel_basis = librosa.filters.mel(fs, n_fft, n_mels, fmin, fmax)
+    mel_basis = librosa.filters.mel(
+        sr=fs, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax)
    # lmspc: (Time, Channel, Mel_freq) or (Time, Mel_freq)
    lmspc = np.log10(np.maximum(eps, np.dot(spc, mel_basis.T)))


--- a/paddlespeech/vector/exps/ge2e/audio_processor.py
+++ b/paddlespeech/vector/exps/ge2e/audio_processor.py
@@ -206,7 +206,8 @@ class SpeakerVerificationPreprocessor(object):

        # Resample if numpy.array is passed and sr does not match
        if source_sr is not None and source_sr != self.sampling_rate:
-            wav = librosa.resample(wav, source_sr, self.sampling_rate)
+            wav = librosa.resample(
+                wav, orig_sr=source_sr, target_sr=self.sampling_rate)

        # loudness normalization
        wav = normalize_volume(
@@ -221,7 +222,7 @@ class SpeakerVerificationPreprocessor(object):

    def melspectrogram(self, wav):
        mel = librosa.feature.melspectrogram(
-            wav,
+            y=wav,
            sr=self.sampling_rate,
            n_fft=self.n_fft,
            hop_length=self.hop_length,