Merge branch 'develop' of https://github.com/PaddlePaddle/PaddleSpeech into rename_tacotron2

30085ac2 · 小湉湉 · 25347bb6 · 70ebbfd8 · 30085ac2 · 30085ac2
4 changed files
--- a/paddlespeech/cli/asr/infer.py
+++ b/paddlespeech/cli/asr/infer.py
@@ -311,8 +311,10 @@ class ASRExecutor(BaseExecutor):
                    audio = audio[:, 0]
                # pcm16 -> pcm 32
                audio = self._pcm16to32(audio)
-                audio = librosa.resample(audio, audio_sample_rate,
-                                         self.sample_rate)
+                audio = librosa.resample(
+                    audio,
+                    orig_sr=audio_sample_rate,
+                    target_sr=self.sample_rate)
                audio_sample_rate = self.sample_rate
                # pcm32 -> pcm 16
                audio = self._pcm32to16(audio)

--- a/paddlespeech/s2t/transform/perturb.py
+++ b/paddlespeech/s2t/transform/perturb.py
@@ -90,7 +90,8 @@ class SpeedPerturbation():

        # Note1: resample requires the sampling-rate of input and output,
        #        but actually only the ratio is used.
-        y = librosa.resample(x, ratio, 1, res_type=self.res_type)
+        y = librosa.resample(
+            x, orig_sr=ratio, target_sr=1, res_type=self.res_type)

        if self.keep_length:
            diff = abs(len(x) - len(y))

--- a/paddlespeech/s2t/transform/spectrogram.py
+++ b/paddlespeech/s2t/transform/spectrogram.py
@@ -38,7 +38,7 @@ def stft(x,
    x = np.stack(
        [
            librosa.stft(
-                x[:, ch],
+                y=x[:, ch],
                n_fft=n_fft,
                hop_length=n_shift,
                win_length=win_length,
@@ -67,7 +67,7 @@ def istft(x, n_shift, win_length=None, window="hann", center=True):
    x = np.stack(
        [
            librosa.istft(
-                x[:, ch].T,  # [Time, Freq] -> [Freq, Time]
+                y=x[:, ch].T,  # [Time, Freq] -> [Freq, Time]
                hop_length=n_shift,
                win_length=win_length,
                window=window,
@@ -95,7 +95,8 @@ def stft2logmelspectrogram(x_stft,
    # spc: (Time, Channel, Freq) or (Time, Freq)
    spc = np.abs(x_stft)
    # mel_basis: (Mel_freq, Freq)
-    mel_basis = librosa.filters.mel(fs, n_fft, n_mels, fmin, fmax)
+    mel_basis = librosa.filters.mel(
+        sr=fs, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax)
    # lmspc: (Time, Channel, Mel_freq) or (Time, Mel_freq)
    lmspc = np.log10(np.maximum(eps, np.dot(spc, mel_basis.T)))


--- a/paddlespeech/vector/exps/ge2e/audio_processor.py
+++ b/paddlespeech/vector/exps/ge2e/audio_processor.py
@@ -127,7 +127,7 @@ def compute_partial_slices(n_samples: int,
    partial_utterance_n_frames : int
        the number of mel spectrogram frames in each partial utterance.

-    min_pad_coverage : int 
+    min_pad_coverage : int
        when reaching the last partial utterance, it may or may not have enough frames.
        If at least <min_pad_coverage> of <partial_utterance_n_frames> are present,
        then the last partial utterance will be considered, as if we padded the audio. Otherwise,
@@ -137,7 +137,7 @@ def compute_partial_slices(n_samples: int,
        by how much the partial utterance should overlap. If set to 0, the partial utterances are entirely disjoint.
    Returns
    ----------
-        the waveform slices and mel spectrogram slices as lists of array slices. 
+        the waveform slices and mel spectrogram slices as lists of array slices.
        Index respectively the waveform and the mel spectrogram with these slices to obtain the partialutterances.
    """
    assert 0 <= overlap < 1
@@ -206,7 +206,8 @@ class SpeakerVerificationPreprocessor(object):

        # Resample if numpy.array is passed and sr does not match
        if source_sr is not None and source_sr != self.sampling_rate:
-            wav = librosa.resample(wav, source_sr, self.sampling_rate)
+            wav = librosa.resample(
+                wav, orig_sr=source_sr, target_sr=self.sampling_rate)

        # loudness normalization
        wav = normalize_volume(
@@ -221,7 +222,7 @@ class SpeakerVerificationPreprocessor(object):

    def melspectrogram(self, wav):
        mel = librosa.feature.melspectrogram(
-            wav,
+            y=wav,
            sr=self.sampling_rate,
            n_fft=self.n_fft,
            hop_length=self.hop_length,