diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py index 6e14e0d682f0a3edb3e8908ea9774816b947f6fb..ef769fbc9b910e20fd91735b70247ea00b2129e2 100644 --- a/paddlespeech/cli/asr/infer.py +++ b/paddlespeech/cli/asr/infer.py @@ -311,8 +311,10 @@ class ASRExecutor(BaseExecutor): audio = audio[:, 0] # pcm16 -> pcm 32 audio = self._pcm16to32(audio) - audio = librosa.resample(audio, audio_sample_rate, - self.sample_rate) + audio = librosa.resample( + audio, + orig_sr=audio_sample_rate, + target_sr=self.sample_rate) audio_sample_rate = self.sample_rate # pcm32 -> pcm 16 audio = self._pcm32to16(audio) diff --git a/paddlespeech/s2t/transform/perturb.py b/paddlespeech/s2t/transform/perturb.py index 226885f36688d7896ed68b602c691d207363f8f3..9e41b824b6ed0261db5acb24fb5e0aff2a4758fa 100644 --- a/paddlespeech/s2t/transform/perturb.py +++ b/paddlespeech/s2t/transform/perturb.py @@ -90,7 +90,8 @@ class SpeedPerturbation(): # Note1: resample requires the sampling-rate of input and output, # but actually only the ratio is used. 
- y = librosa.resample(x, ratio, 1, res_type=self.res_type) + y = librosa.resample( + x, orig_sr=ratio, target_sr=1, res_type=self.res_type) if self.keep_length: diff = abs(len(x) - len(y)) diff --git a/paddlespeech/s2t/transform/spectrogram.py b/paddlespeech/s2t/transform/spectrogram.py index a6346c344ff86c19b0e5d169683be01f7bdedee3..988fd627102641e206c75e471b8faff54e54f59e 100644 --- a/paddlespeech/s2t/transform/spectrogram.py +++ b/paddlespeech/s2t/transform/spectrogram.py @@ -38,7 +38,7 @@ def stft(x, x = np.stack( [ librosa.stft( - x[:, ch], + y=x[:, ch], n_fft=n_fft, hop_length=n_shift, win_length=win_length, @@ -67,7 +67,7 @@ def istft(x, n_shift, win_length=None, window="hann", center=True): x = np.stack( [ librosa.istft( - x[:, ch].T, # [Time, Freq] -> [Freq, Time] + y=x[:, ch].T, # [Time, Freq] -> [Freq, Time] hop_length=n_shift, win_length=win_length, window=window, @@ -95,7 +95,8 @@ def stft2logmelspectrogram(x_stft, # spc: (Time, Channel, Freq) or (Time, Freq) spc = np.abs(x_stft) # mel_basis: (Mel_freq, Freq) - mel_basis = librosa.filters.mel(fs, n_fft, n_mels, fmin, fmax) + mel_basis = librosa.filters.mel( + sr=fs, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax) # lmspc: (Time, Channel, Mel_freq) or (Time, Mel_freq) lmspc = np.log10(np.maximum(eps, np.dot(spc, mel_basis.T))) diff --git a/paddlespeech/vector/exps/ge2e/audio_processor.py b/paddlespeech/vector/exps/ge2e/audio_processor.py index 2d6bbe34ef31461d19ff409c5f2c7830a03c7a3e..1ab0419e118e20cab11e52be12983f7cc951514a 100644 --- a/paddlespeech/vector/exps/ge2e/audio_processor.py +++ b/paddlespeech/vector/exps/ge2e/audio_processor.py @@ -127,7 +127,7 @@ def compute_partial_slices(n_samples: int, partial_utterance_n_frames : int the number of mel spectrogram frames in each partial utterance. - min_pad_coverage : int + min_pad_coverage : int when reaching the last partial utterance, it may or may not have enough frames. 
If at least <min_pad_coverage> of <partial_utterance_n_frames> are present, then the last partial utterance will be considered, as if we padded the audio. Otherwise, @@ -137,7 +137,7 @@ def compute_partial_slices(n_samples: int, by how much the partial utterance should overlap. If set to 0, the partial utterances are entirely disjoint. Returns ---------- - the waveform slices and mel spectrogram slices as lists of array slices. + the waveform slices and mel spectrogram slices as lists of array slices. Index respectively the waveform and the mel spectrogram with these slices to obtain the partialutterances. """ assert 0 <= overlap < 1 @@ -206,7 +206,8 @@ class SpeakerVerificationPreprocessor(object): # Resample if numpy.array is passed and sr does not match if source_sr is not None and source_sr != self.sampling_rate: - wav = librosa.resample(wav, source_sr, self.sampling_rate) + wav = librosa.resample( + wav, orig_sr=source_sr, target_sr=self.sampling_rate) # loudness normalization wav = normalize_volume( @@ -221,7 +222,7 @@ class SpeakerVerificationPreprocessor(object): def melspectrogram(self, wav): mel = librosa.feature.melspectrogram( - wav, + y=wav, sr=self.sampling_rate, n_fft=self.n_fft, hop_length=self.hop_length,