未验证 提交 70ebbfd8 编写于 作者: 小湉湉's avatar 小湉湉 提交者: GitHub

Merge pull request #1432 from Jackwaterveg/fix

[Bug fix] fix resample
...@@ -311,8 +311,10 @@ class ASRExecutor(BaseExecutor): ...@@ -311,8 +311,10 @@ class ASRExecutor(BaseExecutor):
audio = audio[:, 0] audio = audio[:, 0]
# pcm16 -> pcm 32 # pcm16 -> pcm 32
audio = self._pcm16to32(audio) audio = self._pcm16to32(audio)
audio = librosa.resample(audio, audio_sample_rate, audio = librosa.resample(
self.sample_rate) audio,
orig_sr=audio_sample_rate,
target_sr=self.sample_rate)
audio_sample_rate = self.sample_rate audio_sample_rate = self.sample_rate
# pcm32 -> pcm 16 # pcm32 -> pcm 16
audio = self._pcm32to16(audio) audio = self._pcm32to16(audio)
......
...@@ -90,7 +90,8 @@ class SpeedPerturbation(): ...@@ -90,7 +90,8 @@ class SpeedPerturbation():
# Note1: resample requires the sampling-rate of input and output, # Note1: resample requires the sampling-rate of input and output,
# but actually only the ratio is used. # but actually only the ratio is used.
y = librosa.resample(x, ratio, 1, res_type=self.res_type) y = librosa.resample(
x, orig_sr=ratio, target_sr=1, res_type=self.res_type)
if self.keep_length: if self.keep_length:
diff = abs(len(x) - len(y)) diff = abs(len(x) - len(y))
......
...@@ -38,7 +38,7 @@ def stft(x, ...@@ -38,7 +38,7 @@ def stft(x,
x = np.stack( x = np.stack(
[ [
librosa.stft( librosa.stft(
x[:, ch], y=x[:, ch],
n_fft=n_fft, n_fft=n_fft,
hop_length=n_shift, hop_length=n_shift,
win_length=win_length, win_length=win_length,
...@@ -67,7 +67,7 @@ def istft(x, n_shift, win_length=None, window="hann", center=True): ...@@ -67,7 +67,7 @@ def istft(x, n_shift, win_length=None, window="hann", center=True):
x = np.stack( x = np.stack(
[ [
librosa.istft( librosa.istft(
x[:, ch].T, # [Time, Freq] -> [Freq, Time] y=x[:, ch].T, # [Time, Freq] -> [Freq, Time]
hop_length=n_shift, hop_length=n_shift,
win_length=win_length, win_length=win_length,
window=window, window=window,
...@@ -95,7 +95,8 @@ def stft2logmelspectrogram(x_stft, ...@@ -95,7 +95,8 @@ def stft2logmelspectrogram(x_stft,
# spc: (Time, Channel, Freq) or (Time, Freq) # spc: (Time, Channel, Freq) or (Time, Freq)
spc = np.abs(x_stft) spc = np.abs(x_stft)
# mel_basis: (Mel_freq, Freq) # mel_basis: (Mel_freq, Freq)
mel_basis = librosa.filters.mel(fs, n_fft, n_mels, fmin, fmax) mel_basis = librosa.filters.mel(
sr=fs, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax)
# lmspc: (Time, Channel, Mel_freq) or (Time, Mel_freq) # lmspc: (Time, Channel, Mel_freq) or (Time, Mel_freq)
lmspc = np.log10(np.maximum(eps, np.dot(spc, mel_basis.T))) lmspc = np.log10(np.maximum(eps, np.dot(spc, mel_basis.T)))
......
...@@ -127,7 +127,7 @@ def compute_partial_slices(n_samples: int, ...@@ -127,7 +127,7 @@ def compute_partial_slices(n_samples: int,
partial_utterance_n_frames : int partial_utterance_n_frames : int
the number of mel spectrogram frames in each partial utterance. the number of mel spectrogram frames in each partial utterance.
min_pad_coverage : int min_pad_coverage : int
when reaching the last partial utterance, it may or may not have enough frames. when reaching the last partial utterance, it may or may not have enough frames.
If at least <min_pad_coverage> of <partial_utterance_n_frames> are present, If at least <min_pad_coverage> of <partial_utterance_n_frames> are present,
then the last partial utterance will be considered, as if we padded the audio. Otherwise, then the last partial utterance will be considered, as if we padded the audio. Otherwise,
...@@ -137,7 +137,7 @@ def compute_partial_slices(n_samples: int, ...@@ -137,7 +137,7 @@ def compute_partial_slices(n_samples: int,
by how much the partial utterance should overlap. If set to 0, the partial utterances are entirely disjoint. by how much the partial utterance should overlap. If set to 0, the partial utterances are entirely disjoint.
Returns Returns
---------- ----------
the waveform slices and mel spectrogram slices as lists of array slices. the waveform slices and mel spectrogram slices as lists of array slices.
Index respectively the waveform and the mel spectrogram with these slices to obtain the partial utterances. Index respectively the waveform and the mel spectrogram with these slices to obtain the partial utterances.
""" """
assert 0 <= overlap < 1 assert 0 <= overlap < 1
...@@ -206,7 +206,8 @@ class SpeakerVerificationPreprocessor(object): ...@@ -206,7 +206,8 @@ class SpeakerVerificationPreprocessor(object):
# Resample if numpy.array is passed and sr does not match # Resample if numpy.array is passed and sr does not match
if source_sr is not None and source_sr != self.sampling_rate: if source_sr is not None and source_sr != self.sampling_rate:
wav = librosa.resample(wav, source_sr, self.sampling_rate) wav = librosa.resample(
wav, orig_sr=source_sr, target_sr=self.sampling_rate)
# loudness normalization # loudness normalization
wav = normalize_volume( wav = normalize_volume(
...@@ -221,7 +222,7 @@ class SpeakerVerificationPreprocessor(object): ...@@ -221,7 +222,7 @@ class SpeakerVerificationPreprocessor(object):
def melspectrogram(self, wav): def melspectrogram(self, wav):
mel = librosa.feature.melspectrogram( mel = librosa.feature.melspectrogram(
wav, y=wav,
sr=self.sampling_rate, sr=self.sampling_rate,
n_fft=self.n_fft, n_fft=self.n_fft,
hop_length=self.hop_length, hop_length=self.hop_length,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册