diff --git a/paddlespeech/s2t/transform/perturb.py b/paddlespeech/s2t/transform/perturb.py index 153d494bf76b7d1ea99c79caf16af88ddc70638c..ee4c7ce02ea2be062d7ea59345b937e393c569be 100644 --- a/paddlespeech/s2t/transform/perturb.py +++ b/paddlespeech/s2t/transform/perturb.py @@ -16,7 +16,7 @@ import librosa import numpy import scipy import soundfile - +import soxbindings as sox from paddlespeech.s2t.io.reader import SoundHDF5File @@ -82,7 +82,6 @@ class SpeedPerturbation(): def __call__(self, x, uttid=None, train=True): if not train: return x - x = x.astype(numpy.float32) if self.accept_uttid: ratio = self.utt2ratio[uttid] @@ -108,6 +107,109 @@ class SpeedPerturbation(): return y +class SpeedPerturbationSox(): + """SpeedPerturbationSox + + The speed perturbation in kaldi uses sox-speed instead of sox-tempo, + and sox-speed just to resample the input, + i.e pitch and tempo are changed both. + + To speed up or slow down the sound of a file, + use speed to modify the pitch and the duration of the file. + This raises the speed and reduces the time. + The default factor is 1.0 which makes no change to the audio. + 2.0 doubles speed, thus time length is cut by a half and pitch is one interval higher. + + "Why use speed option instead of tempo -s in SoX for speed perturbation" + https://groups.google.com/forum/#!topic/kaldi-help/8OOG7eE4sZ8 + + tempo option: + sox -t wav input.wav -t wav output.tempo0.9.wav tempo -s 0.9 + + speed option: + sox -t wav input.wav -t wav output.speed0.9.wav speed 0.9 + + If we use speed option like above, the pitch of audio also will be changed, + but the tempo option does not change the pitch. + """ + + def __init__( + self, + lower=0.9, + upper=1.1, + utt2ratio=None, + keep_length=True, + sr=16000, + seed=None, ): + self.sr = sr + self.keep_length = keep_length + self.state = numpy.random.RandomState(seed) + + if utt2ratio is not None: + self.utt2ratio = {} + # Use the scheduled ratio for each utterances + self.utt2ratio_file = utt2ratio + self.lower = None + self.upper = None + self.accept_uttid = True + + with open(utt2ratio, "r") as f: + for line in f: + utt, ratio = line.rstrip().split(None, 1) + ratio = float(ratio) + self.utt2ratio[utt] = ratio + else: + self.utt2ratio = None + # The ratio is given on runtime randomly + self.lower = lower + self.upper = upper + + def __repr__(self): + if self.utt2ratio is None: + return f"""{self.__class__.__name__}( + lower={self.lower}, + upper={self.upper}, + keep_length={self.keep_length}, + sample_rate={self.sr})""" + else: + return f"""{self.__class__.__name__}( + utt2ratio={self.utt2ratio_file}, + sample_rate={self.sr})""" + + def __call__(self, x, uttid=None, train=True): + if not train: + return x + + x = x.astype(numpy.float32) + if self.accept_uttid: + ratio = self.utt2ratio[uttid] + else: + ratio = self.state.uniform(self.lower, self.upper) + + tfm = sox.Transformer() + tfm.set_globals(multithread=False) + tfm.speed(ratio) + y = tfm.build_array(input_array=x, sample_rate_in=self.sr) + + if self.keep_length: + diff = abs(len(x) - len(y)) + if len(y) > len(x): + # Truncate noise + y = y[diff // 2:-((diff + 1) // 2)] + elif len(y) < len(x): + # Assume the time-axis is the first: (Time, Channel) + pad_width = [(diff // 2, (diff + 1) // 2)] + [ + (0, 0) for _ in range(y.ndim - 1) + ] + y = numpy.pad( + y, pad_width=pad_width, constant_values=0, mode="constant") + + if y.ndim == 2 and x.ndim == 1: + # (T, C) -> (T) + y = y.sequence(1) + return y + + class BandpassPerturbation(): """BandpassPerturbation