diff --git a/augmentation.config b/augmentation.config
new file mode 100644
index 0000000000000000000000000000000000000000..9ddedd4074c614a8f737173c9cc604b55a2a178f
--- /dev/null
+++ b/augmentation.config
@@ -0,0 +1,34 @@
+[
+  {
+    "type": "noise",
+    "params": {"min_snr_dB": 50,
+               "max_snr_dB": 50,
+               "noise_manifest": "datasets/manifest.noise"},
+    "prob": 0.0
+  },
+  {
+    "type": "speed",
+    "params": {"min_speed_rate": 0.9,
+               "max_speed_rate": 1.1},
+    "prob": 0.0
+  },
+  {
+    "type": "shift",
+    "params": {"min_shift_ms": -5,
+               "max_shift_ms": 5},
+    "prob": 1.0
+  },
+  {
+    "type": "volume",
+    "params": {"min_gain_dBFS": -10,
+               "max_gain_dBFS": 10},
+    "prob": 0.0
+  },
+  {
+    "type": "bayesian_normal",
+    "params": {"target_db": -20,
+               "prior_db": -20,
+               "prior_samples": 100},
+    "prob": 0.0
+  }
+]
diff --git a/data_utils/audio.py b/data_utils/audio.py
index 3891f5b923f6d73c6b87dcb90bede0183b0e081c..30e25221cd84aa6849061635749188e3bd13d67b 100644
--- a/data_utils/audio.py
+++ b/data_utils/audio.py
@@ -204,7 +204,7 @@ class AudioSegment(object):
         :raise ValueError: If the sample rates of the two segments are not
                            equal, or if the lengths of segments don't match.
         """
-        if type(self) != type(other):
+        if not isinstance(other, AudioSegment):
             raise TypeError("Cannot add segments of different types: %s "
                             "and %s." % (type(self), type(other)))
         if self._sample_rate != other._sample_rate:
@@ -231,7 +231,7 @@ class AudioSegment(object):
         Note that this is an in-place transformation.
 
         :param gain: Gain in decibels to apply to samples.
-        :type gain: float
+        :type gain: float|1darray
         """
         self._samples *= 10.**(gain / 20.)
 
@@ -457,9 +457,9 @@ class AudioSegment(object):
                             audio segments when resample is not allowed.
""" if allow_resample and self.sample_rate != impulse_segment.sample_rate: - impulse_segment = impulse_segment.resample(self.sample_rate) + impulse_segment.resample(self.sample_rate) if self.sample_rate != impulse_segment.sample_rate: - raise ValueError("Impulse segment's sample rate (%d Hz) is not" + raise ValueError("Impulse segment's sample rate (%d Hz) is not " "equal to base signal sample rate (%d Hz)." % (impulse_segment.sample_rate, self.sample_rate)) samples = signal.fftconvolve(self.samples, impulse_segment.samples, diff --git a/data_utils/augmentor/augmentation.py b/data_utils/augmentor/augmentation.py index 8a50e4400d4df5f64511597003a43f9e23ffa17d..c9e360313c7434491d20d531a942a988c69961ee 100644 --- a/data_utils/augmentor/augmentation.py +++ b/data_utils/augmentor/augmentation.py @@ -9,6 +9,7 @@ from data_utils.augmentor.volume_perturb import VolumePerturbAugmentor from data_utils.augmentor.shift_perturb import ShiftPerturbAugmentor from data_utils.augmentor.speed_perturb import SpeedPerturbAugmentor from data_utils.augmentor.noise_perturb import NoisePerturbAugmentor +from data_utils.augmentor.impulse_response import ImpulseResponseAugmentor from data_utils.augmentor.resample import ResampleAugmentor from data_utils.augmentor.online_bayesian_normalization import \ OnlineBayesianNormalizationAugmentor @@ -24,21 +25,46 @@ class AugmentationPipeline(object): string, e.g. .. 
code-block:: - - '[{"type": "volume", - "params": {"min_gain_dBFS": -15, - "max_gain_dBFS": 15}, - "prob": 0.5}, - {"type": "speed", - "params": {"min_speed_rate": 0.8, - "max_speed_rate": 1.2}, - "prob": 0.5} - ]' + [ { + "type": "noise", + "params": {"min_snr_dB": 10, + "max_snr_dB": 20, + "noise_manifest": "datasets/manifest.noise"}, + "prob": 0.0 + }, + { + "type": "speed", + "params": {"min_speed_rate": 0.9, + "max_speed_rate": 1.1}, + "prob": 1.0 + }, + { + "type": "shift", + "params": {"min_shift_ms": -5, + "max_shift_ms": 5}, + "prob": 1.0 + }, + { + "type": "volume", + "params": {"min_gain_dBFS": -10, + "max_gain_dBFS": 10}, + "prob": 0.0 + }, + { + "type": "bayesian_normal", + "params": {"target_db": -20, + "prior_db": -20, + "prior_samples": 100}, + "prob": 0.0 + } + ] + This augmentation configuration inserts two augmentation models into the pipeline, with one is VolumePerturbAugmentor and the other SpeedPerturbAugmentor. "prob" indicates the probability of the current - augmentor to take effect. + augmentor to take effect. If "prob" is zero, the augmentor does not take + effect. :param augmentation_config: Augmentation configuration in json string. :type augmentation_config: str @@ -61,7 +87,7 @@ class AugmentationPipeline(object): :type audio_segment: AudioSegmenet|SpeechSegment """ for augmentor, rate in zip(self._augmentors, self._rates): - if self._rng.uniform(0., 1.) <= rate: + if self._rng.uniform(0., 1.) < rate: augmentor.transform_audio(audio_segment) def _parse_pipeline_from(self, config_json): @@ -92,5 +118,7 @@ class AugmentationPipeline(object): return OnlineBayesianNormalizationAugmentor(self._rng, **params) elif augmentor_type == "noise": return NoisePerturbAugmentor(self._rng, **params) + elif augmentor_type == "impulse": + return ImpulseResponseAugmentor(self._rng, **params) else: raise ValueError("Unknown augmentor type [%s]." 
% augmentor_type) diff --git a/data_utils/augmentor/impulse_response.py b/data_utils/augmentor/impulse_response.py new file mode 100644 index 0000000000000000000000000000000000000000..d868c3a1ca8cfd9d682a28858c6622b2d50984b4 --- /dev/null +++ b/data_utils/augmentor/impulse_response.py @@ -0,0 +1,34 @@ +"""Contains the impulse response augmentation model.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from data_utils.augmentor.base import AugmentorBase +from data_utils import utils +from data_utils.audio import AudioSegment + + +class ImpulseResponseAugmentor(AugmentorBase): + """Augmentation model for adding impulse response effect. + + :param rng: Random generator object. + :type rng: random.Random + :param impulse_manifest: Manifest path for impulse audio data. + :type impulse_manifest: basestring + """ + + def __init__(self, rng, impulse_manifest): + self._rng = rng + self._manifest = utils.read_manifest(manifest_path=impulse_manifest) + + def transform_audio(self, audio_segment): + """Add impulse response effect. + + Note that this is an in-place transformation. + + :param audio_segment: Audio segment to add effects to. 
+        :type audio_segment: AudioSegment|SpeechSegment
+        """
+        impulse = self._rng.sample(self._manifest, 1)[0]
+        impulse_segment = AudioSegment.from_file(impulse['audio_filepath'])
+        audio_segment.convolve(impulse_segment, allow_resample=True)
diff --git a/data_utils/augmentor/noise_perturb.py b/data_utils/augmentor/noise_perturb.py
index c97ab8432b51d71de59ed97001c1b8ccba1586d4..b4fa18e18df4423d39b4765ac7d10b72554857eb 100644
--- a/data_utils/augmentor/noise_perturb.py
+++ b/data_utils/augmentor/noise_perturb.py
@@ -5,7 +5,7 @@ from __future__ import print_function
 
 from data_utils.augmentor.base import AugmentorBase
 from data_utils import utils
-from data_utils.speech import SpeechSegment
+from data_utils.audio import AudioSegment
 
 
 class NoisePerturbAugmentor(AugmentorBase):
@@ -17,6 +17,8 @@ class NoisePerturbAugmentor(AugmentorBase):
     :type min_snr_dB: float
     :param max_snr_dB: Maximal signal noise ratio, in decibels.
     :type max_snr_dB: float
+    :param noise_manifest: Manifest path for noise audio data.
+    :type noise_manifest: basestring
     """
 
     def __init__(self, rng, min_snr_dB, max_snr_dB, noise_manifest):
@@ -40,8 +42,8 @@ class NoisePerturbAugmentor(AugmentorBase):
         diff_duration = noise_json['duration'] - audio_segment.duration
         start = self._rng.uniform(0, diff_duration)
         end = start + audio_segment.duration
-        noise_segment = SpeechSegment.slice_from_file(
-            noise_json['audio_filepath'], transcript="", start=start, end=end)
+        noise_segment = AudioSegment.slice_from_file(
+            noise_json['audio_filepath'], start=start, end=end)
         snr_dB = self._rng.uniform(self._min_snr_dB, self._max_snr_dB)
         audio_segment.add_noise(
             noise_segment, snr_dB, allow_downsampling=True, rng=self._rng)
diff --git a/data_utils/data.py b/data_utils/data.py
index 34f32019c238b3c2082ffdc809527b7cf6c49d0a..159bf69d582d6418f01ecbea01d716ac4a279207 100644
--- a/data_utils/data.py
+++ b/data_utils/data.py
@@ -169,7 +169,7 @@ class DataGenerator(object):
                 manifest, batch_size, clipped=True)
         elif shuffle_method == "instance_shuffle":
             self._rng.shuffle(manifest)
-        elif not shuffle_method:
+        elif shuffle_method is None:
             pass
         else:
             raise ValueError("Unknown shuffle method %s." %
diff --git a/data_utils/speech.py b/data_utils/speech.py
index 568e4443ba557149505dfb4de6f230b4962e332a..17d68f315d04b6cc1aae2346df78cf77982cd7bc 100644
--- a/data_utils/speech.py
+++ b/data_utils/speech.py
@@ -115,7 +115,7 @@ class SpeechSegment(AudioSegment):
                  speech file.
         :rtype: SpeechSegment
         """
-        audio = Audiosegment.slice_from_file(filepath, start, end)
+        audio = AudioSegment.slice_from_file(filepath, start, end)
         return cls(audio.samples, audio.sample_rate, transcript)
 
     @classmethod
diff --git a/train.py b/train.py
index aff6193796e48a8f8db349d3865b064c9a4f5f56..34c406015a876ad7c3442644532f467506e9cd15 100644
--- a/train.py
+++ b/train.py
@@ -123,9 +123,9 @@ parser.add_argument(
     help="Directory for saving models. (default: %(default)s)")
+with open('augmentation.config', 'r') as config_file:
+    default_augmentation_config = config_file.read()
 parser.add_argument(
     "--augmentation_config",
-    default='[{"type": "shift", '
-    '"params": {"min_shift_ms": -5, "max_shift_ms": 5},'
-    '"prob": 1.0}]',
+    default=default_augmentation_config,
     type=str,
     help="Augmentation configuration in json-format. "
     "(default: %(default)s)")