diff --git a/deep_speech_2/README.md b/deep_speech_2/README.md
old mode 100755
new mode 100644
diff --git a/deep_speech_2/conf/augmentation.config b/deep_speech_2/conf/augmentation.config
new file mode 100644
index 0000000000000000000000000000000000000000..6c24da5497460d4bae9c9c4fecdbe96ab8da7532
--- /dev/null
+++ b/deep_speech_2/conf/augmentation.config
@@ -0,0 +1,8 @@
+[
+    {
+        "type": "shift",
+        "params": {"min_shift_ms": -5,
+                   "max_shift_ms": 5},
+        "prob": 1.0
+    }
+]
diff --git a/deep_speech_2/conf/augmentation.config.example b/deep_speech_2/conf/augmentation.config.example
new file mode 100644
index 0000000000000000000000000000000000000000..21ed6ee10375a749f4c072389509db2020d9e9c9
--- /dev/null
+++ b/deep_speech_2/conf/augmentation.config.example
@@ -0,0 +1,39 @@
+[
+    {
+        "type": "noise",
+        "params": {"min_snr_dB": 40,
+                   "max_snr_dB": 50,
+                   "noise_manifest_path": "datasets/manifest.noise"},
+        "prob": 0.6
+    },
+    {
+        "type": "impulse",
+        "params": {"impulse_manifest_path": "datasets/manifest.impulse"},
+        "prob": 0.5
+    },
+    {
+        "type": "speed",
+        "params": {"min_speed_rate": 0.95,
+                   "max_speed_rate": 1.05},
+        "prob": 0.5
+    },
+    {
+        "type": "shift",
+        "params": {"min_shift_ms": -5,
+                   "max_shift_ms": 5},
+        "prob": 1.0
+    },
+    {
+        "type": "volume",
+        "params": {"min_gain_dBFS": -10,
+                   "max_gain_dBFS": 10},
+        "prob": 0.0
+    },
+    {
+        "type": "bayesian_normal",
+        "params": {"target_db": -20,
+                   "prior_db": -20,
+                   "prior_samples": 100},
+        "prob": 0.0
+    }
+]
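The augmentation configuration above is a JSON list of `{"type", "params", "prob"}` entries, passed to training as a string via `--augmentation_config`. A minimal sketch of loading and sanity-checking such a file (illustrative only, assuming it is run from the `deep_speech_2/` directory; `load_augmentation_config` and `KNOWN_TYPES` are hypothetical names, not part of the patch):

```python
import json

# Augmentor types named in the config files and dispatched in augmentation.py.
KNOWN_TYPES = {"noise", "impulse", "speed", "shift", "volume",
               "bayesian_normal", "resample"}


def load_augmentation_config(path):
    # The config is a JSON list; every entry names an augmentor type, its
    # constructor params and the probability with which it is applied.
    with open(path) as config_file:
        entries = json.load(config_file)
    for entry in entries:
        assert entry["type"] in KNOWN_TYPES, "unknown type: %s" % entry["type"]
        assert isinstance(entry["params"], dict)
        assert 0.0 <= entry["prob"] <= 1.0
    return entries


if __name__ == "__main__":
    for entry in load_augmentation_config("conf/augmentation.config.example"):
        print(entry["type"], entry["prob"])
```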
diff --git a/deep_speech_2/data_utils/audio.py b/deep_speech_2/data_utils/audio.py
index 3891f5b923f6d73c6b87dcb90bede0183b0e081c..30e25221cd84aa6849061635749188e3bd13d67b 100644
--- a/deep_speech_2/data_utils/audio.py
+++ b/deep_speech_2/data_utils/audio.py
@@ -204,7 +204,7 @@ class AudioSegment(object):
         :raise ValueError: If the sample rates of the two segments are not
                            equal, or if the lengths of segments don't match.
         """
-        if type(self) != type(other):
+        if not isinstance(other, type(self)):
             raise TypeError("Cannot add segments of different types: %s "
                             "and %s." % (type(self), type(other)))
         if self._sample_rate != other._sample_rate:
@@ -231,7 +231,7 @@ class AudioSegment(object):
         Note that this is an in-place transformation.
 
         :param gain: Gain in decibels to apply to samples.
-        :type gain: float
+        :type gain: float|1darray
         """
         self._samples *= 10.**(gain / 20.)
 
@@ -457,9 +457,9 @@ class AudioSegment(object):
                              audio segments when resample is not allowed.
         """
         if allow_resample and self.sample_rate != impulse_segment.sample_rate:
-            impulse_segment = impulse_segment.resample(self.sample_rate)
+            impulse_segment.resample(self.sample_rate)
         if self.sample_rate != impulse_segment.sample_rate:
-            raise ValueError("Impulse segment's sample rate (%d Hz) is not"
+            raise ValueError("Impulse segment's sample rate (%d Hz) is not "
                              "equal to base signal sample rate (%d Hz)."
                              % (impulse_segment.sample_rate, self.sample_rate))
         samples = signal.fftconvolve(self.samples, impulse_segment.samples,
diff --git a/deep_speech_2/data_utils/augmentor/augmentation.py b/deep_speech_2/data_utils/augmentor/augmentation.py
index 9dced47314a81f52dc0eafd6e592e240953f291d..5c30b627ef9a23ff41d1f64f270934f149a793a2 100644
--- a/deep_speech_2/data_utils/augmentor/augmentation.py
+++ b/deep_speech_2/data_utils/augmentor/augmentation.py
@@ -8,6 +8,8 @@ import random
 from data_utils.augmentor.volume_perturb import VolumePerturbAugmentor
 from data_utils.augmentor.shift_perturb import ShiftPerturbAugmentor
 from data_utils.augmentor.speed_perturb import SpeedPerturbAugmentor
+from data_utils.augmentor.noise_perturb import NoisePerturbAugmentor
+from data_utils.augmentor.impulse_response import ImpulseResponseAugmentor
 from data_utils.augmentor.resample import ResampleAugmentor
 from data_utils.augmentor.online_bayesian_normalization import \
     OnlineBayesianNormalizationAugmentor
@@ -23,21 +25,46 @@ class AugmentationPipeline(object):
     string, e.g.
 
     .. code-block::
-
-        '[{"type": "volume",
-           "params": {"min_gain_dBFS": -15,
-                      "max_gain_dBFS": 15},
-           "prob": 0.5},
-          {"type": "speed",
-           "params": {"min_speed_rate": 0.8,
-                      "max_speed_rate": 1.2},
-           "prob": 0.5}
-         ]'
+        [ {
+            "type": "noise",
+            "params": {"min_snr_dB": 10,
+                       "max_snr_dB": 20,
+                       "noise_manifest_path": "datasets/manifest.noise"},
+            "prob": 0.0
+          },
+          {
+            "type": "speed",
+            "params": {"min_speed_rate": 0.9,
+                       "max_speed_rate": 1.1},
+            "prob": 1.0
+          },
+          {
+            "type": "shift",
+            "params": {"min_shift_ms": -5,
+                       "max_shift_ms": 5},
+            "prob": 1.0
+          },
+          {
+            "type": "volume",
+            "params": {"min_gain_dBFS": -10,
+                       "max_gain_dBFS": 10},
+            "prob": 0.0
+          },
+          {
+            "type": "bayesian_normal",
+            "params": {"target_db": -20,
+                       "prior_db": -20,
+                       "prior_samples": 100},
+            "prob": 0.0
+          }
+        ]
+
 
-    This augmentation configuration inserts two augmentation models
-    into the pipeline, with one is VolumePerturbAugmentor and the other
-    SpeedPerturbAugmentor. "prob" indicates the probability of the current
-    augmentor to take effect.
+    This augmentation configuration inserts several augmentation models
+    into the pipeline, e.g. a NoisePerturbAugmentor and a
+    SpeedPerturbAugmentor. "prob" indicates the probability of the current
+    augmentor to take effect. If "prob" is zero, the augmentor does not take
+    effect.
 
     :param augmentation_config: Augmentation configuration in json string.
     :type augmentation_config: str
@@ -60,7 +87,7 @@ class AugmentationPipeline(object):
         :type audio_segment: AudioSegmenet|SpeechSegment
         """
         for augmentor, rate in zip(self._augmentors, self._rates):
-            if self._rng.uniform(0., 1.) <= rate:
+            if self._rng.uniform(0., 1.) < rate:
                 augmentor.transform_audio(audio_segment)
 
     def _parse_pipeline_from(self, config_json):
@@ -89,5 +116,9 @@ class AugmentationPipeline(object):
             return ResampleAugmentor(self._rng, **params)
         elif augmentor_type == "bayesian_normal":
             return OnlineBayesianNormalizationAugmentor(self._rng, **params)
+        elif augmentor_type == "noise":
+            return NoisePerturbAugmentor(self._rng, **params)
+        elif augmentor_type == "impulse":
+            return ImpulseResponseAugmentor(self._rng, **params)
         else:
             raise ValueError("Unknown augmentor type [%s]." % augmentor_type)
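`AugmentationPipeline` gates each augmentor on its `prob` value with `self._rng.uniform(0., 1.) < rate`, so an entry with `"prob": 0.0` never fires and one with `"prob": 1.0` always does. A standalone sketch of the same gating (unlike `transform_audio`, which mutates the segment in place, this toy version returns a value; `apply_pipeline` is a hypothetical name, not part of the patch):

```python
import random


def apply_pipeline(sample, augmentors, probs, rng):
    # Mirror of the gating above: each augmentor fires iff a uniform draw in
    # [0, 1) is below its "prob"; 0.0 never fires, 1.0 always does.
    for augment, prob in zip(augmentors, probs):
        if rng.uniform(0., 1.) < prob:
            sample = augment(sample)
    return sample


if __name__ == "__main__":
    rng = random.Random(0)
    result = apply_pipeline(
        1.0, augmentors=[lambda x: x * 2, lambda x: x + 1],
        probs=[1.0, 0.0], rng=rng)
    print(result)  # 2.0 -- the prob=0.0 augmentor never runs
```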
diff --git a/deep_speech_2/data_utils/augmentor/impulse_response.py b/deep_speech_2/data_utils/augmentor/impulse_response.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3de0fdbb2a40150f8cffdef3487ceb4400e52ed
--- /dev/null
+++ b/deep_speech_2/data_utils/augmentor/impulse_response.py
@@ -0,0 +1,35 @@
+"""Contains the impulse response augmentation model."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from data_utils.augmentor.base import AugmentorBase
+from data_utils import utils
+from data_utils.audio import AudioSegment
+
+
+class ImpulseResponseAugmentor(AugmentorBase):
+    """Augmentation model for adding impulse response effect.
+
+    :param rng: Random generator object.
+    :type rng: random.Random
+    :param impulse_manifest_path: Manifest path for impulse audio data.
+    :type impulse_manifest_path: basestring
+    """
+
+    def __init__(self, rng, impulse_manifest_path):
+        self._rng = rng
+        self._impulse_manifest = utils.read_manifest(
+            manifest_path=impulse_manifest_path)
+
+    def transform_audio(self, audio_segment):
+        """Add impulse response effect.
+
+        Note that this is an in-place transformation.
+
+        :param audio_segment: Audio segment to add effects to.
+        :type audio_segment: AudioSegment|SpeechSegment
+        """
+        impulse_json = self._rng.sample(self._impulse_manifest, 1)[0]
+        impulse_segment = AudioSegment.from_file(impulse_json['audio_filepath'])
+        audio_segment.convolve(impulse_segment, allow_resample=True)
diff --git a/deep_speech_2/data_utils/augmentor/noise_perturb.py b/deep_speech_2/data_utils/augmentor/noise_perturb.py
new file mode 100644
index 0000000000000000000000000000000000000000..281174af42c2f6d673ead94bd532941769c79c25
--- /dev/null
+++ b/deep_speech_2/data_utils/augmentor/noise_perturb.py
@@ -0,0 +1,50 @@
+"""Contains the noise perturb augmentation model."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from data_utils.augmentor.base import AugmentorBase
+from data_utils import utils
+from data_utils.audio import AudioSegment
+
+
+class NoisePerturbAugmentor(AugmentorBase):
+    """Augmentation model for adding background noise.
+
+    :param rng: Random generator object.
+    :type rng: random.Random
+    :param min_snr_dB: Minimal signal noise ratio, in decibels.
+    :type min_snr_dB: float
+    :param max_snr_dB: Maximal signal noise ratio, in decibels.
+    :type max_snr_dB: float
+    :param noise_manifest_path: Manifest path for noise audio data.
+    :type noise_manifest_path: basestring
+    """
+
+    def __init__(self, rng, min_snr_dB, max_snr_dB, noise_manifest_path):
+        self._min_snr_dB = min_snr_dB
+        self._max_snr_dB = max_snr_dB
+        self._rng = rng
+        self._noise_manifest = utils.read_manifest(
+            manifest_path=noise_manifest_path)
+
+    def transform_audio(self, audio_segment):
+        """Add background noise audio.
+
+        Note that this is an in-place transformation.
+
+        :param audio_segment: Audio segment to add effects to.
+        :type audio_segment: AudioSegment|SpeechSegment
+        """
+        noise_json = self._rng.sample(self._noise_manifest, 1)[0]
+        if noise_json['duration'] < audio_segment.duration:
+            raise RuntimeError("The duration of sampled noise audio is smaller "
+                               "than the audio segment to add effects to.")
+        diff_duration = noise_json['duration'] - audio_segment.duration
+        start = self._rng.uniform(0, diff_duration)
+        end = start + audio_segment.duration
+        noise_segment = AudioSegment.slice_from_file(
+            noise_json['audio_filepath'], start=start, end=end)
+        snr_dB = self._rng.uniform(self._min_snr_dB, self._max_snr_dB)
+        audio_segment.add_noise(
+            noise_segment, snr_dB, allow_downsampling=True, rng=self._rng)
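`NoisePerturbAugmentor` samples a noise clip at least as long as the utterance, slices a random window of equal duration, and mixes it in at a random SNR drawn from `[min_snr_dB, max_snr_dB]` via `AudioSegment.add_noise`. That method is not part of this diff; the sketch below only illustrates the usual SNR-based mixing idea with NumPy and is not the project's implementation:

```python
import numpy as np


def rms_db(samples):
    # RMS level in dB (the small epsilon avoids log of zero).
    return 10.0 * np.log10(np.mean(samples ** 2) + 1e-12)


def mix_at_snr(speech, noise, snr_dB):
    # Scale the noise so that rms_db(speech) - rms_db(scaled noise) == snr_dB,
    # then add it to the speech.
    assert len(noise) >= len(speech), "noise clip must cover the utterance"
    noise = noise[:len(speech)]
    gain_dB = rms_db(speech) - rms_db(noise) - snr_dB
    return speech + noise * 10.0 ** (gain_dB / 20.0)


if __name__ == "__main__":
    rng = np.random.RandomState(0)
    speech = 0.1 * rng.randn(16000)
    noise = 0.3 * rng.randn(32000)
    noisy = mix_at_snr(speech, noise, snr_dB=40)
    print(rms_db(speech) - rms_db(noisy - speech))  # ~40.0
```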
diff --git a/deep_speech_2/data_utils/augmentor/online_bayesian_normalization.py b/deep_speech_2/data_utils/augmentor/online_bayesian_normalization.py
old mode 100755
new mode 100644
diff --git a/deep_speech_2/data_utils/augmentor/resample.py b/deep_speech_2/data_utils/augmentor/resample.py
old mode 100755
new mode 100644
diff --git a/deep_speech_2/data_utils/data.py b/deep_speech_2/data_utils/data.py
index 34f32019c238b3c2082ffdc809527b7cf6c49d0a..159bf69d582d6418f01ecbea01d716ac4a279207 100644
--- a/deep_speech_2/data_utils/data.py
+++ b/deep_speech_2/data_utils/data.py
@@ -169,7 +169,7 @@ class DataGenerator(object):
                 manifest, batch_size, clipped=True)
         elif shuffle_method == "instance_shuffle":
             self._rng.shuffle(manifest)
-        elif not shuffle_method:
+        elif shuffle_method is None:
             pass
         else:
             raise ValueError("Unknown shuffle method %s." %
diff --git a/deep_speech_2/data_utils/speech.py b/deep_speech_2/data_utils/speech.py
index 568e4443ba557149505dfb4de6f230b4962e332a..17d68f315d04b6cc1aae2346df78cf77982cd7bc 100644
--- a/deep_speech_2/data_utils/speech.py
+++ b/deep_speech_2/data_utils/speech.py
@@ -115,7 +115,7 @@ class SpeechSegment(AudioSegment):
                      speech file.
         :rtype: SpeechSegment
         """
-        audio = Audiosegment.slice_from_file(filepath, start, end)
+        audio = AudioSegment.slice_from_file(filepath, start, end)
        return cls(audio.samples, audio.sample_rate, transcript)
 
     @classmethod
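The augmentors above draw their samples from JSON-lines manifests via `utils.read_manifest(manifest_path=...)`, and the CHiME3 script that follows writes such a manifest: one JSON object per line with `audio_filepath`, `duration` and `text`. A minimal stand-in reader (hypothetical helper; the project's `read_manifest` may additionally filter by duration):

```python
import json


def read_manifest_lines(manifest_path):
    # Each manifest line is a standalone JSON object describing one audio file.
    entries = []
    with open(manifest_path) as manifest_file:
        for line in manifest_file:
            line = line.strip()
            if line:
                entries.append(json.loads(line))
    return entries


if __name__ == "__main__":
    for entry in read_manifest_lines("datasets/manifest.noise")[:3]:
        print(entry["audio_filepath"], entry["duration"])
```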
diff --git a/deep_speech_2/datasets/noise/chime3_background.py b/deep_speech_2/datasets/noise/chime3_background.py
new file mode 100644
index 0000000000000000000000000000000000000000..f79ca7335bda7aec795bc43c32a51519f3363d85
--- /dev/null
+++ b/deep_speech_2/datasets/noise/chime3_background.py
@@ -0,0 +1,129 @@
+"""Prepare CHiME3 background data.
+
+Download, unpack and create manifest files.
+Manifest file is a json-format file with each line containing the
+meta data (i.e. audio filepath, transcript and audio duration)
+of each audio file in the data set.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import distutils.util
+import os
+import wget
+import zipfile
+import tarfile
+import argparse
+import soundfile
+import json
+from paddle.v2.dataset.common import md5file
+
+DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
+
+URL = "https://d4s.myairbridge.com/packagev2/AG0Y3DNBE5IWRRTV/?dlid=W19XG7T0NNHB027139H0EQ"
+MD5 = "c3ff512618d7a67d4f85566ea1bc39ec"
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    "--target_dir",
+    default=DATA_HOME + "/chime3_background",
+    type=str,
+    help="Directory to save the dataset. (default: %(default)s)")
+parser.add_argument(
+    "--manifest_filepath",
+    default="manifest.chime3.background",
+    type=str,
+    help="Filepath for output manifests. (default: %(default)s)")
+args = parser.parse_args()
+
+
+def download(url, md5sum, target_dir, filename=None):
+    """Download file from url to target_dir, and check md5sum."""
+    if filename is None:
+        filename = url.split("/")[-1]
+    if not os.path.exists(target_dir): os.makedirs(target_dir)
+    filepath = os.path.join(target_dir, filename)
+    if not (os.path.exists(filepath) and md5file(filepath) == md5sum):
+        print("Downloading %s ..." % url)
+        wget.download(url, target_dir)
+        print("\nMD5 Checksum %s ..." % filepath)
+        if not md5file(filepath) == md5sum:
+            raise RuntimeError("MD5 checksum failed.")
+    else:
+        print("File exists, skip downloading. (%s)" % filepath)
+    return filepath
+
+
+def unpack(filepath, target_dir):
+    """Unpack the file to the target_dir."""
+    print("Unpacking %s ..." % filepath)
+    if filepath.endswith('.zip'):
+        zip = zipfile.ZipFile(filepath, 'r')
+        zip.extractall(target_dir)
+        zip.close()
+    elif filepath.endswith('.tar') or filepath.endswith('.tar.gz'):
+        tar = tarfile.open(filepath)
+        tar.extractall(target_dir)
+        tar.close()
+    else:
+        raise ValueError("File format is not supported for unpacking.")
+
+
+def create_manifest(data_dir, manifest_path):
+    """Create a manifest json file summarizing the data set, with each line
+    containing the meta data (i.e. audio filepath, transcription text, audio
+    duration) of each audio file within the data set.
+    """
+    print("Creating manifest %s ..." % manifest_path)
+    json_lines = []
+    for subfolder, _, filelist in sorted(os.walk(data_dir)):
+        for filename in filelist:
+            if filename.endswith('.wav'):
+                filepath = os.path.join(subfolder, filename)
+                audio_data, samplerate = soundfile.read(filepath)
+                duration = float(len(audio_data)) / samplerate
+                json_lines.append(
+                    json.dumps({
+                        'audio_filepath': filepath,
+                        'duration': duration,
+                        'text': ''
+                    }))
+    with open(manifest_path, 'w') as out_file:
+        for line in json_lines:
+            out_file.write(line + '\n')
+
+
+def prepare_chime3(url, md5sum, target_dir, manifest_path):
+    """Download, unpack and create summary manifest file."""
+    if not os.path.exists(os.path.join(target_dir, "CHiME3")):
+        # download
+        filepath = download(url, md5sum, target_dir,
+                            "myairbridge-AG0Y3DNBE5IWRRTV.zip")
+        # unpack
+        unpack(filepath, target_dir)
+        unpack(
+            os.path.join(target_dir, 'CHiME3_background_bus.zip'), target_dir)
+        unpack(
+            os.path.join(target_dir, 'CHiME3_background_caf.zip'), target_dir)
+        unpack(
+            os.path.join(target_dir, 'CHiME3_background_ped.zip'), target_dir)
+        unpack(
+            os.path.join(target_dir, 'CHiME3_background_str.zip'), target_dir)
+    else:
+        print("Skip downloading and unpacking. Data already exists in %s." %
+              target_dir)
+    # create manifest json file
+    create_manifest(target_dir, manifest_path)
+
+
+def main():
+    prepare_chime3(
+        url=URL,
+        md5sum=MD5,
+        target_dir=args.target_dir,
+        manifest_path=args.manifest_filepath)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/deep_speech_2/datasets/run_noise.sh b/deep_speech_2/datasets/run_noise.sh
new file mode 100644
index 0000000000000000000000000000000000000000..7b27abde47a97b671609f0cd15e81565b3a00d02
--- /dev/null
+++ b/deep_speech_2/datasets/run_noise.sh
@@ -0,0 +1,10 @@
+cd noise
+python chime3_background.py
+if [ $? -ne 0 ]; then
+    echo "Prepare CHiME3 background noise failed. Terminated."
+    exit 1
+fi
+cd -
+
+cat noise/manifest.* > manifest.noise
+echo "All done."
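After `datasets/run_noise.sh` has produced `datasets/manifest.noise`, a `"noise"` entry pointing at it can be merged into the training configuration. An illustrative snippet (not part of the patch; the SNR values mirror `conf/augmentation.config.example` and the paths assume the `deep_speech_2/` working directory):

```python
import json

noise_entry = {
    "type": "noise",
    "params": {"min_snr_dB": 40,
               "max_snr_dB": 50,
               "noise_manifest_path": "datasets/manifest.noise"},
    "prob": 0.6
}

# Merge the noise augmentor into the existing config and show the result;
# redirect the output into conf/augmentation.config to actually use it.
with open("conf/augmentation.config") as config_file:
    config = json.load(config_file)
config.append(noise_entry)
print(json.dumps(config, indent=4))
```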
diff --git a/deep_speech_2/train.py b/deep_speech_2/train.py
index aff6193796e48a8f8db349d3865b064c9a4f5f56..0d4e2508dddf5cc6834b4f61f0c2cc8deee405af 100644
--- a/deep_speech_2/train.py
+++ b/deep_speech_2/train.py
@@ -123,9 +123,7 @@ parser.add_argument(
     help="Directory for saving models. (default: %(default)s)")
 parser.add_argument(
     "--augmentation_config",
-    default='[{"type": "shift", '
-    '"params": {"min_shift_ms": -5, "max_shift_ms": 5},'
-    '"prob": 1.0}]',
+    default=open('conf/augmentation.config', 'r').read(),
     type=str,
     help="Augmentation configuration in json-format. "
     "(default: %(default)s)")
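The new default reads `conf/augmentation.config` relative to the current working directory at import time, so `train.py` has to be launched from `deep_speech_2/`. A sketch of a working-directory-independent alternative (an assumption-level variant, not part of the patch):

```python
import argparse
import io
import os

# Resolve the config next to train.py rather than relative to the cwd.
CONF_PATH = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), "conf", "augmentation.config")


def _read_default_config():
    # Read and close the file explicitly instead of leaving the handle open.
    with io.open(CONF_PATH, encoding="utf8") as config_file:
        return config_file.read()


parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--augmentation_config",
    default=_read_default_config(),
    type=str,
    help="Augmentation configuration in json-format. "
    "(default: %(default)s)")
args = parser.parse_args()
```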