diff --git a/deep_speech_2/data_utils/audio.py b/deep_speech_2/data_utils/audio.py index aef13c30ff3051fcc95edddf51ff59454dff5e3c..ee4e6d84e13bcc96dd02a4a7bc9a78f034c68ed2 100755 --- a/deep_speech_2/data_utils/audio.py +++ b/deep_speech_2/data_utils/audio.py @@ -8,6 +8,7 @@ import io import soundfile import scikits.samplerate from scipy import signal +import random class AudioSegment(object): @@ -46,6 +47,32 @@ class AudioSegment(object): """Return whether two objects are unequal.""" return not self.__eq__(other) + def __len__(self): + """Returns length of segment in samples.""" + return self.num_samples + + def __add__(self, other): + """Add samples from another segment to those of this segment and return + a new segment (sample-wise addition, not segment concatenation). + + :param other: Segment containing samples to be + added in. + :type other: AudioSegment + :return: New segment containing resulting samples. + :rtype: AudioSegment + :raise TypeError: If sample rates of segments don't match, + or if length of segments don't match. + """ + if type(self) != type(other): + raise TypeError("Cannot add segment of different type: {}" + .format(type(other))) + if self._sample_rate != other._sample_rate: + raise TypeError("Sample rates must match to add segments.") + if len(self._samples) != len(other._samples): + raise TypeError("Segment lengths must match to add segments.") + samples = self.samples + other.samples + return type(self)(samples, sample_rate=self._sample_rate) + def __str__(self): """Return human-readable representation of segment.""" return ("%s: num_samples=%d, sample_rate=%d, duration=%.2fsec, " @@ -64,69 +91,6 @@ class AudioSegment(object): samples, sample_rate = soundfile.read(file, dtype='float32') return cls(samples, sample_rate) - @classmethod - def slice_from_file(cls, fname, start=None, end=None): - """ - Loads a small section of an audio without having to load - the entire file into the memory which can be incredibly wasteful. - - :param fname: input audio file name - :type fname: bsaestring - :param start: start time in seconds (supported granularity is ms) - If start is negative, it wraps around from the end. If not - provided, this function reads from the very beginning. - :type start: float - :param end: start time in seconds (supported granularity is ms) - If end is negative, it wraps around from the end. If not - provided, the default behvaior is to read to the end of the - file. - :type end: float - - :return:the specified slice of input audio in the audio.AudioSegment - format. - """ - sndfile = soundfile.SoundFile(fname) - - sample_rate = sndfile.samplerate - if sndfile.channels != 1: - raise TypeError("{} has more than 1 channel.".format(fname)) - - duration = float(len(sndfile)) / sample_rate - - if start is None: - start = 0.0 - if end is None: - end = duration - - if start < 0.0: - start += duration - if end < 0.0: - end += duration - - if start < 0.0: - raise IndexError("The slice start position ({} s) is out of " - "bounds. Filename: {}".format(start, fname)) - if end < 0.0: - raise IndexError("The slice end position ({} s) is out of bounds " - "Filename: {}".format(end, fname)) - - if start > end: - raise IndexError("The slice start position ({} s) is later than " - "the slice end position ({} s)." - .format(start, end)) - - if end > duration: - raise ValueError("The slice end time ({} s) is out of " - "bounds (> {} s) Filename: {}" - .format(end, duration, fname)) - - start_frame = int(start * sample_rate) - end_frame = int(end * sample_rate) - sndfile.seek(start_frame) - data = sndfile.read(frames=end_frame - start_frame, dtype='float32') - - return cls(data, sample_rate) - @classmethod def from_bytes(cls, bytes): """Create audio segment from a byte string containing audio samples. @@ -140,43 +104,30 @@ class AudioSegment(object): io.BytesIO(bytes), dtype='float32') return cls(samples, sample_rate) - @classmethod - def make_silence(cls, duration, sample_rate): - """Creates a silent audio segment of the given duration and - sample rate. - - :param duration: length of silence in seconds - :type duration: scalar - :param sample_rate: sample rate - :type sample_rate: scalar - :returns: silence of the given duration - :rtype: AudioSegment - """ - samples = np.zeros(int(float(duration) * sample_rate)) - return cls(samples, sample_rate) - - @classmethod - def concatenate(cls, *segments): + def concatenate(self, *segments): """Concatenate an arbitrary number of audio segments together. - :param *segments: input audio segments - :type *segments: [AudioSegment] + :param *segments: Input audio segments + :type *segments: AudioSegment + :return: Audio segment instance. + :rtype: AudioSegment + :raises ValueError: If number of segments is zero, or if sample_rate + not match between two audio segments + :raises TypeError: If item of segments is not Audiosegment instance """ # Perform basic sanity-checks. - N = len(segments) - if N == 0: + if len(segments) == 0: raise ValueError("No audio segments are given to concatenate.") sample_rate = segments[0]._sample_rate - for segment in segments: - if sample_rate != segment._sample_rate: + for seg in segments: + if sample_rate != seg._sample_rate: raise ValueError("Can't concatenate segments with " "different sample rates") - if type(segment) is not cls: + if type(seg) is not type(self): raise TypeError("Only audio segments of the same type " "instance can be concatenated.") - samples = np.concatenate([seg.samples for seg in segments]) - return cls(samples, sample_rate) + return type(self)(samples, sample_rate) def to_wav_file(self, filepath, dtype='float32'): """Save audio segment to disk as wav file. @@ -203,6 +154,65 @@ class AudioSegment(object): format='WAV', subtype=subtype_map[dtype]) + def slice_from_file(self, file, start=None, end=None): + """Loads a small section of an audio without having to load + the entire file into the memory which can be incredibly wasteful. + + :param file: Input audio filepath + :type file: basestring + :param start: Start time in seconds. If start is negative, it wraps + around from the end. If not provided, this function + reads from the very beginning. + :type start: float + :param end: End time in seconds. If end is negative, it wraps around + from the end. If not provided, the default behvaior is + to read to the end of the file. + :type end: float + :return: The specified slice of input audio in the audio.AudioSegment format. + :rtype: AudioSegment + :rainse ValueError: If the position is error, or if the time is out bounds. + """ + sndfile = soundfile.SoundFile(file) + sample_rate = sndfile.samplerate + duration = float(len(sndfile)) / sample_rate + start = 0. if start is None else start + end = 0. if end is None else end + if start < 0.0: + start += duration + if end < 0.0: + end += duration + if start < 0.0: + raise ValueError("The slice start position (%f s) is out of " + "bounds. Filename: %s" % (start, file)) + if end < 0.0: + raise ValueError("The slice end position (%f s) is out of bounds " + "Filename: %s" % (end, file)) + if start > end: + raise ValueError("The slice start position (%f s) is later than " + "the slice end position (%f s)." % (start, end)) + if end > duration: + raise ValueError("The slice end time (%f s) is out of bounds " + "(> %f s) Filename: %s" % (end, duration, file)) + start_frame = int(start * sample_rate) + end_frame = int(end * sample_rate) + sndfile.seek(start_frame) + data = sndfile.read(frames=end_frame - start_frame, dtype='float32') + return type(self)(data, sample_rate) + + def make_silence(self, duration, sample_rate): + """Creates a silent audio segment of the given duration and + sample rate. + + :param duration: Length of silence in seconds + :type duration: float + :param sample_rate: Sample rate + :type sample_rate: float + :return: Silence of the given duration + :rtype: AudioSegment + """ + samples = np.zeros(int(duration * sample_rate)) + return type(self)(samples, sample_rate) + def to_bytes(self, dtype='float32'): """Create a byte string containing the audio content. @@ -247,52 +257,49 @@ class AudioSegment(object): self._samples = np.interp(new_indices, old_indices, self._samples) def normalize(self, target_db=-20, max_gain_db=300.0): - """Normalize audio to desired RMS value in decibels. + """Normalize audio to be desired RMS value in decibels. Note that this is an in-place transformation. - :param target_db: Target RMS value in decibels.This value - should be less than 0.0 as 0.0 is full-scale audio. - :type target_db: float, optional - :param max_gain_db: Max amount of gain in dB that can be applied - for normalization. This is to prevent nans when attempting - to normalize a signal consisting of all zeros. - :type max_gain_db: float, optional - - :raises NormalizationWarning: if the required gain to normalize the - segment to the target_db value exceeds max_gain_db. + :param target_db: Target RMS value in decibels. This value should + be less than 0.0 as 0.0 is full-scale audio. + :type target_db: float + :param max_gain_db: Max amount of gain in dB that can be applied for + normalization. This is to prevent nans when attempting + to normalize a signal consisting of all zeros. + :type max_gain_db: float + :raises ValueError: If the required gain to normalize the segment to + the target_db value exceeds max_gain_db. """ gain = target_db - self.rms_db if gain > max_gain_db: raise ValueError( - "Unable to normalize segment to {} dB because it has an RMS " - "value of {} dB and the difference exceeds max_gain_db ({} dB)" - .format(target_db, self.rms_db, max_gain_db)) - gain = min(max_gain_db, target_db - self.rms_db) - self.apply_gain(gain) + "Unable to normalize segment to %f dB because it has an RMS " + "value of %f dB and the difference exceeds max_gain_db (%f dB)" + % (target_db, self.rms_db, max_gain_db)) + self.apply_gain(min(max_gain_db, target_db - self.rms_db)) def normalize_online_bayesian(self, target_db, prior_db, prior_samples, startup_delay=0.0): - """ - Normalize audio using a production-compatible online/causal algorithm. - This uses an exponential likelihood and gamma prior to make - online estimates of the RMS even when there are very few samples. + """Normalize audio using a production-compatible online/causal algorithm. + This uses an exponential likelihood and gamma prior to make online estimates + of the RMS even when there are very few samples. Note that this is an in-place transformation. :param target_db: Target RMS value in decibels - :type target_bd: scalar + :type target_bd: float :param prior_db: Prior RMS estimate in decibels - :type prior_db: scalar + :type prior_db: float :param prior_samples: Prior strength in number of samples - :type prior_samples: scalar - :param startup_delay: Default: 0.0 s. If provided, this - function will accrue statistics for the first startup_delay - seconds before applying online normalization. - :type startup_delay: scalar + :type prior_samples: float + :param startup_delay: Default 0.0 s. If provided, this function will accrue + statistics for the first startup_delay seconds before + applying online normalization. + :type startup_delay: float """ # Estimate total RMS online startup_sample_idx = min(self.num_samples - 1, @@ -309,88 +316,54 @@ class AudioSegment(object): mean_squared_estimate = ((cumsum_of_squares + prior_sum_of_squares) / (sample_count + prior_samples)) rms_estimate_db = 10 * np.log10(mean_squared_estimate) - # Compute required time-varying gain gain_db = target_db - rms_estimate_db - - # Apply gain to new segment - self.apply_gain(gain_db) - - def normalize_ewma(self, - target_db, - decay_rate, - startup_delay, - rms_eps=1e-6, - max_gain_db=300.0): - startup_sample_idx = min(self.num_samples - 1, - int(self.sample_rate * startup_delay)) - mean_sq = self.samples**2 - if startup_sample_idx > 0: - mean_sq[:startup_sample_idx] = \ - np.sum(mean_sq[:startup_sample_idx]) / startup_sample_idx - idx_start = max(0, startup_sample_idx - 1) - initial_condition = mean_sq[idx_start] * decay_rate - mean_sq[idx_start:] = lfilter( - [1.0 - decay_rate], [1.0, -decay_rate], - mean_sq[idx_start:], - axis=0, - zi=[initial_condition])[0] - rms_estimate_db = 10.0 * np.log10(mean_sq + rms_eps) - gain_db = target_db - rms_estimate_db - if np.any(gain_db > max_gain_db): - warnings.warn( - "Unable to normalize segment to {} dB because it has an RMS " - "value of {} dB and the difference exceeds max_gain_db ({} dB)" - .format(target_db, self.rms_db, max_gain_db), - NormalizationWarning) - gain_db = np.minimum(gain_db, max_gain_db) self.apply_gain(gain_db) def resample(self, target_sample_rate, quality='sinc_medium'): - """Resample audio and return new AudioSegment. - This resamples the audio to a new sample rate and returns a brand - new AudioSegment. The existing AudioSegment is unchanged. + """Resample audio segment. This resamples the audio to a new + sample rate. Note that this is an in-place transformation. - :param new_sample_rate: target sample rate - :type new_sample_rate: scalar + :param target_sample_rate: Target sample rate + :type target_sample_rate: int :param quality: One of {'sinc_fastest', 'sinc_medium', 'sinc_best'}. - Sets resampling speed/quality tradeoff. - See http://www.mega-nerd.com/SRC/api_misc.html#Converters + Sets resampling speed/quality tradeoff. + See http://www.mega-nerd.com/SRC/api_misc.html#Converters :type quality: basestring """ resample_ratio = target_sample_rate / self._sample_rate new_samples = scikits.samplerate.resample( self._samples, r=resample_ratio, type=quality) self._samples = new_samples - self._sample_rate = new_sample_rate + self._sample_rate = target_sample_rate def pad_silence(self, duration, sides='both'): """Pads this audio sample with a period of silence. Note that this is an in-place transformation. - :param duration: length of silence in seconds to pad + :param duration: Length of silence in seconds to pad :type duration: float - :param sides: - 'beginning' - adds silence in the beginning - 'end' - adds silence in the end - 'both' - adds silence in both the beginning and the end. - :type sides: basestring + :param sides: Position for padding + 'beginning' - adds silence in the beginning + 'end' - adds silence in the end + 'both' - adds silence in both the beginning and the end. + :type sides: str + :raises ValueError: If the sides not surport """ if duration == 0.0: return self - cls = type(self) - silence = cls.make_silence(duration, self._sample_rate) + silence = self.make_silence(duration, self._sample_rate) if sides == "beginning": - padded = cls.concatenate(silence, self) + padded = self.concatenate(silence, self) elif sides == "end": - padded = cls.concatenate(self, silence) + padded = self.concatenate(self, silence) elif sides == "both": - padded = cls.concatenate(silence, self, silence) + padded = self.concatenate(silence, self, silence) else: - raise ValueError("Unknown value for the kwarg 'sides'") + raise ValueError("Unknown value for the kwarg %s" % sides) self._samples = padded._samples self._sample_rate = padded._sample_rate @@ -398,88 +371,83 @@ class AudioSegment(object): """Return new AudioSegment containing audio between given boundaries. :param start_sec: Beginning of subsegment in seconds, - (beginning of segment if None). - :type start_sec: scalar + (beginning of segment if None). + :type start_sec: float :param end_sec: End of subsegment in seconds, - (end of segment if None). - :type end_sec: scalar - - :return: New AudioSegment containing specified - subsegment. - :trype: AudioSegment + (end of segment if None). + :type end_sec: float + :return: New AudioSegment containing specified subsegment. + :rtype: AudioSegment """ - # Default boundaries - if start_sec is None: - start_sec = 0.0 - if end_sec is None: - end_sec = self.duration - + start_sec = 0.0 if start_sec is None else start_sec + end_sec = self.duration if end_sec is None else end_sec # negative boundaries are relative to end of segment if start_sec < 0.0: start_sec = self.duration + start_sec if end_sec < 0.0: end_sec = self.duration + end_sec - start_sample = int(round(start_sec * self._sample_rate)) end_sample = int(round(end_sec * self._sample_rate)) samples = self._samples[start_sample:end_sample] - return type(self)(samples, sample_rate=self._sample_rate) def random_subsegment(self, subsegment_length, rng=None): - """ - Return a random subsegment of a specified length in seconds. + """Return a random subsegment of a specified length in seconds. :param subsegment_length: Subsegment length in seconds. - :type subsegment_length: scalar + :type subsegment_length: float :param rng: Random number generator state - :type rng: random.Random [optional] - - - :return:clip (SpeechDLSegment): New SpeechDLSegmen containing random - subsegment of original segment. + :type rng: random.Random + :return: New AudioSegment containing random subsegment + of original segment + :rtype: AudioSegment + :raises ValueError: If the length of subsegment greater than origineal + segemnt. """ - if rng is None: - rng = random.Random() - + rng = random.Random() if rng is None else rng if subsegment_length > self.duration: raise ValueError("Length of subsegment must not be greater " "than original segment.") start_time = rng.uniform(0.0, self.duration - subsegment_length) return self.subsegment(start_time, start_time + subsegment_length) - def convolve(self, ir, allow_resampling=False): + def convolve(self, impulse_segment, allow_resample=False): """Convolve this audio segment with the given filter. - :param ir: impulse response - :type ir: AudioSegment - :param allow_resampling: indicates whether resampling is allowed - when the ir has a different sample rate from this signal. - :type allow_resampling: boolean - """ - if allow_resampling and self.sample_rate != ir.sample_rate: - ir = ir.resample(self.sample_rate) - - if self.sample_rate != ir.sample_rate: - raise ValueError("Impulse response sample rate ({}Hz) is " - "equal to base signal sample rate ({}Hz)." - .format(ir.sample_rate, self.sample_rate)) + Note that this is an in-place transformation. - samples = signal.fftconvolve(self.samples, ir.samples, "full") + :param impulse_segment: Impulse response segments. + :type impulse_segment: AudioSegment + :param allow_resample: indicates whether resampling is allowed when + the impulse_segment has a different sample + rate from this signal. + :type allow_resample: boolean + :raises ValueError: If the sample rate is not match between two + audio segments and resample is not allowed. + """ + if allow_resample and self.sample_rate != impulse_segment.sample_rate: + impulse_segment = impulse_segment.resample(self.sample_rate) + if self.sample_rate != impulse_segment.sample_rate: + raise ValueError("Impulse segment's sample rate (%d Hz) is not" + "equal to base signal sample rate (%d Hz)." % + (impulse_segment.sample_rate, self.sample_rate)) + samples = signal.fftconvolve(self.samples, impulse_segment.samples, + "full") self._samples = samples - def convolve_and_normalize(self, ir, allow_resample=False): + def convolve_and_normalize(self, impulse_segment, allow_resample=False): """Convolve and normalize the resulting audio segment so that it has the same average power as the input signal. - :param ir: impulse response - :type ir: AudioSegment - :param allow_resampling: indicates whether resampling is allowed - when the ir has a different sample rate from this signal. - :type allow_resampling: boolean + :param impulse_segment: Impulse response segments. + :type impulse_segment: AudioSegment + :param allow_resample: indicates whether resampling is allowed when + the impulse_segment has a different sample rate from this signal. + :type allow_resample: boolean """ - self.convolve(ir, allow_resampling=allow_resampling) - self.normalize(target_db=self.rms_db) + target_db = self.rms_db + self.convolve(impulse_segment, allow_resample=allow_resample) + self.normalize(target_db) def add_noise(self, noise, @@ -492,36 +460,33 @@ class AudioSegment(object): of matching length is sampled from it and used instead. :param noise: Noise signal to add. - :type noise: SpeechDLSegment + :type noise: AudioSegment :param snr_dB: Signal-to-Noise Ratio, in decibels. - :type snr_dB: scalar - :param allow_downsampling: whether to allow the noise signal - to be downsampled to match the base signal sample rate. + :type snr_dB: float + :param allow_downsampling: whether to allow the noise signal to be downsampled + to match the base signal sample rate. :type allow_downsampling: boolean - :param max_gain_db: Maximum amount of gain to apply to noise - signal before adding it in. This is to prevent attempting - to apply infinite gain to a zero signal. - :type max_gain_db: scalar + :param max_gain_db: Maximum amount of gain to apply to noise signal before + adding it in. This is to prevent attempting to apply infinite + gain to a zero signal. + :type max_gain_db: float :param rng: Random number generator state. :type rng: random.Random - - Returns: - SpeechDLSegment: signal with noise added. + :raises ValueError: If the sample rate does not match between the two audio segments + and resample is not allowed, or if the duration of noise segments + is shorter than original audio segments. """ - if rng is None: - rng = random.Random() - + rng = random.Random() if rng is None else rng if allow_downsampling and noise.sample_rate > self.sample_rate: noise = noise.resample(self.sample_rate) - if noise.sample_rate != self.sample_rate: - raise ValueError("Noise sample rate ({}Hz) is not equal to " - "base signal sample rate ({}Hz)." - .format(noise.sample_rate, self.sample_rate)) + raise ValueError("Noise sample rate (%d Hz) is not equal to " + "base signal sample rate (%d Hz)." % + (noise.sample_rate, self.sample_rate)) if noise.duration < self.duration: - raise ValueError("Noise signal ({} sec) must be at " - "least as long as base signal ({} sec)." - .format(noise.duration, self.duration)) + raise ValueError("Noise signal (%f sec) must be at " + "least as long as base signal (%f sec)." % + (noise.duration, self.duration)) noise_gain_db = self.rms_db - noise.rms_db - snr_dB noise_gain_db = min(max_gain_db, noise_gain_db) noise_subsegment = noise.random_subsegment(self.duration, rng=rng) @@ -529,6 +494,12 @@ class AudioSegment(object): self._samples = output._samples self._sample_rate = output._sample_rate + def tranform_noise(self, noise_subsegment, noise_gain_db): + """ tranform noise file + """ + return type(self)(noise_subsegment._samples * (10.**( + noise_gain_db / 20.)), noise_subsegment._sample_rate) + @property def samples(self): """Return audio samples. @@ -618,9 +589,3 @@ class AudioSegment(object): else: raise TypeError("Unsupported sample type: %s." % samples.dtype) return output_samples.astype(dtype) - - def tranform_noise(self, noise_subsegment, noise_gain_db): - """ tranform noise file - """ - return type(self)(noise_subsegment._samples * (10.**( - noise_gain_db / 20.)), noise_subsegment._sample_rate) diff --git a/deep_speech_2/data_utils/augmentor/audio_database.py b/deep_speech_2/data_utils/augmentor/audio_database.py deleted file mode 100755 index e41c6dd72be16840bc49343d02a5c95c28828464..0000000000000000000000000000000000000000 --- a/deep_speech_2/data_utils/augmentor/audio_database.py +++ /dev/null @@ -1,401 +0,0 @@ -from __future__ import print_function -from collections import defaultdict -import bisect -import logging -import numpy as np -import os -import random -import sys - -UNK_TAG = "" - - -def stream_audio_index(fname, UNK=UNK_TAG): - """Reads an audio index file and emits one record in the index at a time. - - :param fname: audio index path - :type fname: basestring - :param UNK: UNK token to denote that certain audios are not tagged. - :type UNK: basesring - - Yields: - idx, duration, size, relpath, tags (int, float, int, str, list(str)): - audio file id, length of the audio in seconds, size in byte, - relative path w.r.t. to the root noise directory, list of tags - """ - with open(fname) as audio_index_file: - for i, line in enumerate(audio_index_file): - tok = line.strip().split("\t") - assert len(tok) >= 4, \ - "Invalid line at line {} in file {}".format( - i + 1, audio_index_file) - idx = int(tok[0]) - duration = float(tok[1]) - # Sometimes, the duration can round down to 0.0 - assert duration >= 0.0, \ - "Invalid duration at line {} in file {}".format( - i + 1, audio_index_file) - size = int(tok[2]) - assert size > 0, \ - "Invalid size at line {} in file {}".format( - i + 1, audio_index_file) - relpath = tok[3] - if len(tok) == 4: - tags = [UNK_TAG] - else: - tags = tok[4:] - yield idx, duration, size, relpath, tags - - -def truncate_float(val, ndigits=6): - """ Truncates a floating-point value to have the desired number of - digits after the decimal point. - - :param val: input value. - :type val: float - :parma ndigits: desired number of digits. - :type ndigits: int - - :return: truncated value - :rtype: float - """ - p = 10.0**ndigits - return float(int(val * p)) / p - - -def print_audio_index(idx, duration, size, relpath, tags, file=sys.stdout): - """Prints an audio record to the index file. - - :param idx: Audio file id. - :type idx: int - :param duration: length of the audio in seconds - :type duration: float - :param size: size of the file in bytes - :type size: int - :param relpath: relative path w.r.t. to the root noise directory. - :type relpath: basestring - :parma tags: list of tags - :parma tags: list(str) - :parma file: file to which we want to write an audio record. - :type file: sys.stdout - """ - file.write("{}\t{:.6f}\t{}\t{}" - .format(idx, truncate_float(duration, ndigits=6), size, relpath)) - for tag in tags: - file.write("\t{}".format(tag)) - file.write("\n") - - -class AudioIndex(object): - """ In-memory index of audio files that do not have annotations. - This supports duration-based sampling and sampling from a target - distribution. - - Each line in the index file consists of the following fields: - (id (int), duration (float), size (int), relative path (str), - list of tags ([str])) - """ - - def __init__(self): - self.audio_dir = None - self.index_fname = None - self.tags = None - self.bin_size = 2.0 - self.clear() - - def clear(self): - """ Clears the index - - Returns: - None - """ - self.idx_to_record = {} - # The list of indices correspond to audio files whose duration is - # greater than or equal to the key. - self.duration_to_id_set = {} - self.duration_to_id_set_per_tag = defaultdict(lambda: {}) - self.duration_to_list = defaultdict(lambda: []) - self.duration_to_list_per_tag = defaultdict( - lambda: defaultdict(lambda: [])) - self.tag_to_id_set = defaultdict(lambda: set()) - self.shared_duration_bins = [] - self.id_set_complete = set() - self.id_set = set() - self.duration_bins = [] - - def has_audio(self, distr=None): - """ - :param distr: The target distribution of audio tags that we want to - match. If this is not supplied, the function simply checks that - there are some audio files. - :parma distr: dict - :return: True if there are audio files. - :rtype: boolean - """ - if distr is None: - return len(self.id_set) > 0 - else: - for tag in distr: - if tag not in self.duration_to_list_per_tag: - return False - return True - - def _load_all_records_from_disk(self, audio_dir, idx_fname, bin_size): - """Loads all audio records from the disk into memory and groups them - into chunks based on their duration and the bin_size granalarity. - - Once all the records are read, indices are built from these records - by another function so that the audio samples can be drawn efficiently. - - Updates: - self.audio_dir (path): audio root directory - self.idx_fname (path): audio database index filename - self.bin_size (float): granularity of bins - self.idx_to_record (dict): maps from the audio id to - (duration, file_size, relative_path, tags) - self.tag_to_id_set (dict): maps from the tag to - the set of id's of audios that have this tag. - self.id_set_complete (set): set of all audio id's in the index file - self.min_duration (float): minimum audio duration observed in the - index file - self.duration_bins (list): the lower bounds on the duration of - audio files falling in each bin - self.duration_to_id_set (dict): contains (k, v) where v is the set - of id's of audios whose lengths are longer than or equal to k. - (e.g. k is the duration lower bound of this bin). - self.duration_to_id_set_per_tag (dict): Something like above but - has a finer granularity mapping from the tag to - duration_to_id_set. - self.shared_duration_bins (list): list of sets where each set - contains duration lower bounds whose audio id sets are the - same. The rationale for having this is that there are a few - but extremely long audio files which lead to a lot of bins. - When the id sets do not change across various minimum duration - boundaries, we - cluster these together and make them point to the same id set - reference. - - :return: whether the records were read from the disk. The assumption is - that the audio index file on disk and the actual audio files - are constructed once and never change during training. We only - re-read when either the directory or the index file path change. - """ - if self.audio_dir == audio_dir and self.idx_fname == idx_fname and \ - self.bin_size == bin_size: - # The audio directory and/or the list of audio files - # haven't changed. No need to load the list again. - return False - - # Remember where the audio index is most recently read from. - self.audio_dir = audio_dir - self.idx_fname = idx_fname - self.bin_size = bin_size - - # Read in the idx and compute the number of bins necessary - self.clear() - rank = [] - min_duration = float('inf') - max_duration = float('-inf') - for idx, duration, file_size, relpath, tags in \ - stream_audio_index(idx_fname): - self.idx_to_record[idx] = (duration, file_size, relpath, tags) - max_duration = max(max_duration, duration) - min_duration = min(min_duration, duration) - rank.append((duration, idx)) - for tag in tags: - self.tag_to_id_set[tag].add(idx) - if len(rank) == 0: - # file is empty - raise IOError("Index file {} is empty".format(idx_fname)) - for tag in self.tag_to_id_set: - self.id_set_complete |= self.tag_to_id_set[tag] - dur = min_duration - self.min_duration = min_duration - while dur < max_duration + bin_size: - self.duration_bins.append(dur) - dur += bin_size - - # Sort in decreasing order of duration and populate - # the cumulative indices lists. - rank.sort(reverse=True) - - # These are indices for `rank` and used to keep track of whether - # there are new records to add in the current bin. - last = 0 - cur = 0 - - # The set of audios falling in the previous bin; in the case, - # where we don't find new audios for the current bin, we store - # the reference to the last set so as to conserve memory. - # This is not such a big problem if the audio duration is - # bounded by a small number like 30 seconds and the - # bin size is big enough. But, for raw freesound audios, - # some audios can be as long as a few hours! - last_audio_set = set() - - # The same but for each tag so that we can pick audios based on - # tags and also some user-specified tag distribution. - last_audio_set_per_tag = defaultdict(lambda: set()) - - # Set of lists of bins sharing the same audio sets. - shared = set() - - for i in range(len(self.duration_bins) - 1, -1, -1): - lower_bound = self.duration_bins[i] - new_audio_idxs = set() - new_audio_idxs_per_tag = defaultdict(lambda: set()) - while cur < len(rank) and rank[cur][0] >= lower_bound: - idx = rank[cur][1] - tags = self.idx_to_record[idx][3] - new_audio_idxs.add(idx) - for tag in tags: - new_audio_idxs_per_tag[tag].add(idx) - cur += 1 - # This makes certain that the same list is shared across - # different bins if no new indices are added. - if cur == last: - shared.add(lower_bound) - else: - last_audio_set = last_audio_set | new_audio_idxs - for tag in new_audio_idxs_per_tag: - last_audio_set_per_tag[tag] = \ - last_audio_set_per_tag[tag] | \ - new_audio_idxs_per_tag[tag] - if len(shared) > 0: - self.shared_duration_bins.append(shared) - shared = set([lower_bound]) - ### last_audio_set = set() should set blank - last = cur - self.duration_to_id_set[lower_bound] = last_audio_set - for tag in last_audio_set_per_tag: - self.duration_to_id_set_per_tag[lower_bound][tag] = \ - last_audio_set_per_tag[tag] - - # The last `shared` record isn't added to the `shared_duration_bins`. - self.shared_duration_bins.append(shared) - - # We make sure that the while loop above has exhausted through the - # `rank` list by checking if the `cur`rent index in `rank` equals - # the length of the array, which is the halting condition. - assert cur == len(rank) - - return True - - def _build_index_from_records(self, tag_list): - """ Uses the in-memory records read from the index file to build - an in-memory index restricted to the given tag list. - - :param tag_list: List of tags we are interested in sampling from. - :type tag_list: list(str) - - Updates: - self.id_set (set): the set of all audio id's that can be sampled. - self.duration_to_list (dict): maps from the duration lower bound - to the id's of audios longer than this duration. - self.duration_to_list_per_tag (dict): maps from the tag to - the same structure as self.duration_to_list. This is to support - sampling from a target noise distribution. - - :return: whether the index was built from scratch - """ - if self.tags == tag_list: - return False - - self.tags = tag_list - if len(tag_list) == 0: - self.id_set = self.id_set_complete - else: - self.id_set = set() - for tag in tag_list: - self.id_set |= self.tag_to_id_set[tag] - - # Next, we need to take a subset of the audio files - for shared in self.shared_duration_bins: - # All bins in `shared' have the same index lists - # so we can intersect once and set all of them to this list. - lb = list(shared)[0] - intersected = list(self.id_set & self.duration_to_id_set[lb]) - duration_to_id_set = self.duration_to_id_set_per_tag[lb] - intersected_per_tag = { - tag: self.tag_to_id_set[tag] & duration_to_id_set[tag] - for tag in duration_to_id_set - } - for bin_key in shared: - self.duration_to_list[bin_key] = intersected - for tag in intersected_per_tag: - self.duration_to_list_per_tag[tag][bin_key] = \ - intersected_per_tag[tag] - assert len(self.duration_to_list) == len(self.duration_to_id_set) - return True - - def refresh_records_from_index_file(self, - audio_dir, - idx_fname, - tag_list, - bin_size=2.0): - """ Loads the index file and populates the records - for building the internal index. - - If the audio directory or index file name has changed, the whole index - is reloaded from scratch. If only the tag_list is changed, then the - desired index is built from the complete, in-memory record. - - :param audio_dir: audio directory - :type audio_dir: basestring - :param idx_fname: audio index file name - :type idex_fname: basestring - :param tag_list: list of tags we are interested in loading; - if empty, we load all. - :type tag_list: list - :param bin_size: optional argument for controlling the granularity - of duration bins - :type bin_size: float - """ - if tag_list is None: - tag_list = [] - reloaded_records = self._load_all_records_from_disk(audio_dir, - idx_fname, bin_size) - if reloaded_records or self.tags != tag_list: - self._build_index_from_records(tag_list) - logger.info('loaded {} audio files from {}' - .format(len(self.id_set), idx_fname)) - - def sample_audio(self, duration, rng=None, distr=None): - """ Uniformly draws an audio record of at least the desired duration - - :param duration: minimum desired audio duration - :type duration: float - :param rng: random number generator - :type rng: random.Random - :param distr: target distribution of audio tags. If not provided, - :type distr: dict - all audio files are sampled uniformly at random. - - :returns: success, (duration, file_size, path) - """ - if duration < 0.0: - duration = self.min_duration - i = bisect.bisect_left(self.duration_bins, duration) - if i == len(self.duration_bins): - return False, None - bin_key = self.duration_bins[i] - if distr is None: - indices = self.duration_to_list[bin_key] - else: - # If a desired audio distribution is given, we sample from it. - if rng is None: - rng = random.Random() - nprng = np.random.RandomState(rng.getrandbits(32)) - prob_masses = distr.values() - prob_masses /= np.sum(prob_masses) - tag = nprng.choice(distr.keys(), p=prob_masses) - indices = self.duration_to_list_per_tag[tag][bin_key] - if len(indices) == 0: - return False, None - else: - if rng is None: - rng = random.Random() - # duration, file size and relative path from root - s = self.idx_to_record[rng.sample(indices, 1)[0]] - s = (s[0], s[1], os.path.join(self.audio_dir, s[2])) - return True, s diff --git a/deep_speech_2/data_utils/augmentor/augmentation.py b/deep_speech_2/data_utils/augmentor/augmentation.py index c0a70ad186e4401326dd1f5b80b829fde4b099c7..abe1a0ec89c5d6fc6f8ac1822df184cf5db4d7e1 100755 --- a/deep_speech_2/data_utils/augmentor/augmentation.py +++ b/deep_speech_2/data_utils/augmentor/augmentation.py @@ -6,11 +6,6 @@ from __future__ import print_function import json import random from data_utils.augmentor.volume_perturb import VolumePerturbAugmentor -from data_utils.augmentor.resamler import ResamplerAugmentor -from data_utils.augmentor.speed_perturb import SpeedPerturbatioAugmentor -from data_utils.augmentor.online_bayesian_normalization import OnlineBayesianNormalizationAugmentor -from data_utils.augmentor.Impulse_response import ImpulseResponseAugmentor -from data_utils.augmentor.noise_speech import NoiseSpeechAugmentor class AugmentationPipeline(object): @@ -81,15 +76,5 @@ class AugmentationPipeline(object): """Return an augmentation model by the type name, and pass in params.""" if augmentor_type == "volume": return VolumePerturbAugmentor(self._rng, **params) - if augmentor_type == "resamle": - return ResamplerAugmentor(self._rng, **params) - if augmentor_type == "speed": - return SpeedPerturbatioAugmentor(self._rng, **params) - if augmentor_type == "online_bayesian_normalization": - return OnlineBayesianNormalizationAugmentor(self._rng, **params) - if augmentor_type == "Impulse_response": - return ImpulseResponseAugmentor(self._rng, **params) - if augmentor_type == "noise_speech": - return NoiseSpeechAugmentor(self._rng, **params) else: raise ValueError("Unknown augmentor type [%s]." % augmentor_type) diff --git a/deep_speech_2/data_utils/augmentor/implus_response.py b/deep_speech_2/data_utils/augmentor/implus_response.py deleted file mode 100755 index cc2053421a51c9833367e985a0b17e4e996b5378..0000000000000000000000000000000000000000 --- a/deep_speech_2/data_utils/augmentor/implus_response.py +++ /dev/null @@ -1,76 +0,0 @@ -""" Impulse response""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from . import base -from . import audio_database -from data_utils.speech import SpeechSegment - - -class ImpulseResponseAugmentor(base.AugmentorBase): - """ Instantiates an impulse response model - - :param ir_dir: directory containing impulse responses - :type ir_dir: basestring - :param tags: optional parameter for specifying what - particular impulse responses to apply. - :type tags: list - :parm tag_distr: optional noise distribution - :type tag_distr: dict - """ - - def __init__(self, rng, ir_dir, index_file, tags=None, tag_distr=None): - # Define all required parameter maps here. - self.ir_dir = ir_dir - self.index_file = index_file - - self.tags = tags - self.tag_distr = tag_distr - - self.audio_index = audio_database.AudioIndex() - self.rng = rng - - def _init_data(self): - """ Preloads stuff from disk in an attempt (e.g. list of files, etc) - to make later loading faster. If the data configuration remains the - same, this function does nothing. - - """ - self.audio_index.refresh_records_from_index_file( - self.ir_dir, self.index_file, self.tags) - - def transform_audio(self, audio_segment): - """ Convolves the input audio with an impulse response. - - :param audio_segment: input audio - :type audio_segment: AudioSegemnt - """ - # This handles the cases where the data source or directories change. - self._init_data() - - read_size = 0 - tag_distr = self.tag_distr - if not self.audio_index.has_audio(tag_distr): - if tag_distr is None: - if not self.tags: - raise RuntimeError("The ir index does not have audio " - "files to sample from.") - else: - raise RuntimeError("The ir index does not have audio " - "files of the given tags to sample " - "from.") - else: - raise RuntimeError("The ir index does not have audio " - "files to match the target ir " - "distribution.") - else: - # Querying with a negative duration triggers the index to search - # from all impulse responses. - success, record = self.audio_index.sample_audio( - -1.0, rng=self.rng, distr=tag_distr) - if success is True: - _, read_size, ir_fname = record - ir_wav = SpeechSegment.from_file(ir_fname) - audio_segment.convolve(ir_wav, allow_resampling=True) diff --git a/deep_speech_2/data_utils/augmentor/noise_speech.py b/deep_speech_2/data_utils/augmentor/noise_speech.py deleted file mode 100755 index 8cf7c27b660f432b040871f36307e734558a42fc..0000000000000000000000000000000000000000 --- a/deep_speech_2/data_utils/augmentor/noise_speech.py +++ /dev/null @@ -1,318 +0,0 @@ -""" noise speech -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -import logging -import numpy as np -import os -from collections import defaultdict - -from . import base -from . import audio_database -from data_utils.speech import SpeechSegment - -TURK = "turk" -USE_AUDIO_DATABASE_SOURCES = frozenset(["freesound", "chime"]) -HALF_NOISE_LENGTH_MIN_THRESHOLD = 3.0 -FIND_NOISE_MAX_ATTEMPTS = 20 - -logger = logging.getLogger(__name__) - - -def get_first_smaller(items, value): - index = bisect.bisect_left(items, value) - 1 - assert items[index] < value, \ - 'get_first_smaller failed! %d %d' % (items[index], value) - return items[index] - - -def get_first_larger(items, value): - 'Find leftmost value greater than value' - index = bisect.bisect_right(items, value) - assert index < len(items), \ - "no noise bin exists for this audio length (%f)" % value - assert items[index] > value, \ - 'get_first_larger failed! %d %d' % (items[index], value) - return items[index] - - -def _get_turk_noise_files(noise_dir, index_file): - """ Creates a map from duration => a list of noise filenames - - :param noise_dir: Directory of noise files which contains - "noise-samples-list" - :type noise_dir: basestring - :param index_file: Noise list - :type index_file: basestring - - returns:noise_files (defaultdict): A map of bins to noise files. - Each key is the duration, and the value is a list of noise - files binned to this duration. Each bin is 2 secs. - - Note: noise-samples-list should contain one line per noise (wav) file - along with its duration in milliseconds - """ - noise_files = defaultdict(list) - if not os.path.exists(index_file): - logger.error('No noise files were found at {}'.format(index_file)) - return noise_files - num_noise_files = 0 - rounded_durations = list(range(0, 65, 2)) - with open(index_file, 'r') as fl: - for line in fl: - fname = os.path.join(noise_dir, line.strip().split()[0]) - duration = float(line.strip().split()[1]) / 1000 - # bin the noise files into length bins rounded by 2 sec - bin_id = get_first_smaller(rounded_durations, duration) - noise_files[bin_id].append(fname) - num_noise_files += 1 - logger.info('Loaded {} turk noise files'.format(num_noise_files)) - return noise_files - - -class NoiseSpeechAugmentor(base.AugmentorBase): - """ Noise addition block - - :param snr_min: minimum signal-to-noise ratio - :type snr_min: float - :param snr_max: maximum signal-to-noise ratio - :type snr_max: float - :param noise_dir: root of where noise files are stored - :type noise_fir: basestring - :param index_file: index of noises of interest in noise_dir - :type index_file: basestring - :param source: select one from - - turk - - freesound - - chime - Note that this field is no longer required for the freesound - and chime - :type source: string - :param tags: optional parameter for specifying what - particular noises we want to add. See above for the available tags. - :type tags: list - :param tag_distr: optional noise distribution - :type tag_distr: dict - """ - - def __init__(self, - rng, - snr_min, - snr_max, - noise_dir, - source, - allow_downsampling=None, - index_file=None, - tags=None, - tag_distr=None): - # Define all required parameter maps here. - self.rng = rng - self.snr_min = snr_min - self.snr_max = snr_max - self.noise_dir = noise_dir - self.source = source - - self.allow_downsampling = allow_downsampling - self.index_file = index_file - self.tags = tags - self.tag_distr = tag_distr - - # When new noise sources are added, make sure to define the - # associated bookkeeping variables here. - self.turk_noise_files = [] - self.turk_noise_dir = None - self.audio_index = audio_database.AudioIndex() - - def _init_data(self): - """ Preloads stuff from disk in an attempt (e.g. list of files, etc) - to make later loading faster. If the data configuration remains the - same, this function does nothing. - - """ - noise_dir = self.noise_dir - index_file = self.index_file - source = self.source - if not index_file: - if source == TURK: - index_file = os.path.join(noise_dir, 'noise-samples-list') - logger.debug("index_file not provided; " + "defaulting to " + - index_file) - else: - if source != "": - assert source in USE_AUDIO_DATABASE_SOURCES, \ - "{} not supported by audio_database".format(source) - index_file = os.path.join(noise_dir, - "audio_index_commercial.txt") - logger.debug("index_file not provided; " + "defaulting to " + - index_file) - - if source == TURK: - if self.turk_noise_dir != noise_dir: - self.turk_noise_dir = noise_dir - self.turk_noise_files = _get_turk_noise_files(noise_dir, - index_file) - # elif source == TODO_SUPPORT_NON_AUDIO_DATABASE_BASED_SOURCES: - else: - if source != "": - assert source in USE_AUDIO_DATABASE_SOURCES, \ - "{} not supported by audio_database".format(source) - self.audio_index.refresh_records_from_index_file( - self.noise_dir, index_file, self.tags) - - def transform_audio(self, audio_segment): - """Adds walla noise - - :param audio_segment: Input audio - :type audio_segment: SpeechSegment - """ - # This handles the cases where the data source or directories change. - self._init_data - source = self.source - allow_downsampling = self.allow_downsampling - if source == TURK: - self._add_turk_noise(audio_segment, self.rng, allow_downsampling) - # elif source == TODO_SUPPORT_NON_AUDIO_DATABASE_BASED_SOURCES: - else: - self._add_noise(audio_segment, self.rng, allow_downsampling) - - def _sample_snr(self): - """ Returns a float sampled in [`self.snr_min`, `self.snr_max`] - if both `self.snr_min` and `self.snr_max` are non-zero. - """ - snr_min = self.snr_min - snr_max = self.snr_max - sampled_snr = self.rng.uniform(snr_min, snr_max) - return sampled_snr - - def _add_turk_noise(self, audio_segment, allow_downsampling): - """ Adds a turk noise to the input audio. - - :param audio_segment: input audio - :type audio_segment: audiosegment - :param allow_downsampling: indicates whether downsampling - is allowed - :type allow_downsampling: boolean - """ - read_size = 0 - if len(self.turk_noise_files) > 0: - snr = self._sample_snr(self.rng) - # Draw the noise file randomly from noise files that are - # slightly longer than the utterance - noise_bins = sorted(self.turk_noise_files.keys()) - # note some bins can be empty, so we can't just round up - # to the nearest 2-sec interval - rounded_duration = get_first_larger(noise_bins, - audio_segment.duration) - noise_fname = \ - self.rng.sample(self.turk_noise_files[rounded_duration], 1)[0] - noise = SpeechSegment.from_wav_file(noise_fname) - logger.debug('noise_fname {}'.format(noise_fname)) - logger.debug('snr {}'.format(snr)) - read_size = len(noise) * 2 - # May throw exceptions, but this is caught by - # AudioFeaturizer.get_audio_files. - audio_segment.add_noise( - noise, snr, rng=self.rng, allow_downsampling=allow_downsampling) - - def _add_noise(self, audio_segment, allow_downsampling): - """ Adds a noise indexed in audio_database.AudioIndex. - - :param audio_segment: input audio - :type audio_segment: SpeechSegment - :param allow_downsampling: indicates whether downsampling - is allowed - :type allow_downsampling: boolean - - Returns: - (SpeechSegment, int) - - sound with turk noise added - - number of bytes read from disk - """ - read_size = 0 - tag_distr = self.tag_distr - if not self.audio_index.has_audio(tag_distr): - if tag_distr is None: - if not self.tags: - raise RuntimeError("The noise index does not have audio " - "files to sample from.") - else: - raise RuntimeError("The noise index does not have audio " - "files of the given tags to sample " - "from.") - else: - raise RuntimeError("The noise index does not have audio " - "files to match the target noise " - "distribution.") - else: - # Compute audio segment related statistics - audio_duration = audio_segment.duration - - # Sample relevant augmentation parameters. - snr = self._sample_snr(self.rng) - - # Perhaps, we may not have a sufficiently long noise, so we need - # to search iteratively. - min_duration = audio_duration + 0.25 - for _ in range(FIND_NOISE_MAX_ATTEMPTS): - logger.debug("attempting to find noise of length " - "at least {}".format(min_duration)) - - success, record = \ - self.audio_index.sample_audio(min_duration, - rng=self.rng, - distr=tag_distr) - - if success is True: - noise_duration, read_size, noise_fname = record - - # Assert after logging so we know - # what caused augmentation to fail. - logger.debug("noise_fname {}".format(noise_fname)) - logger.debug("snr {}".format(snr)) - assert noise_duration >= min_duration - break - - # Decrease the desired minimum duration linearly. - # If the value becomes smaller than some threshold, - # we half the value instead. - if min_duration > HALF_NOISE_LENGTH_MIN_THRESHOLD: - min_duration -= 2.0 - else: - min_duration *= 0.5 - - if success is False: - logger.info("Failed to find a noise file") - return - - diff_duration = audio_duration + 0.25 - noise_duration - if diff_duration >= 0.0: - # Here, the noise is shorter than the audio file, so - # we pad with zeros to make sure the noise sound is applied - # with a uniformly random shift. - noise = SpeechSegment.from_file(noise_fname) - noise = noise.pad_silence(diff_duration, sides="both") - else: - # The noise clip is at least ~25 ms longer than the audio - # segment here. - diff_duration = int(noise_duration * audio_segment.sample_rate) - \ - int(audio_duration * audio_segment.sample_rate) - \ - int(0.02 * audio_segment.sample_rate) - start = float(self.rng.randint(0, diff_duration)) / \ - audio.sample_rate - finish = min(start + audio_duration + 0.2, noise_duration) - noise = SpeechSegment.slice_from_file(noise_fname, start, - finish) - - if len(noise) < len(audio_segment): - # This is to ensure that the noise clip is at least as - # long as the audio segment. - num_samples_to_pad = len(audio_segment) - len(noise) - # Padding this amount of silence on both ends ensures that - # the placement of the noise clip is uniformly random. - silence = SpeechSegment( - np.zeros(num_samples_to_pad), audio_segment.sample_rate) - noise = SpeechSegment.concatenate(silence, noise, silence) - - audio_segment.add_noise( - noise, snr, rng=self.rng, allow_downsampling=allow_downsampling) diff --git a/deep_speech_2/data_utils/augmentor/online_bayesian_normalization.py b/deep_speech_2/data_utils/augmentor/online_bayesian_normalization.py deleted file mode 100755 index bc2d6c1b65103fbe4d4bac541bada02a2ad39fc0..0000000000000000000000000000000000000000 --- a/deep_speech_2/data_utils/augmentor/online_bayesian_normalization.py +++ /dev/null @@ -1,57 +0,0 @@ -""" Online bayesian normalization -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from . import base - - -class OnlineBayesianNormalizationAugmentor(base.AugmentorBase): - """ - Instantiates an online bayesian normalization module. - :param target_db: Target RMS value in decibels - :type target_db: func[int->scalar] - :param prior_db: Prior RMS estimate in decibels - :type prior_db: func[int->scalar] - :param prior_samples: Prior strength in number of samples - :type prior_samples: func[int->scalar] - :param startup_delay: Start-up delay in seconds during - which normalization statistics is accrued. - :type starup_delay: func[int->scalar] - """ - - def __init__(self, - rng, - target_db, - prior_db, - prior_samples, - startup_delay=base.parse_parameter_from(0.0)): - - self.target_db = target_db - self.prior_db = prior_db - self.prior_samples = prior_samples - self.startup_delay = startup_delay - self.rng = rng - - def transform_audio(self, audio_segment): - """ - Normalizes the input audio using the online Bayesian approach. - - :param audio_segment: input audio - :type audio_segment: SpeechSegment - :param iteration: current iteration - :type iteration: int - :param text: audio transcription - :type text: basestring - :param rng: RNG to use for augmentation - :type rng: random.Random - - """ - read_size = 0 - target_db = self.target_db(iteration) - prior_db = self.prior_db(iteration) - prior_samples = self.prior_samples(iteration) - startup_delay = self.startup_delay(iteration) - audio.normalize_online_bayesian( - target_db, prior_db, prior_samples, startup_delay=startup_delay) diff --git a/deep_speech_2/data_utils/augmentor/resampler.py b/deep_speech_2/data_utils/augmentor/resampler.py deleted file mode 100755 index 1b959be56c03e873da561567ebc8457fcdea9efe..0000000000000000000000000000000000000000 --- a/deep_speech_2/data_utils/augmentor/resampler.py +++ /dev/null @@ -1,30 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from . import base - - -class ResamplerAugmentor(base.AugmentorBase): - """ Instantiates a resampler module. - - :param new_sample_rate: New sample rate in Hz - :type new_sample_rate: func[int->scalar] - :param rng: Random generator object. - :type rng: random.Random - """ - - def __init__(self, rng, new_sample_rate): - self.new_sample_rate = new_sample_rate - self._rng = rng - - def transform_audio(self, audio_segment): - """ Resamples the input audio to the target sample rate. - - Note that this is an in-place transformation. - - :param audio: input audio - :type audio: SpeechDLSegment - """ - new_sample_rate = self.new_sample_rate - audio.resample(new_sample_rate) \ No newline at end of file diff --git a/deep_speech_2/data_utils/augmentor/speed_perturb.py b/deep_speech_2/data_utils/augmentor/speed_perturb.py deleted file mode 100755 index e09be5f74ecb09a387df71a6e1618ebdfe2176b8..0000000000000000000000000000000000000000 --- a/deep_speech_2/data_utils/augmentor/speed_perturb.py +++ /dev/null @@ -1,53 +0,0 @@ -"""Speed perturbation module for making ASR robust to different voice -types (high pitched, low pitched, etc) -Samples uniformly between speed_min and speed_max -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from . import base - - -class SpeedPerturbatioAugmentor(base.AugmentorBase): - """ - Instantiates a speed perturbation module. - - See reference paper here: - - http://www.danielpovey.com/files/2015_interspeech_augmentation.pdf - - :param speed_min: Lower bound on new rate to sample - :type speed_min: func[int->scalar] - :param speed_max: Upper bound on new rate to sample - :type speed_max: func[int->scalar] - """ - - def __init__(self, rng, speed_min, speed_max): - - if (speed_min < 0.9): - raise ValueError( - "Sampling speed below 0.9 can cause unnatural effects") - if (speed_min > 1.1): - raise ValueError( - "Sampling speed above 1.1 can cause unnatural effects") - self.speed_min = speed_min - self.speed_max = speed_max - self.rng = rng - - def transform_audio(self, audio_segment): - """ - Samples a new speed rate from the given range and - changes the speed of the given audio clip. - - Note that this is an in-place transformation. - - :param audio_segment: input audio - :type audio_segment: SpeechDLSegment - """ - read_size = 0 - speed_min = self.speed_min(iteration) - speed_max = self.speed_max(iteration) - sampled_speed = rng.uniform(speed_min, speed_max) - audio = audio.change_speed(sampled_speed) diff --git a/deep_speech_2/data_utils/augmentor/volume_perturb.py b/deep_speech_2/data_utils/augmentor/volume_perturb.py index 15055b9154e3488e8e650e724d613cc3f2685f80..a5a9f6cadac13e651dd6902d68d0efdaa9a61dc4 100755 --- a/deep_speech_2/data_utils/augmentor/volume_perturb.py +++ b/deep_speech_2/data_utils/augmentor/volume_perturb.py @@ -3,10 +3,10 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from . import base +from data_utils.augmentor.base import AugmentorBase -class VolumePerturbAugmentor(base.AugmentorBase): +class VolumePerturbAugmentor(AugmentorBase): """Augmentation model for adding random volume perturbation. This is used for multi-loudness training of PCEN. See diff --git a/deep_speech_2/requirements.txt b/deep_speech_2/requirements.txt index 58a93debe49fca93e5df72164a5f8cf31291f0bd..c37e88ffe75dfda401726f485fd9928cbb477fab 100644 --- a/deep_speech_2/requirements.txt +++ b/deep_speech_2/requirements.txt @@ -1,2 +1,4 @@ SoundFile==0.9.0.post1 wget==3.2 +scikits.samplerate==0.3.3 +scipy==0.13.0b1