add audio part

42ba74ef · chrisxu2014 · a84bdf64 · 42ba74ef · a84bdf64 · 42ba74ef
10 changed files
--- a/deep_speech_2/data_utils/audio.py
+++ b/deep_speech_2/data_utils/audio.py
@@ -8,6 +8,7 @@ import io
 import soundfile
 import scikits.samplerate
 from scipy import signal
+import random
 class AudioSegment(object):
@@ -46,6 +47,32 @@ class AudioSegment(object):
        """Return whether two objects are unequal."""
        return not self.__eq__(other)
+    def __len__(self):
+        """Returns length of segment in samples."""
+        return self.num_samples
+    def __add__(self, other):
+        """Add samples from another segment to those of this segment and return
+        a new segment (sample-wise addition, not segment concatenation).
+        :param other: Segment containing samples to be
+                      added in.
+        :type other: AudioSegment
+        :return: New segment containing resulting samples.
+        :rtype: AudioSegment
+        :raise TypeError: If sample rates of segments don't match,
+                          or if length of segments don't match.
+        """
+        if type(self) != type(other):
+            raise TypeError("Cannot add segment of different type: {}"
+                            .format(type(other)))
+        if self._sample_rate != other._sample_rate:
+            raise TypeError("Sample rates must match to add segments.")
+        if len(self._samples) != len(other._samples):
+            raise TypeError("Segment lengths must match to add segments.")
+        samples = self.samples + other.samples
+        return type(self)(samples, sample_rate=self._sample_rate)
    def __str__(self):
        """Return human-readable representation of segment."""
        return ("%s: num_samples=%d, sample_rate=%d, duration=%.2fsec, "
@@ -64,69 +91,6 @@ class AudioSegment(object):
        samples, sample_rate = soundfile.read(file, dtype='float32')
        return cls(samples, sample_rate)
-    @classmethod
-    def slice_from_file(cls, fname, start=None, end=None):
-        """ 
-        Loads a small section of an audio without having to load
-        the entire file into the memory which can be incredibly wasteful.
-        :param fname: input audio file name
-        :type fname: bsaestring
-        :param start: start time in seconds (supported granularity is ms)
-            If start is negative, it wraps around from the end. If not
-            provided, this function reads from the very beginning.
-        :type start: float
-        :param end: start time in seconds (supported granularity is ms)
-            If end is negative, it wraps around from the end. If not
-            provided, the default behvaior is to read to the end of the
-            file.
-        :type end: float
-        :return:the specified slice of input audio in the audio.AudioSegment
-            format.
-        """
-        sndfile = soundfile.SoundFile(fname)
-        sample_rate = sndfile.samplerate
-        if sndfile.channels != 1:
-            raise TypeError("{} has more than 1 channel.".format(fname))
-        duration = float(len(sndfile)) / sample_rate
-        if start is None:
-            start = 0.0
-        if end is None:
-            end = duration
-        if start < 0.0:
-            start += duration
-        if end < 0.0:
-            end += duration
-        if start < 0.0:
-            raise IndexError("The slice start position ({} s) is out of "
-                             "bounds. Filename: {}".format(start, fname))
-        if end < 0.0:
-            raise IndexError("The slice end position ({} s) is out of bounds "
-                             "Filename: {}".format(end, fname))
-        if start > end:
-            raise IndexError("The slice start position ({} s) is later than "
-                             "the slice end position ({} s)."
-                             .format(start, end))
-        if end > duration:
-            raise ValueError("The slice end time ({} s) is out of "
-                             "bounds (> {} s) Filename: {}"
-                             .format(end, duration, fname))
-        start_frame = int(start * sample_rate)
-        end_frame = int(end * sample_rate)
-        sndfile.seek(start_frame)
-        data = sndfile.read(frames=end_frame - start_frame, dtype='float32')
-        return cls(data, sample_rate)
    @classmethod
    def from_bytes(cls, bytes):
        """Create audio segment from a byte string containing audio samples.
@@ -140,43 +104,30 @@ class AudioSegment(object):
            io.BytesIO(bytes), dtype='float32')
        return cls(samples, sample_rate)
-    @classmethod
+    def concatenate(self, *segments):
-    def make_silence(cls, duration, sample_rate):
-        """Creates a silent audio segment of the given duration and
-        sample rate.
-        :param duration: length of silence in seconds
-        :type duration: scalar
-        :param sample_rate: sample rate
-        :type sample_rate: scalar
-        :returns: silence of the given duration
-        :rtype: AudioSegment
-        """
-        samples = np.zeros(int(float(duration) * sample_rate))
-        return cls(samples, sample_rate)
-    @classmethod
-    def concatenate(cls, *segments):
        """Concatenate an arbitrary number of audio segments together.
-        :param *segments: input audio segments
+        :param *segments: Input audio segments
-        :type *segments: [AudioSegment]
+        :type *segments: AudioSegment
+        :return: Audio segment instance.
+        :rtype: AudioSegment
+        :raises ValueError: If the number of segments is zero, or if the
+                            sample rates of the segments do not match.
+        :raises TypeError: If any segment is not of the same type as this instance.
        """
        # Perform basic sanity-checks.
-        N = len(segments)
+        if len(segments) == 0:
-        if N == 0:
            raise ValueError("No audio segments are given to concatenate.")
        sample_rate = segments[0]._sample_rate
-        for segment in segments:
+        for seg in segments:
-            if sample_rate != segment._sample_rate:
+            if sample_rate != seg._sample_rate:
                raise ValueError("Can't concatenate segments with "
                                 "different sample rates")
-            if type(segment) is not cls:
+            if type(seg) is not type(self):
                raise TypeError("Only audio segments of the same type "
                                "instance can be concatenated.")
        samples = np.concatenate([seg.samples for seg in segments])
-        return cls(samples, sample_rate)
+        return type(self)(samples, sample_rate)
    def to_wav_file(self, filepath, dtype='float32'):
        """Save audio segment to disk as wav file.
@@ -203,6 +154,65 @@ class AudioSegment(object):
            format='WAV',
            subtype=subtype_map[dtype])
+    def slice_from_file(self, file, start=None, end=None):
+        """Loads a small section of an audio without having to load
+        the entire file into the memory which can be incredibly wasteful.
+        :param file: Input audio filepath
+        :type file: basestring
+        :param start: Start time in seconds. If start is negative, it wraps
+                      around from the end. If not provided, this function 
+                      reads from the very beginning.
+        :type start: float
+        :param end: End time in seconds. If end is negative, it wraps around
+                    from the end. If not provided, the default behavior is
+                    to read to the end of the file.
+        :type end: float
+        :return: The specified slice of input audio in the audio.AudioSegment format.
+        :rtype: AudioSegment
+        :raises ValueError: If a slice position is invalid, or if the time is out of bounds.
+        """
+        sndfile = soundfile.SoundFile(file)
+        sample_rate = sndfile.samplerate
+        duration = float(len(sndfile)) / sample_rate
+        start = 0. if start is None else start
+        end = 0. if end is None else end
+        if start < 0.0:
+            start += duration
+        if end < 0.0:
+            end += duration
+        if start < 0.0:
+            raise ValueError("The slice start position (%f s) is out of "
+                             "bounds. Filename: %s" % (start, file))
+        if end < 0.0:
+            raise ValueError("The slice end position (%f s) is out of bounds "
+                             "Filename: %s" % (end, file))
+        if start > end:
+            raise ValueError("The slice start position (%f s) is later than "
+                             "the slice end position (%f s)." % (start, end))
+        if end > duration:
+            raise ValueError("The slice end time (%f s) is out of bounds "
+                             "(> %f s) Filename: %s" % (end, duration, file))
+        start_frame = int(start * sample_rate)
+        end_frame = int(end * sample_rate)
+        sndfile.seek(start_frame)
+        data = sndfile.read(frames=end_frame - start_frame, dtype='float32')
+        return type(self)(data, sample_rate)
+    def make_silence(self, duration, sample_rate):
+        """Creates a silent audio segment of the given duration and
+        sample rate.
+        :param duration: Length of silence in seconds
+        :type duration: float
+        :param sample_rate: Sample rate
+        :type sample_rate: float
+        :return: Silence of the given duration
+        :rtype: AudioSegment
+        """
+        samples = np.zeros(int(duration * sample_rate))
+        return type(self)(samples, sample_rate)
    def to_bytes(self, dtype='float32'):
        """Create a byte string containing the audio content.
@@ -247,52 +257,49 @@ class AudioSegment(object):
        self._samples = np.interp(new_indices, old_indices, self._samples)
    def normalize(self, target_db=-20, max_gain_db=300.0):
-        """Normalize audio to desired RMS value in decibels.
+        """Normalize audio to a desired RMS value in decibels.
        Note that this is an in-place transformation.
-        :param target_db: Target RMS value in decibels.This value 
+        :param target_db: Target RMS value in decibels. This value should
-            should be less than 0.0 as 0.0 is full-scale audio.
+                          be less than 0.0 as 0.0 is full-scale audio.
-        :type target_db: float, optional
+        :type target_db: float
-        :param max_gain_db: Max amount of gain in dB that can be applied
+        :param max_gain_db: Max amount of gain in dB that can be applied for
-            for normalization.  This is to prevent nans when attempting
+                            normalization. This is to prevent nans when attempting
                            to normalize a signal consisting of all zeros.
-        :type max_gain_db: float, optional
+        :type max_gain_db: float
+        :raises ValueError: If the required gain to normalize the segment to
-        :raises NormalizationWarning: if the required gain to normalize the
+                            the target_db value exceeds max_gain_db.
-            segment to the target_db value exceeds max_gain_db.
        """
        gain = target_db - self.rms_db
        if gain > max_gain_db:
            raise ValueError(
-                "Unable to normalize segment to {} dB because it has an RMS "
+                "Unable to normalize segment to %f dB because it has an RMS "
-                "value of {} dB and the difference exceeds max_gain_db ({} dB)"
+                "value of %f dB and the difference exceeds max_gain_db (%f dB)"
-                .format(target_db, self.rms_db, max_gain_db))
+                % (target_db, self.rms_db, max_gain_db))
-        gain = min(max_gain_db, target_db - self.rms_db)
+        self.apply_gain(min(max_gain_db, target_db - self.rms_db))
-        self.apply_gain(gain)
    def normalize_online_bayesian(self,
                                  target_db,
                                  prior_db,
                                  prior_samples,
                                  startup_delay=0.0):
-        """
+        """Normalize audio using a production-compatible online/causal algorithm.
-        Normalize audio using a production-compatible online/causal algorithm.
+        This uses an exponential likelihood and gamma prior to make online estimates
-        This uses an exponential likelihood and gamma prior to make
+        of the RMS even when there are very few samples.
-        online estimates of the RMS even when there are very few samples.
        Note that this is an in-place transformation.
        :param target_db: Target RMS value in decibels
-        :type target_bd: scalar
+        :type target_db: float
        :param prior_db: Prior RMS estimate in decibels
-        :type prior_db: scalar
+        :type prior_db: float
        :param prior_samples: Prior strength in number of samples
-        :type prior_samples: scalar
+        :type prior_samples: float
-        :param startup_delay: Default: 0.0 s. If provided, this
+        :param startup_delay: Default 0.0 s. If provided, this function will accrue
-            function will accrue statistics for the first startup_delay
+                              statistics for the first startup_delay seconds before
-            seconds before applying online normalization.
+                              applying online normalization.
-        :type startup_delay: scalar
+        :type startup_delay: float
        """
        # Estimate total RMS online
        startup_sample_idx = min(self.num_samples - 1,
@@ -309,52 +316,18 @@ class AudioSegment(object):
        mean_squared_estimate = ((cumsum_of_squares + prior_sum_of_squares) /
                                 (sample_count + prior_samples))
        rms_estimate_db = 10 * np.log10(mean_squared_estimate)
        # Compute required time-varying gain
        gain_db = target_db - rms_estimate_db
-        # Apply gain to new segment
-        self.apply_gain(gain_db)
-    def normalize_ewma(self,
-                       target_db,
-                       decay_rate,
-                       startup_delay,
-                       rms_eps=1e-6,
-                       max_gain_db=300.0):
-        startup_sample_idx = min(self.num_samples - 1,
-                                 int(self.sample_rate * startup_delay))
-        mean_sq = self.samples**2
-        if startup_sample_idx > 0:
-            mean_sq[:startup_sample_idx] = \
-                np.sum(mean_sq[:startup_sample_idx]) / startup_sample_idx
-        idx_start = max(0, startup_sample_idx - 1)
-        initial_condition = mean_sq[idx_start] * decay_rate
-        mean_sq[idx_start:] = lfilter(
-            [1.0 - decay_rate], [1.0, -decay_rate],
-            mean_sq[idx_start:],
-            axis=0,
-            zi=[initial_condition])[0]
-        rms_estimate_db = 10.0 * np.log10(mean_sq + rms_eps)
-        gain_db = target_db - rms_estimate_db
-        if np.any(gain_db > max_gain_db):
-            warnings.warn(
-                "Unable to normalize segment to {} dB because it has an RMS "
-                "value of {} dB and the difference exceeds max_gain_db ({} dB)"
-                .format(target_db, self.rms_db, max_gain_db),
-                NormalizationWarning)
-            gain_db = np.minimum(gain_db, max_gain_db)
        self.apply_gain(gain_db)
    def resample(self, target_sample_rate, quality='sinc_medium'):
-        """Resample audio and return new AudioSegment.
+        """Resample audio segment. This resamples the audio to a new 
-        This resamples the audio to a new sample rate and returns a brand
+        sample rate.
-        new AudioSegment.  The existing AudioSegment is unchanged.
        Note that this is an in-place transformation.
-        :param new_sample_rate: target sample rate
+        :param target_sample_rate: Target sample rate
-        :type new_sample_rate: scalar
+        :type target_sample_rate: int
        :param quality: One of {'sinc_fastest', 'sinc_medium', 'sinc_best'}.
                        Sets resampling speed/quality tradeoff.
                        See http://www.mega-nerd.com/SRC/api_misc.html#Converters
@@ -364,33 +337,33 @@ class AudioSegment(object):
        new_samples = scikits.samplerate.resample(
            self._samples, r=resample_ratio, type=quality)
        self._samples = new_samples
-        self._sample_rate = new_sample_rate
+        self._sample_rate = target_sample_rate
    def pad_silence(self, duration, sides='both'):
        """Pads this audio sample with a period of silence.
        Note that this is an in-place transformation.
-        :param duration: length of silence in seconds to pad
+        :param duration: Length of silence in seconds to pad
        :type duration: float
-        :param sides:
+        :param sides: Position for padding
                     'beginning' - adds silence in the beginning
                     'end' - adds silence in the end
                     'both' - adds silence in both the beginning and the end.
-        :type sides: basestring
+        :type sides: str
+        :raises ValueError: If the value of sides is not supported.
        """
        if duration == 0.0:
            return self
-        cls = type(self)
+        silence = self.make_silence(duration, self._sample_rate)
-        silence = cls.make_silence(duration, self._sample_rate)
        if sides == "beginning":
-            padded = cls.concatenate(silence, self)
+            padded = self.concatenate(silence, self)
        elif sides == "end":
-            padded = cls.concatenate(self, silence)
+            padded = self.concatenate(self, silence)
        elif sides == "both":
-            padded = cls.concatenate(silence, self, silence)
+            padded = self.concatenate(silence, self, silence)
        else:
-            raise ValueError("Unknown value for the kwarg 'sides'")
+            raise ValueError("Unknown value for the kwarg %s" % sides)
        self._samples = padded._samples
        self._sample_rate = padded._sample_rate
@@ -399,87 +372,82 @@ class AudioSegment(object):
        :param start_sec: Beginning of subsegment in seconds,
                          (beginning of segment if None).
-        :type start_sec:  scalar
+        :type start_sec: float
        :param end_sec: End of subsegment in seconds,
                        (end of segment if None).
-        :type end_sec: scalar
+        :type end_sec: float
+        :return: New AudioSegment containing specified subsegment.
-        :return: New AudioSegment containing specified
+        :rtype: AudioSegment
-            subsegment.
-        :trype: AudioSegment
        """
-        # Default boundaries
+        start_sec = 0.0 if start_sec is None else start_sec
-        if start_sec is None:
+        end_sec = self.duration if end_sec is None else end_sec
-            start_sec = 0.0
-        if end_sec is None:
-            end_sec = self.duration
        # negative boundaries are relative to end of segment
        if start_sec < 0.0:
            start_sec = self.duration + start_sec
        if end_sec < 0.0:
            end_sec = self.duration + end_sec
        start_sample = int(round(start_sec * self._sample_rate))
        end_sample = int(round(end_sec * self._sample_rate))
        samples = self._samples[start_sample:end_sample]
        return type(self)(samples, sample_rate=self._sample_rate)
    def random_subsegment(self, subsegment_length, rng=None):
-        """
+        """Return a random subsegment of a specified length in seconds.
-        Return a random subsegment of a specified length in seconds.
        :param subsegment_length: Subsegment length in seconds.
-        :type subsegment_length: scalar
+        :type subsegment_length: float
        :param rng: Random number generator state
-        :type rng: random.Random [optional]
+        :type rng: random.Random
+        :return: New AudioSegment containing random subsegment
+                 of original segment
-        :return:clip (SpeechDLSegment): New SpeechDLSegmen containing random
+        :rtype: AudioSegment
-            subsegment of original segment.
+        :raises ValueError: If the length of the subsegment is greater than
+                            the original segment.
        """
-        if rng is None:
+        rng = random.Random() if rng is None else rng
-            rng = random.Random()
        if subsegment_length > self.duration:
            raise ValueError("Length of subsegment must not be greater "
                             "than original segment.")
        start_time = rng.uniform(0.0, self.duration - subsegment_length)
        return self.subsegment(start_time, start_time + subsegment_length)
-    def convolve(self, ir, allow_resampling=False):
+    def convolve(self, impulse_segment, allow_resample=False):
        """Convolve this audio segment with the given filter.
-        :param ir: impulse response
+        Note that this is an in-place transformation.
-        :type ir: AudioSegment
-        :param allow_resampling: indicates whether resampling is allowed
-                when the ir has a different sample rate from this signal.
-        :type allow_resampling: boolean
-        """
-        if allow_resampling and self.sample_rate != ir.sample_rate:
-            ir = ir.resample(self.sample_rate)
-        if self.sample_rate != ir.sample_rate:
-            raise ValueError("Impulse response sample rate ({}Hz) is "
-                             "equal to base signal sample rate ({}Hz)."
-                             .format(ir.sample_rate, self.sample_rate))
-        samples = signal.fftconvolve(self.samples, ir.samples, "full")
+        :param impulse_segment: Impulse response segments.
+        :type impulse_segment: AudioSegment
+        :param allow_resample: indicates whether resampling is allowed when
+                                 the impulse_segment has a different sample 
+                                 rate from this signal.
+        :type allow_resample: boolean
+        :raises ValueError: If the sample rates of the two audio segments do
+                            not match and resampling is not allowed.
+        """
+        if allow_resample and self.sample_rate != impulse_segment.sample_rate:
+            impulse_segment = impulse_segment.resample(self.sample_rate)
+        if self.sample_rate != impulse_segment.sample_rate:
+            raise ValueError("Impulse segment's sample rate (%d Hz) is not"
+                             "equal to base signal sample rate (%d Hz)." %
+                             (impulse_segment.sample_rate, self.sample_rate))
+        samples = signal.fftconvolve(self.samples, impulse_segment.samples,
+                                     "full")
        self._samples = samples
-    def convolve_and_normalize(self, ir, allow_resample=False):
+    def convolve_and_normalize(self, impulse_segment, allow_resample=False):
        """Convolve and normalize the resulting audio segment so that it
        has the same average power as the input signal.
-        :param ir: impulse response
+        :param impulse_segment: Impulse response segments.
-        :type ir: AudioSegment
+        :type impulse_segment: AudioSegment
-        :param allow_resampling: indicates whether resampling is allowed
+        :param allow_resample: indicates whether resampling is allowed when
-            when the ir has a different sample rate from this signal.
+                               the impulse_segment has a different sample rate from this signal.
-        :type allow_resampling: boolean
+        :type allow_resample: boolean
        """
-        self.convolve(ir, allow_resampling=allow_resampling)
+        target_db = self.rms_db
-        self.normalize(target_db=self.rms_db)
+        self.convolve(impulse_segment, allow_resample=allow_resample)
+        self.normalize(target_db)
    def add_noise(self,
                  noise,
@@ -492,36 +460,33 @@ class AudioSegment(object):
        of matching length is sampled from it and used instead.
        :param noise: Noise signal to add.
-        :type noise: SpeechDLSegment
+        :type noise: AudioSegment
        :param snr_dB: Signal-to-Noise Ratio, in decibels.
-        :type snr_dB: scalar
+        :type snr_dB: float
-        :param allow_downsampling: whether to allow the noise signal
+        :param allow_downsampling: whether to allow the noise signal to be downsampled
-            to be downsampled to match the base signal sample rate.
+                                   to match the base signal sample rate.
        :type allow_downsampling: boolean
-        :param max_gain_db: Maximum amount of gain to apply to noise
+        :param max_gain_db: Maximum amount of gain to apply to noise signal before
-            signal before adding it in.  This is to prevent attempting
+                            adding it in. This is to prevent attempting to apply infinite
-            to apply infinite gain to a zero signal.
+                            gain to a zero signal.
-        :type max_gain_db: scalar
+        :type max_gain_db: float
        :param rng: Random number generator state.
        :type rng: random.Random
+        :raises ValueError: If the sample rate does not match between the two audio segments
-        Returns:
+                            and resample is not allowed, or if the duration of noise segments
-            SpeechDLSegment: signal with noise added.
+                            is shorter than original audio segments.
        """
-        if rng is None:
+        rng = random.Random() if rng is None else rng
-            rng = random.Random()
        if allow_downsampling and noise.sample_rate > self.sample_rate:
            noise = noise.resample(self.sample_rate)
        if noise.sample_rate != self.sample_rate:
-            raise ValueError("Noise sample rate ({}Hz) is not equal to "
+            raise ValueError("Noise sample rate (%d Hz) is not equal to "
-                             "base signal sample rate ({}Hz)."
+                             "base signal sample rate (%d Hz)." %
-                             .format(noise.sample_rate, self.sample_rate))
+                             (noise.sample_rate, self.sample_rate))
        if noise.duration < self.duration:
-            raise ValueError("Noise signal ({} sec) must be at "
+            raise ValueError("Noise signal (%f sec) must be at "
-                             "least as long as base signal ({} sec)."
+                             "least as long as base signal (%f sec)." %
-                             .format(noise.duration, self.duration))
+                             (noise.duration, self.duration))
        noise_gain_db = self.rms_db - noise.rms_db - snr_dB
        noise_gain_db = min(max_gain_db, noise_gain_db)
        noise_subsegment = noise.random_subsegment(self.duration, rng=rng)
@@ -529,6 +494,12 @@ class AudioSegment(object):
        self._samples = output._samples
        self._sample_rate = output._sample_rate
+    def tranform_noise(self, noise_subsegment, noise_gain_db):
+        """Return a copy of the noise subsegment scaled by noise_gain_db (in dB).
+        """
+        return type(self)(noise_subsegment._samples * (10.**(
+            noise_gain_db / 20.)), noise_subsegment._sample_rate)
    @property
    def samples(self):
        """Return audio samples.
@@ -618,9 +589,3 @@ class AudioSegment(object):
        else:
            raise TypeError("Unsupported sample type: %s." % samples.dtype)
        return output_samples.astype(dtype)
-    def tranform_noise(self, noise_subsegment, noise_gain_db):
-        """ tranform noise file
-        """
-        return type(self)(noise_subsegment._samples * (10.**(
-            noise_gain_db / 20.)), noise_subsegment._sample_rate)
--- a/deep_speech_2/data_utils/augmentor/audio_database.py
+++ b/deep_speech_2/data_utils/augmentor/audio_database.py
-from __future__ import print_function
-from collections import defaultdict
-import bisect
-import logging
-import numpy as np
-import os
-import random
-import sys
-UNK_TAG = "<UNK>"
-def stream_audio_index(fname, UNK=UNK_TAG):
-    """Reads an audio index file and emits one record in the index at a time.
-    :param fname: audio index path
-    :type fname: basestring
-    :param UNK: UNK token to denote that certain audios are not tagged.
-    :type UNK: basesring
-    Yields:
-        idx, duration, size, relpath, tags (int, float, int, str, list(str)):
-            audio file id, length of the audio in seconds, size in byte,
-            relative path w.r.t. to the root noise directory, list of tags
-    """
-    with open(fname) as audio_index_file:
-        for i, line in enumerate(audio_index_file):
-            tok = line.strip().split("\t")
-            assert len(tok) >= 4, \
-                "Invalid line at line {} in file {}".format(
-                    i + 1, audio_index_file)
-            idx = int(tok[0])
-            duration = float(tok[1])
-            # Sometimes, the duration can round down to 0.0
-            assert duration >= 0.0, \
-                "Invalid duration at line {} in file {}".format(
-                    i + 1, audio_index_file)
-            size = int(tok[2])
-            assert size > 0, \
-                "Invalid size at line {} in file {}".format(
-                    i + 1, audio_index_file)
-            relpath = tok[3]
-            if len(tok) == 4:
-                tags = [UNK_TAG]
-            else:
-                tags = tok[4:]
-            yield idx, duration, size, relpath, tags
-def truncate_float(val, ndigits=6):
-    """ Truncates a floating-point value to have the desired number of
-    digits after the decimal point.
-    :param val: input value.
-    :type val: float
-    :parma ndigits: desired number of digits.
-    :type ndigits: int
-    :return: truncated value
-    :rtype: float
-    """
-    p = 10.0**ndigits
-    return float(int(val * p)) / p
-def print_audio_index(idx, duration, size, relpath, tags, file=sys.stdout):
-    """Prints an audio record to the index file.
-    :param idx: Audio file id.
-    :type idx: int
-    :param duration: length of the audio in seconds
-    :type duration: float
-    :param size: size of the file in bytes
-    :type size: int
-    :param relpath: relative path w.r.t. to the root noise directory.
-    :type relpath:  basestring
-    :parma tags: list of tags
-    :parma tags: list(str)
-    :parma file: file to which we want to write an audio record.
-    :type file: sys.stdout
-    """
-    file.write("{}\t{:.6f}\t{}\t{}"
-               .format(idx, truncate_float(duration, ndigits=6), size, relpath))
-    for tag in tags:
-        file.write("\t{}".format(tag))
-    file.write("\n")
-class AudioIndex(object):
-    """ In-memory index of audio files that do not have annotations.
-    This supports duration-based sampling and sampling from a target
-    distribution.
-    Each line in the index file consists of the following fields:
-        (id (int), duration (float), size (int), relative path (str),
-         list of tags ([str]))
-    """
-    def __init__(self):
-        self.audio_dir = None
-        self.index_fname = None
-        self.tags = None
-        self.bin_size = 2.0
-        self.clear()
-    def clear(self):
-        """ Clears the index
-        Returns:
-            None
-        """
-        self.idx_to_record = {}
-        # The list of indices correspond to audio files whose duration is
-        # greater than or equal to the key.
-        self.duration_to_id_set = {}
-        self.duration_to_id_set_per_tag = defaultdict(lambda: {})
-        self.duration_to_list = defaultdict(lambda: [])
-        self.duration_to_list_per_tag = defaultdict(
-            lambda: defaultdict(lambda: []))
-        self.tag_to_id_set = defaultdict(lambda: set())
-        self.shared_duration_bins = []
-        self.id_set_complete = set()
-        self.id_set = set()
-        self.duration_bins = []
-    def has_audio(self, distr=None):
-        """
-        :param distr: The target distribution of audio tags that we want to
-            match. If this is not supplied, the function simply checks that
-            there are some audio files.
-        :parma distr: dict
-        :return: True if there are audio files.
-        :rtype: boolean
-        """
-        if distr is None:
-            return len(self.id_set) > 0
-        else:
-            for tag in distr:
-                if tag not in self.duration_to_list_per_tag:
-                    return False
-            return True
-    def _load_all_records_from_disk(self, audio_dir, idx_fname, bin_size):
-        """Loads all audio records from the disk into memory and groups them
-        into chunks based on their duration and the bin_size granalarity.
-        Once all the records are read, indices are built from these records
-        by another function so that the audio samples can be drawn efficiently.
-        Updates:
-            self.audio_dir (path): audio root directory
-            self.idx_fname (path): audio database index filename
-            self.bin_size (float): granularity of bins
-            self.idx_to_record (dict): maps from the audio id to
-                (duration, file_size, relative_path, tags)
-            self.tag_to_id_set (dict): maps from the tag to
-                the set of id's of audios that have this tag.
-            self.id_set_complete (set): set of all audio id's in the index file
-            self.min_duration (float): minimum audio duration observed in the
-                index file
-            self.duration_bins (list): the lower bounds on the duration of
-                audio files falling in each bin
-            self.duration_to_id_set (dict): contains (k, v) where v is the set
-                of id's of audios whose lengths are longer than or equal to k.
-                (e.g. k is the duration lower bound of this bin).
-            self.duration_to_id_set_per_tag (dict): Something like above but
-                has a finer granularity mapping from the tag to
-                duration_to_id_set.
-            self.shared_duration_bins (list): list of sets where each set
-                contains duration lower bounds whose audio id sets are the
-                same. The rationale for having this is that there are a few
-                but extremely long audio files which lead to a lot of bins.
-                When the id sets do not change across various minimum duration
-                boundaries, we
-                cluster these together and make them point to the same id set
-                reference.
-        :return: whether the records were read from the disk. The assumption is
-            that the audio index file on disk and the actual audio files
-            are constructed once and never change during training. We only
-            re-read when either the directory or the index file path change.
-        """
-        if self.audio_dir == audio_dir and self.idx_fname == idx_fname and \
-           self.bin_size == bin_size:
-            # The audio directory and/or the list of audio files
-            # haven't changed. No need to load the list again.
-            return False
-        # Remember where the audio index is most recently read from.
-        self.audio_dir = audio_dir
-        self.idx_fname = idx_fname
-        self.bin_size = bin_size
-        # Read in the idx and compute the number of bins necessary
-        self.clear()
-        rank = []
-        min_duration = float('inf')
-        max_duration = float('-inf')
-        for idx, duration, file_size, relpath, tags in \
-                stream_audio_index(idx_fname):
-            self.idx_to_record[idx] = (duration, file_size, relpath, tags)
-            max_duration = max(max_duration, duration)
-            min_duration = min(min_duration, duration)
-            rank.append((duration, idx))
-            for tag in tags:
-                self.tag_to_id_set[tag].add(idx)
-        if len(rank) == 0:
-            # file is empty
-            raise IOError("Index file {} is empty".format(idx_fname))
-        for tag in self.tag_to_id_set:
-            self.id_set_complete |= self.tag_to_id_set[tag]
-        dur = min_duration
-        self.min_duration = min_duration
-        while dur < max_duration + bin_size:
-            self.duration_bins.append(dur)
-            dur += bin_size
-        # Sort in decreasing order of duration and populate
-        # the cumulative indices lists.
-        rank.sort(reverse=True)
-        # These are indices for `rank` and used to keep track of whether
-        # there are new records to add in the current bin.
-        last = 0
-        cur = 0
-        # The set of audios falling in the previous bin; in the case,
-        # where we don't find new audios for the current bin, we store
-        # the reference to the last set so as to conserve memory.
-        # This is not such a big problem if the audio duration is
-        # bounded by a small number like 30 seconds and the
-        # bin size is big enough. But, for raw freesound audios,
-        # some audios can be as long as a few hours!
-        last_audio_set = set()
-        # The same but for each tag so that we can pick audios based on
-        # tags and also some user-specified tag distribution.
-        last_audio_set_per_tag = defaultdict(lambda: set())
-        # Set of lists of bins sharing the same audio sets.
-        shared = set()
-        for i in range(len(self.duration_bins) - 1, -1, -1):
-            lower_bound = self.duration_bins[i]
-            new_audio_idxs = set()
-            new_audio_idxs_per_tag = defaultdict(lambda: set())
-            while cur < len(rank) and rank[cur][0] >= lower_bound:
-                idx = rank[cur][1]
-                tags = self.idx_to_record[idx][3]
-                new_audio_idxs.add(idx)
-                for tag in tags:
-                    new_audio_idxs_per_tag[tag].add(idx)
-                cur += 1
-            # This makes certain that the same list is shared across
-            # different bins if no new indices are added.
-            if cur == last:
-                shared.add(lower_bound)
-            else:
-                last_audio_set = last_audio_set | new_audio_idxs
-                for tag in new_audio_idxs_per_tag:
-                    last_audio_set_per_tag[tag] = \
-                        last_audio_set_per_tag[tag] | \
-                        new_audio_idxs_per_tag[tag]
-                if len(shared) > 0:
-                    self.shared_duration_bins.append(shared)
-                shared = set([lower_bound])
-                ### last_audio_set = set()  should set blank
-            last = cur
-            self.duration_to_id_set[lower_bound] = last_audio_set
-            for tag in last_audio_set_per_tag:
-                self.duration_to_id_set_per_tag[lower_bound][tag] = \
-                    last_audio_set_per_tag[tag]
-        # The last `shared` record isn't added to the `shared_duration_bins`.
-        self.shared_duration_bins.append(shared)
-        # We make sure that the while loop above has exhausted through the
-        # `rank` list by checking if the `cur`rent index in `rank` equals
-        # the length of the array, which is the halting condition.
-        assert cur == len(rank)
-        return True
-    def _build_index_from_records(self, tag_list):
-        """ Uses the in-memory records read from the index file to build
-        an in-memory index restricted to the given tag list.
-        :param tag_list: List of tags we are interested in sampling from.
-        :type tag_list: list(str)
-        Updates:
-            self.id_set (set): the set of all audio id's that can be sampled.
-            self.duration_to_list (dict): maps from the duration lower bound
-                to the id's of audios longer than this duration.
-            self.duration_to_list_per_tag (dict): maps from the tag to
-                the same structure as self.duration_to_list. This is to support
-                sampling from a target noise distribution.
-        :return: whether the index was built from scratch
-        """
-        if self.tags == tag_list:
-            return False
-        self.tags = tag_list
-        if len(tag_list) == 0:
-            self.id_set = self.id_set_complete
-        else:
-            self.id_set = set()
-            for tag in tag_list:
-                self.id_set |= self.tag_to_id_set[tag]
-        # Next, we need to take a subset of the audio files
-        for shared in self.shared_duration_bins:
-            # All bins in `shared' have the same index lists
-            # so we can intersect once and set all of them to this list.
-            lb = list(shared)[0]
-            intersected = list(self.id_set & self.duration_to_id_set[lb])
-            duration_to_id_set = self.duration_to_id_set_per_tag[lb]
-            intersected_per_tag = {
-                tag: self.tag_to_id_set[tag] & duration_to_id_set[tag]
-                for tag in duration_to_id_set
-            }
-            for bin_key in shared:
-                self.duration_to_list[bin_key] = intersected
-                for tag in intersected_per_tag:
-                    self.duration_to_list_per_tag[tag][bin_key] = \
-                        intersected_per_tag[tag]
-        assert len(self.duration_to_list) == len(self.duration_to_id_set)
-        return True
-    def refresh_records_from_index_file(self,
-                                        audio_dir,
-                                        idx_fname,
-                                        tag_list,
-                                        bin_size=2.0):
-        """ Loads the index file and populates the records
-        for building the internal index.
-        If the audio directory or index file name has changed, the whole index
-        is reloaded from scratch. If only the tag_list is changed, then the
-        desired index is built from the complete, in-memory record.
-        :param audio_dir: audio directory
-        :type audio_dir: basestring
-        :param idx_fname: audio index file name
-        :type idex_fname: basestring
-        :param tag_list: list of tags we are interested in loading;
-            if empty, we load all.
-        :type tag_list: list
-        :param bin_size: optional argument for controlling the granularity
-            of duration bins
-        :type bin_size: float
-        """
-        if tag_list is None:
-            tag_list = []
-        reloaded_records = self._load_all_records_from_disk(audio_dir,
-                                                            idx_fname, bin_size)
-        if reloaded_records or self.tags != tag_list:
-            self._build_index_from_records(tag_list)
-            logger.info('loaded {} audio files from {}'
-                        .format(len(self.id_set), idx_fname))
-    def sample_audio(self, duration, rng=None, distr=None):
-        """ Uniformly draws an audio record of at least the desired duration
-        :param duration: minimum desired audio duration
-        :type duration: float
-        :param rng: random number generator
-        :type rng: random.Random
-        :param distr: target distribution of audio tags. If not provided,
-        :type distr: dict
-        all audio files are sampled uniformly at random.
-        :returns: success, (duration, file_size, path)
-        """
-        if duration < 0.0:
-            duration = self.min_duration
-        i = bisect.bisect_left(self.duration_bins, duration)
-        if i == len(self.duration_bins):
-            return False, None
-        bin_key = self.duration_bins[i]
-        if distr is None:
-            indices = self.duration_to_list[bin_key]
-        else:
-            # If a desired audio distribution is given, we sample from it.
-            if rng is None:
-                rng = random.Random()
-            nprng = np.random.RandomState(rng.getrandbits(32))
-            prob_masses = distr.values()
-            prob_masses /= np.sum(prob_masses)
-            tag = nprng.choice(distr.keys(), p=prob_masses)
-            indices = self.duration_to_list_per_tag[tag][bin_key]
-        if len(indices) == 0:
-            return False, None
-        else:
-            if rng is None:
-                rng = random.Random()
-            # duration, file size and relative path from root
-            s = self.idx_to_record[rng.sample(indices, 1)[0]]
-            s = (s[0], s[1], os.path.join(self.audio_dir, s[2]))
-            return True, s
--- a/deep_speech_2/data_utils/augmentor/augmentation.py
+++ b/deep_speech_2/data_utils/augmentor/augmentation.py
@@ -6,11 +6,6 @@ from __future__ import print_function
 import json
 import random
 from data_utils.augmentor.volume_perturb import VolumePerturbAugmentor
-from data_utils.augmentor.resamler import ResamplerAugmentor
-from data_utils.augmentor.speed_perturb import SpeedPerturbatioAugmentor
-from data_utils.augmentor.online_bayesian_normalization import OnlineBayesianNormalizationAugmentor
-from data_utils.augmentor.Impulse_response import ImpulseResponseAugmentor
-from data_utils.augmentor.noise_speech import NoiseSpeechAugmentor
 class AugmentationPipeline(object):
@@ -81,15 +76,5 @@ class AugmentationPipeline(object):
        """Return an augmentation model by the type name, and pass in params."""
        if augmentor_type == "volume":
            return VolumePerturbAugmentor(self._rng, **params)
-        if augmentor_type == "resamle":
-            return ResamplerAugmentor(self._rng, **params)
-        if augmentor_type == "speed":
-            return SpeedPerturbatioAugmentor(self._rng, **params)
-        if augmentor_type == "online_bayesian_normalization":
-            return OnlineBayesianNormalizationAugmentor(self._rng, **params)
-        if augmentor_type == "Impulse_response":
-            return ImpulseResponseAugmentor(self._rng, **params)
-        if augmentor_type == "noise_speech":
-            return NoiseSpeechAugmentor(self._rng, **params)
        else:
            raise ValueError("Unknown augmentor type [%s]." % augmentor_type)
--- a/deep_speech_2/data_utils/augmentor/implus_response.py
+++ b/deep_speech_2/data_utils/augmentor/implus_response.py
-""" Impulse response"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from . import base
-from . import audio_database
-from data_utils.speech import SpeechSegment
-class ImpulseResponseAugmentor(base.AugmentorBase):
-    """ Instantiates an impulse response model
-    :param ir_dir: directory containing impulse responses
-    :type ir_dir: basestring
-    :param tags: optional parameter for specifying what
-            particular impulse responses to apply.
-    :type tags: list
-    :parm tag_distr: optional noise distribution
-    :type tag_distr: dict
-    """
-    def __init__(self, rng, ir_dir, index_file, tags=None, tag_distr=None):
-        # Define all required parameter maps here.
-        self.ir_dir = ir_dir
-        self.index_file = index_file
-        self.tags = tags
-        self.tag_distr = tag_distr
-        self.audio_index = audio_database.AudioIndex()
-        self.rng = rng
-    def _init_data(self):
-        """ Preloads stuff from disk in an attempt (e.g. list of files, etc)
-        to make later loading faster. If the data configuration remains the
-        same, this function does nothing.
-        """
-        self.audio_index.refresh_records_from_index_file(
-            self.ir_dir, self.index_file, self.tags)
-    def transform_audio(self, audio_segment):
-        """ Convolves the input audio with an impulse response.
-        :param audio_segment: input audio
-        :type audio_segment: AudioSegemnt
-        """
-        # This handles the cases where the data source or directories change.
-        self._init_data()
-        read_size = 0
-        tag_distr = self.tag_distr
-        if not self.audio_index.has_audio(tag_distr):
-            if tag_distr is None:
-                if not self.tags:
-                    raise RuntimeError("The ir index does not have audio "
-                                       "files to sample from.")
-                else:
-                    raise RuntimeError("The ir index does not have audio "
-                                       "files of the given tags to sample "
-                                       "from.")
-            else:
-                raise RuntimeError("The ir index does not have audio "
-                                   "files to match the target ir "
-                                   "distribution.")
-        else:
-            # Querying with a negative duration triggers the index to search
-            # from all impulse responses.
-            success, record = self.audio_index.sample_audio(
-                -1.0, rng=self.rng, distr=tag_distr)
-            if success is True:
-                _, read_size, ir_fname = record
-                ir_wav = SpeechSegment.from_file(ir_fname)
-                audio_segment.convolve(ir_wav, allow_resampling=True)
--- a/deep_speech_2/data_utils/augmentor/noise_speech.py
+++ b/deep_speech_2/data_utils/augmentor/noise_speech.py
-""" noise speech
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-import logging
-import numpy as np
-import os
-from collections import defaultdict
-from . import base
-from . import audio_database
-from data_utils.speech import SpeechSegment
-TURK = "turk"
-USE_AUDIO_DATABASE_SOURCES = frozenset(["freesound", "chime"])
-HALF_NOISE_LENGTH_MIN_THRESHOLD = 3.0
-FIND_NOISE_MAX_ATTEMPTS = 20
-logger = logging.getLogger(__name__)
-def get_first_smaller(items, value):
-    index = bisect.bisect_left(items, value) - 1
-    assert items[index] < value, \
-        'get_first_smaller failed! %d %d' % (items[index], value)
-    return items[index]
-def get_first_larger(items, value):
-    'Find leftmost value greater than value'
-    index = bisect.bisect_right(items, value)
-    assert index < len(items), \
-        "no noise bin exists for this audio length (%f)" % value
-    assert items[index] > value, \
-        'get_first_larger failed! %d %d' % (items[index], value)
-    return items[index]
-def _get_turk_noise_files(noise_dir, index_file):
-    """ Creates a map from duration => a list of noise filenames
-    :param noise_dir: Directory of noise files which contains
-        "noise-samples-list"
-    :type noise_dir: basestring
-    :param index_file: Noise list
-    :type index_file: basestring
-    returns:noise_files (defaultdict): A map of bins to noise files.
-        Each key is the duration, and the value is a list of noise
-        files binned to this duration. Each bin is 2 secs.
-    Note: noise-samples-list should contain one line per noise (wav) file
-        along with its duration in milliseconds
-    """
-    noise_files = defaultdict(list)
-    if not os.path.exists(index_file):
-        logger.error('No noise files were found at {}'.format(index_file))
-        return noise_files
-    num_noise_files = 0
-    rounded_durations = list(range(0, 65, 2))
-    with open(index_file, 'r') as fl:
-        for line in fl:
-            fname = os.path.join(noise_dir, line.strip().split()[0])
-            duration = float(line.strip().split()[1]) / 1000
-            # bin the noise files into length bins rounded by 2 sec
-            bin_id = get_first_smaller(rounded_durations, duration)
-            noise_files[bin_id].append(fname)
-            num_noise_files += 1
-    logger.info('Loaded {} turk noise files'.format(num_noise_files))
-    return noise_files
-class NoiseSpeechAugmentor(base.AugmentorBase):
-    """ Noise addition block
-    :param snr_min: minimum signal-to-noise ratio
-    :type snr_min: float
-    :param snr_max: maximum signal-to-noise ratio
-    :type snr_max: float
-    :param noise_dir: root of where noise files are stored
-    :type noise_fir: basestring
-    :param index_file: index of noises of interest in noise_dir
-    :type index_file: basestring
-    :param source: select one from
-        - turk
-        - freesound
-        - chime
-        Note that this field is no longer required for the freesound
-        and chime
-    :type source: string
-    :param tags: optional parameter for specifying what
-        particular noises we want to add. See above for the available tags.
-    :type tags: list
-    :param tag_distr: optional noise distribution
-    :type tag_distr: dict
-    """
-    def __init__(self,
-                 rng,
-                 snr_min,
-                 snr_max,
-                 noise_dir,
-                 source,
-                 allow_downsampling=None,
-                 index_file=None,
-                 tags=None,
-                 tag_distr=None):
-        # Define all required parameter maps here.
-        self.rng = rng
-        self.snr_min = snr_min
-        self.snr_max = snr_max
-        self.noise_dir = noise_dir
-        self.source = source
-        self.allow_downsampling = allow_downsampling
-        self.index_file = index_file
-        self.tags = tags
-        self.tag_distr = tag_distr
-        # When new noise sources are added, make sure to define the
-        # associated bookkeeping variables here.
-        self.turk_noise_files = []
-        self.turk_noise_dir = None
-        self.audio_index = audio_database.AudioIndex()
-    def _init_data(self):
-        """ Preloads stuff from disk in an attempt (e.g. list of files, etc)
-        to make later loading faster. If the data configuration remains the
-        same, this function does nothing.
-        """
-        noise_dir = self.noise_dir
-        index_file = self.index_file
-        source = self.source
-        if not index_file:
-            if source == TURK:
-                index_file = os.path.join(noise_dir, 'noise-samples-list')
-                logger.debug("index_file not provided; " + "defaulting to " +
-                             index_file)
-            else:
-                if source != "":
-                    assert source in USE_AUDIO_DATABASE_SOURCES, \
-                        "{} not supported by audio_database".format(source)
-                index_file = os.path.join(noise_dir,
-                                          "audio_index_commercial.txt")
-                logger.debug("index_file not provided; " + "defaulting to " +
-                             index_file)
-        if source == TURK:
-            if self.turk_noise_dir != noise_dir:
-                self.turk_noise_dir = noise_dir
-                self.turk_noise_files = _get_turk_noise_files(noise_dir,
-                                                              index_file)
-        # elif source == TODO_SUPPORT_NON_AUDIO_DATABASE_BASED_SOURCES:
-        else:
-            if source != "":
-                assert source in USE_AUDIO_DATABASE_SOURCES, \
-                    "{} not supported by audio_database".format(source)
-            self.audio_index.refresh_records_from_index_file(
-                self.noise_dir, index_file, self.tags)
-    def transform_audio(self, audio_segment):
-        """Adds walla noise
-        :param audio_segment: Input audio
-        :type audio_segment: SpeechSegment
-        """
-        # This handles the cases where the data source or directories change.
-        self._init_data
-        source = self.source
-        allow_downsampling = self.allow_downsampling
-        if source == TURK:
-            self._add_turk_noise(audio_segment, self.rng, allow_downsampling)
-        # elif source == TODO_SUPPORT_NON_AUDIO_DATABASE_BASED_SOURCES:
-        else:
-            self._add_noise(audio_segment, self.rng, allow_downsampling)
-    def _sample_snr(self):
-        """ Returns a float sampled in [`self.snr_min`, `self.snr_max`]
-        if both `self.snr_min` and `self.snr_max` are non-zero.
-        """
-        snr_min = self.snr_min
-        snr_max = self.snr_max
-        sampled_snr = self.rng.uniform(snr_min, snr_max)
-        return sampled_snr
-    def _add_turk_noise(self, audio_segment, allow_downsampling):
-        """ Adds a turk noise to the input audio.
-        :param audio_segment: input audio
-        :type audio_segment: audiosegment
-        :param allow_downsampling: indicates whether downsampling
-            is allowed
-        :type allow_downsampling: boolean 
-        """
-        read_size = 0
-        if len(self.turk_noise_files) > 0:
-            snr = self._sample_snr(self.rng)
-            # Draw the noise file randomly from noise files that are
-            # slightly longer than the utterance
-            noise_bins = sorted(self.turk_noise_files.keys())
-            # note some bins can be empty, so we can't just round up
-            # to the nearest 2-sec interval
-            rounded_duration = get_first_larger(noise_bins,
-                                                audio_segment.duration)
-            noise_fname = \
-                self.rng.sample(self.turk_noise_files[rounded_duration], 1)[0]
-            noise = SpeechSegment.from_wav_file(noise_fname)
-            logger.debug('noise_fname {}'.format(noise_fname))
-            logger.debug('snr {}'.format(snr))
-            read_size = len(noise) * 2
-            # May throw exceptions, but this is caught by
-            # AudioFeaturizer.get_audio_files.
-            audio_segment.add_noise(
-                noise, snr, rng=self.rng, allow_downsampling=allow_downsampling)
-    def _add_noise(self, audio_segment, allow_downsampling):
-        """Mix a noise clip sampled from ``self.audio_index`` into a segment.
-        A noise file at least slightly longer than the segment is searched for,
-        relaxing the required duration on every failed attempt. The chosen
-        clip is padded or randomly cropped and then handed to
-        ``audio_segment.add_noise``. Nothing is returned; if no noise file is
-        found within FIND_NOISE_MAX_ATTEMPTS attempts the method logs the
-        failure and returns without calling ``add_noise``.
-        :param audio_segment: input audio
-        :type audio_segment: SpeechSegment
-        :param allow_downsampling: indicates whether downsampling
-            is allowed
-        :type allow_downsampling: boolean
-        :raise RuntimeError: If the noise index has no audio for the
-            configured tags / tag distribution.
-        """
-        tag_distr = self.tag_distr
-        if not self.audio_index.has_audio(tag_distr):
-            # Pick the message that best explains why nothing can be sampled.
-            if tag_distr is not None:
-                raise RuntimeError("The noise index does not have audio "
-                                   "files to match the target noise "
-                                   "distribution.")
-            if not self.tags:
-                raise RuntimeError("The noise index does not have audio "
-                                   "files to sample from.")
-            raise RuntimeError("The noise index does not have audio "
-                               "files of the given tags to sample "
-                               "from.")
-        # Compute audio segment related statistics
-        audio_duration = audio_segment.duration
-        # Sample relevant augmentation parameters.
-        snr = self._sample_snr(self.rng)
-        # Perhaps, we may not have a sufficiently long noise, so we need
-        # to search iteratively.
-        min_duration = audio_duration + 0.25
-        for _ in range(FIND_NOISE_MAX_ATTEMPTS):
-            logger.debug("attempting to find noise of length "
-                         "at least {}".format(min_duration))
-            success, record = self.audio_index.sample_audio(
-                min_duration, rng=self.rng, distr=tag_distr)
-            if success is True:
-                noise_duration, _read_size, noise_fname = record
-                # Assert after logging so we know
-                # what caused augmentation to fail.
-                logger.debug("noise_fname {}".format(noise_fname))
-                logger.debug("snr {}".format(snr))
-                assert noise_duration >= min_duration
-                break
-            # Decrease the desired minimum duration linearly.
-            # If the value becomes smaller than some threshold,
-            # we halve the value instead.
-            if min_duration > HALF_NOISE_LENGTH_MIN_THRESHOLD:
-                min_duration -= 2.0
-            else:
-                min_duration *= 0.5
-        else:
-            # The loop never hit ``break`` (this also covers a zero attempt
-            # budget, where ``success`` would otherwise be unbound).
-            logger.info("Failed to find a noise file")
-            return
-        diff_duration = audio_duration + 0.25 - noise_duration
-        if diff_duration >= 0.0:
-            # Here, the noise is no longer than the audio plus the 0.25 s
-            # margin, so we pad with silence on both sides.
-            noise = SpeechSegment.from_file(noise_fname)
-            noise = noise.pad_silence(diff_duration, sides="both")
-        else:
-            # The noise clip is more than 0.25 s longer than the audio
-            # segment here, so crop a window at a random start offset.
-            sample_rate = audio_segment.sample_rate
-            max_start_sample = int(noise_duration * sample_rate) - \
-                int(audio_duration * sample_rate) - \
-                int(0.02 * sample_rate)
-            # Convert the sampled start offset from samples to seconds using
-            # the segment's own sample rate (the original referenced an
-            # undefined name ``audio`` here).
-            start = float(self.rng.randint(0, max_start_sample)) / \
-                sample_rate
-            finish = min(start + audio_duration + 0.2, noise_duration)
-            noise = SpeechSegment.slice_from_file(noise_fname, start,
-                                                  finish)
-        if len(noise) < len(audio_segment):
-            # This is to ensure that the noise clip is at least as
-            # long as the audio segment.
-            num_samples_to_pad = len(audio_segment) - len(noise)
-            # The same amount of silence is added on both ends of the clip.
-            silence = SpeechSegment(
-                np.zeros(num_samples_to_pad), audio_segment.sample_rate)
-            noise = SpeechSegment.concatenate(silence, noise, silence)
-        audio_segment.add_noise(
-            noise, snr, rng=self.rng, allow_downsampling=allow_downsampling)
--- a/deep_speech_2/data_utils/augmentor/online_bayesian_normalization.py
+++ b/deep_speech_2/data_utils/augmentor/online_bayesian_normalization.py
-""" Online bayesian normalization
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from . import base
-class OnlineBayesianNormalizationAugmentor(base.AugmentorBase):
-    """Augmentor that applies online Bayesian normalization to audio.
-    Every normalization setting may be given either as a plain scalar or as
-    a callable mapping the iteration number to a scalar.
-    :param rng: Random generator object (stored on the instance; not used
-                by this class itself).
-    :type rng: random.Random
-    :param target_db: Target RMS value in decibels
-    :type target_db: scalar|func[int->scalar]
-    :param prior_db: Prior RMS estimate in decibels
-    :type prior_db: scalar|func[int->scalar]
-    :param prior_samples: Prior strength in number of samples
-    :type prior_samples: scalar|func[int->scalar]
-    :param startup_delay: Start-up delay in seconds during
-        which normalization statistics is accrued.
-    :type startup_delay: scalar|func[int->scalar]
-    """
-    def __init__(self,
-                 rng,
-                 target_db,
-                 prior_db,
-                 prior_samples,
-                 startup_delay=base.parse_parameter_from(0.0)):
-        self.target_db = target_db
-        self.prior_db = prior_db
-        self.prior_samples = prior_samples
-        self.startup_delay = startup_delay
-        self.rng = rng
-    @staticmethod
-    def _resolve(setting, iteration):
-        """Return the value of a setting for the given iteration.
-        Callables are evaluated at ``iteration``; anything else is returned
-        unchanged.
-        """
-        return setting(iteration) if callable(setting) else setting
-    def transform_audio(self, audio_segment, iteration=0):
-        """
-        Normalizes the input audio using the online Bayesian approach.
-        The work is delegated to ``audio_segment.normalize_online_bayesian``;
-        its return value, if any, is discarded.
-        :param audio_segment: input audio
-        :type audio_segment: SpeechSegment
-        :param iteration: current iteration, used to evaluate callable
-            settings. NOTE(review): the default of 0 is an assumption made
-            so that the one-argument call keeps working -- confirm.
-        :type iteration: int
-        """
-        # The original body read undefined names ``iteration`` and ``audio``;
-        # the segment to normalize is the ``audio_segment`` argument.
-        target_db = self._resolve(self.target_db, iteration)
-        prior_db = self._resolve(self.prior_db, iteration)
-        prior_samples = self._resolve(self.prior_samples, iteration)
-        startup_delay = self._resolve(self.startup_delay, iteration)
-        audio_segment.normalize_online_bayesian(
-            target_db, prior_db, prior_samples, startup_delay=startup_delay)
--- a/deep_speech_2/data_utils/augmentor/resampler.py
+++ b/deep_speech_2/data_utils/augmentor/resampler.py
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from . import base
-class ResamplerAugmentor(base.AugmentorBase):
-    """Augmentor that resamples audio to a configured sample rate.
-    :param rng: Random generator object (stored on the instance; not used
-                by this class itself).
-    :type rng: random.Random
-    :param new_sample_rate: New sample rate in Hz. The value is forwarded
-        unchanged to the segment's ``resample`` method.
-        NOTE(review): previously documented as func[int->scalar], but it
-        is never called -- confirm the type ``resample`` expects.
-    :type new_sample_rate: scalar
-    """
-    def __init__(self, rng, new_sample_rate):
-        self.new_sample_rate = new_sample_rate
-        self._rng = rng
-    def transform_audio(self, audio_segment):
-        """Resamples the input audio to the target sample rate.
-        Delegates to ``audio_segment.resample``; per the original note this
-        is intended as an in-place transformation.
-        :param audio_segment: input audio
-        :type audio_segment: SpeechDLSegment
-        """
-        # The original called ``audio.resample`` on an undefined name.
-        audio_segment.resample(self.new_sample_rate)
\ No newline at end of file
--- a/deep_speech_2/data_utils/augmentor/speed_perturb.py
+++ b/deep_speech_2/data_utils/augmentor/speed_perturb.py
-"""Speed perturbation module for making ASR robust to different voice
-types (high pitched, low pitched, etc)
-Samples uniformly between speed_min and speed_max
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from . import base
-class SpeedPerturbatioAugmentor(base.AugmentorBase):
-    """
-    Instantiates a speed perturbation module.
-    See reference paper here:
-    http://www.danielpovey.com/files/2015_interspeech_augmentation.pdf
-    The bounds are validated numerically at construction time, so they are
-    treated as plain scalars throughout.
-    :param rng: Random generator object used to sample the speed rate.
-    :type rng: random.Random
-    :param speed_min: Lower bound on new rate to sample
-    :type speed_min: scalar
-    :param speed_max: Upper bound on new rate to sample
-    :type speed_max: scalar
-    :raise ValueError: If speed_min is below 0.9 or speed_max is above 1.1.
-    """
-    def __init__(self, rng, speed_min, speed_max):
-        if speed_min < 0.9:
-            raise ValueError(
-                "Sampling speed below 0.9 can cause unnatural effects")
-        # The upper limit applies to speed_max (the original tested
-        # speed_min twice, so an excessive speed_max was never rejected).
-        if speed_max > 1.1:
-            raise ValueError(
-                "Sampling speed above 1.1 can cause unnatural effects")
-        self.speed_min = speed_min
-        self.speed_max = speed_max
-        self.rng = rng
-    def transform_audio(self, audio_segment):
-        """
-        Samples a new speed rate from the given range and
-        changes the speed of the given audio clip.
-        Note that this is an in-place transformation.
-        :param audio_segment: input audio
-        :type audio_segment: SpeechDLSegment
-        """
-        # Draw from the instance's generator; the original referenced the
-        # undefined names ``rng``, ``iteration`` and ``audio``.
-        sampled_speed = self.rng.uniform(self.speed_min, self.speed_max)
-        # NOTE(review): assumes change_speed mutates the segment in place,
-        # as the docstring states -- confirm against the segment class.
-        audio_segment.change_speed(sampled_speed)
--- a/deep_speech_2/data_utils/augmentor/volume_perturb.py
+++ b/deep_speech_2/data_utils/augmentor/volume_perturb.py
@@ -3,10 +3,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-from . import base
+from data_utils.augmentor.base import AugmentorBase
-class VolumePerturbAugmentor(base.AugmentorBase):
+class VolumePerturbAugmentor(AugmentorBase):
    """Augmentation model for adding random volume perturbation.
    This is used for multi-loudness training of PCEN. See

--- a/deep_speech_2/requirements.txt
+++ b/deep_speech_2/requirements.txt
 SoundFile==0.9.0.post1
 wget==3.2
+scikits.samplerate==0.3.3
+scipy==0.13.0b1