add audio part

42ba74ef · chrisxu2014 · a84bdf64 · 42ba74ef · a84bdf64 · 42ba74ef
10 changed file
--- a/deep_speech_2/data_utils/audio.py
+++ b/deep_speech_2/data_utils/audio.py
--- a/deep_speech_2/data_utils/augmentor/audio_database.py
+++ b/deep_speech_2/data_utils/augmentor/audio_database.py
-from __future__ import print_function
-from collections import defaultdict
-import bisect
-import logging
-import numpy as np
-import os
-import random
-import sys
-UNK_TAG = "<UNK>"
-def stream_audio_index(fname, UNK=UNK_TAG):
-    """Reads an audio index file and emits one record in the index at a time.
-    :param fname: audio index path
-    :type fname: basestring
-    :param UNK: UNK token to denote that certain audios are not tagged.
-    :type UNK: basesring
-    Yields:
-        idx, duration, size, relpath, tags (int, float, int, str, list(str)):
-            audio file id, length of the audio in seconds, size in byte,
-            relative path w.r.t. to the root noise directory, list of tags
-    """
-    with open(fname) as audio_index_file:
-        for i, line in enumerate(audio_index_file):
-            tok = line.strip().split("\t")
-            assert len(tok) >= 4, \
-                "Invalid line at line {} in file {}".format(
-                    i + 1, audio_index_file)
-            idx = int(tok[0])
-            duration = float(tok[1])
-            # Sometimes, the duration can round down to 0.0
-            assert duration >= 0.0, \
-                "Invalid duration at line {} in file {}".format(
-                    i + 1, audio_index_file)
-            size = int(tok[2])
-            assert size > 0, \
-                "Invalid size at line {} in file {}".format(
-                    i + 1, audio_index_file)
-            relpath = tok[3]
-            if len(tok) == 4:
-                tags = [UNK_TAG]
-            else:
-                tags = tok[4:]
-            yield idx, duration, size, relpath, tags
-def truncate_float(val, ndigits=6):
-    """ Truncates a floating-point value to have the desired number of
-    digits after the decimal point.
-    :param val: input value.
-    :type val: float
-    :parma ndigits: desired number of digits.
-    :type ndigits: int
-    :return: truncated value
-    :rtype: float
-    """
-    p = 10.0**ndigits
-    return float(int(val * p)) / p
-def print_audio_index(idx, duration, size, relpath, tags, file=sys.stdout):
-    """Prints an audio record to the index file.
-    :param idx: Audio file id.
-    :type idx: int
-    :param duration: length of the audio in seconds
-    :type duration: float
-    :param size: size of the file in bytes
-    :type size: int
-    :param relpath: relative path w.r.t. to the root noise directory.
-    :type relpath:  basestring
-    :parma tags: list of tags
-    :parma tags: list(str)
-    :parma file: file to which we want to write an audio record.
-    :type file: sys.stdout
-    """
-    file.write("{}\t{:.6f}\t{}\t{}"
-               .format(idx, truncate_float(duration, ndigits=6), size, relpath))
-    for tag in tags:
-        file.write("\t{}".format(tag))
-    file.write("\n")
-class AudioIndex(object):
-    """ In-memory index of audio files that do not have annotations.
-    This supports duration-based sampling and sampling from a target
-    distribution.
-    Each line in the index file consists of the following fields:
-        (id (int), duration (float), size (int), relative path (str),
-         list of tags ([str]))
-    """
-    def __init__(self):
-        self.audio_dir = None
-        self.index_fname = None
-        self.tags = None
-        self.bin_size = 2.0
-        self.clear()
-    def clear(self):
-        """ Clears the index
-        Returns:
-            None
-        """
-        self.idx_to_record = {}
-        # The list of indices correspond to audio files whose duration is
-        # greater than or equal to the key.
-        self.duration_to_id_set = {}
-        self.duration_to_id_set_per_tag = defaultdict(lambda: {})
-        self.duration_to_list = defaultdict(lambda: [])
-        self.duration_to_list_per_tag = defaultdict(
-            lambda: defaultdict(lambda: []))
-        self.tag_to_id_set = defaultdict(lambda: set())
-        self.shared_duration_bins = []
-        self.id_set_complete = set()
-        self.id_set = set()
-        self.duration_bins = []
-    def has_audio(self, distr=None):
-        """
-        :param distr: The target distribution of audio tags that we want to
-            match. If this is not supplied, the function simply checks that
-            there are some audio files.
-        :parma distr: dict
-        :return: True if there are audio files.
-        :rtype: boolean
-        """
-        if distr is None:
-            return len(self.id_set) > 0
-        else:
-            for tag in distr:
-                if tag not in self.duration_to_list_per_tag:
-                    return False
-            return True
-    def _load_all_records_from_disk(self, audio_dir, idx_fname, bin_size):
-        """Loads all audio records from the disk into memory and groups them
-        into chunks based on their duration and the bin_size granalarity.
-        Once all the records are read, indices are built from these records
-        by another function so that the audio samples can be drawn efficiently.
-        Updates:
-            self.audio_dir (path): audio root directory
-            self.idx_fname (path): audio database index filename
-            self.bin_size (float): granularity of bins
-            self.idx_to_record (dict): maps from the audio id to
-                (duration, file_size, relative_path, tags)
-            self.tag_to_id_set (dict): maps from the tag to
-                the set of id's of audios that have this tag.
-            self.id_set_complete (set): set of all audio id's in the index file
-            self.min_duration (float): minimum audio duration observed in the
-                index file
-            self.duration_bins (list): the lower bounds on the duration of
-                audio files falling in each bin
-            self.duration_to_id_set (dict): contains (k, v) where v is the set
-                of id's of audios whose lengths are longer than or equal to k.
-                (e.g. k is the duration lower bound of this bin).
-            self.duration_to_id_set_per_tag (dict): Something like above but
-                has a finer granularity mapping from the tag to
-                duration_to_id_set.
-            self.shared_duration_bins (list): list of sets where each set
-                contains duration lower bounds whose audio id sets are the
-                same. The rationale for having this is that there are a few
-                but extremely long audio files which lead to a lot of bins.
-                When the id sets do not change across various minimum duration
-                boundaries, we
-                cluster these together and make them point to the same id set
-                reference.
-        :return: whether the records were read from the disk. The assumption is
-            that the audio index file on disk and the actual audio files
-            are constructed once and never change during training. We only
-            re-read when either the directory or the index file path change.
-        """
-        if self.audio_dir == audio_dir and self.idx_fname == idx_fname and \
-           self.bin_size == bin_size:
-            # The audio directory and/or the list of audio files
-            # haven't changed. No need to load the list again.
-            return False
-        # Remember where the audio index is most recently read from.
-        self.audio_dir = audio_dir
-        self.idx_fname = idx_fname
-        self.bin_size = bin_size
-        # Read in the idx and compute the number of bins necessary
-        self.clear()
-        rank = []
-        min_duration = float('inf')
-        max_duration = float('-inf')
-        for idx, duration, file_size, relpath, tags in \
-                stream_audio_index(idx_fname):
-            self.idx_to_record[idx] = (duration, file_size, relpath, tags)
-            max_duration = max(max_duration, duration)
-            min_duration = min(min_duration, duration)
-            rank.append((duration, idx))
-            for tag in tags:
-                self.tag_to_id_set[tag].add(idx)
-        if len(rank) == 0:
-            # file is empty
-            raise IOError("Index file {} is empty".format(idx_fname))
-        for tag in self.tag_to_id_set:
-            self.id_set_complete |= self.tag_to_id_set[tag]
-        dur = min_duration
-        self.min_duration = min_duration
-        while dur < max_duration + bin_size:
-            self.duration_bins.append(dur)
-            dur += bin_size
-        # Sort in decreasing order of duration and populate
-        # the cumulative indices lists.
-        rank.sort(reverse=True)
-        # These are indices for `rank` and used to keep track of whether
-        # there are new records to add in the current bin.
-        last = 0
-        cur = 0
-        # The set of audios falling in the previous bin; in the case,
-        # where we don't find new audios for the current bin, we store
-        # the reference to the last set so as to conserve memory.
-        # This is not such a big problem if the audio duration is
-        # bounded by a small number like 30 seconds and the
-        # bin size is big enough. But, for raw freesound audios,
-        # some audios can be as long as a few hours!
-        last_audio_set = set()
-        # The same but for each tag so that we can pick audios based on
-        # tags and also some user-specified tag distribution.
-        last_audio_set_per_tag = defaultdict(lambda: set())
-        # Set of lists of bins sharing the same audio sets.
-        shared = set()
-        for i in range(len(self.duration_bins) - 1, -1, -1):
-            lower_bound = self.duration_bins[i]
-            new_audio_idxs = set()
-            new_audio_idxs_per_tag = defaultdict(lambda: set())
-            while cur < len(rank) and rank[cur][0] >= lower_bound:
-                idx = rank[cur][1]
-                tags = self.idx_to_record[idx][3]
-                new_audio_idxs.add(idx)
-                for tag in tags:
-                    new_audio_idxs_per_tag[tag].add(idx)
-                cur += 1
-            # This makes certain that the same list is shared across
-            # different bins if no new indices are added.
-            if cur == last:
-                shared.add(lower_bound)
-            else:
-                last_audio_set = last_audio_set | new_audio_idxs
-                for tag in new_audio_idxs_per_tag:
-                    last_audio_set_per_tag[tag] = \
-                        last_audio_set_per_tag[tag] | \
-                        new_audio_idxs_per_tag[tag]
-                if len(shared) > 0:
-                    self.shared_duration_bins.append(shared)
-                shared = set([lower_bound])
-                ### last_audio_set = set()  should set blank
-            last = cur
-            self.duration_to_id_set[lower_bound] = last_audio_set
-            for tag in last_audio_set_per_tag:
-                self.duration_to_id_set_per_tag[lower_bound][tag] = \
-                    last_audio_set_per_tag[tag]
-        # The last `shared` record isn't added to the `shared_duration_bins`.
-        self.shared_duration_bins.append(shared)
-        # We make sure that the while loop above has exhausted through the
-        # `rank` list by checking if the `cur`rent index in `rank` equals
-        # the length of the array, which is the halting condition.
-        assert cur == len(rank)
-        return True
-    def _build_index_from_records(self, tag_list):
-        """ Uses the in-memory records read from the index file to build
-        an in-memory index restricted to the given tag list.
-        :param tag_list: List of tags we are interested in sampling from.
-        :type tag_list: list(str)
-        Updates:
-            self.id_set (set): the set of all audio id's that can be sampled.
-            self.duration_to_list (dict): maps from the duration lower bound
-                to the id's of audios longer than this duration.
-            self.duration_to_list_per_tag (dict): maps from the tag to
-                the same structure as self.duration_to_list. This is to support
-                sampling from a target noise distribution.
-        :return: whether the index was built from scratch
-        """
-        if self.tags == tag_list:
-            return False
-        self.tags = tag_list
-        if len(tag_list) == 0:
-            self.id_set = self.id_set_complete
-        else:
-            self.id_set = set()
-            for tag in tag_list:
-                self.id_set |= self.tag_to_id_set[tag]
-        # Next, we need to take a subset of the audio files
-        for shared in self.shared_duration_bins:
-            # All bins in `shared' have the same index lists
-            # so we can intersect once and set all of them to this list.
-            lb = list(shared)[0]
-            intersected = list(self.id_set & self.duration_to_id_set[lb])
-            duration_to_id_set = self.duration_to_id_set_per_tag[lb]
-            intersected_per_tag = {
-                tag: self.tag_to_id_set[tag] & duration_to_id_set[tag]
-                for tag in duration_to_id_set
-            }
-            for bin_key in shared:
-                self.duration_to_list[bin_key] = intersected
-                for tag in intersected_per_tag:
-                    self.duration_to_list_per_tag[tag][bin_key] = \
-                        intersected_per_tag[tag]
-        assert len(self.duration_to_list) == len(self.duration_to_id_set)
-        return True
-    def refresh_records_from_index_file(self,
-                                        audio_dir,
-                                        idx_fname,
-                                        tag_list,
-                                        bin_size=2.0):
-        """ Loads the index file and populates the records
-        for building the internal index.
-        If the audio directory or index file name has changed, the whole index
-        is reloaded from scratch. If only the tag_list is changed, then the
-        desired index is built from the complete, in-memory record.
-        :param audio_dir: audio directory
-        :type audio_dir: basestring
-        :param idx_fname: audio index file name
-        :type idex_fname: basestring
-        :param tag_list: list of tags we are interested in loading;
-            if empty, we load all.
-        :type tag_list: list
-        :param bin_size: optional argument for controlling the granularity
-            of duration bins
-        :type bin_size: float
-        """
-        if tag_list is None:
-            tag_list = []
-        reloaded_records = self._load_all_records_from_disk(audio_dir,
-                                                            idx_fname, bin_size)
-        if reloaded_records or self.tags != tag_list:
-            self._build_index_from_records(tag_list)
-            logger.info('loaded {} audio files from {}'
-                        .format(len(self.id_set), idx_fname))
-    def sample_audio(self, duration, rng=None, distr=None):
-        """ Uniformly draws an audio record of at least the desired duration
-        :param duration: minimum desired audio duration
-        :type duration: float
-        :param rng: random number generator
-        :type rng: random.Random
-        :param distr: target distribution of audio tags. If not provided,
-        :type distr: dict
-        all audio files are sampled uniformly at random.
-        :returns: success, (duration, file_size, path)
-        """
-        if duration < 0.0:
-            duration = self.min_duration
-        i = bisect.bisect_left(self.duration_bins, duration)
-        if i == len(self.duration_bins):
-            return False, None
-        bin_key = self.duration_bins[i]
-        if distr is None:
-            indices = self.duration_to_list[bin_key]
-        else:
-            # If a desired audio distribution is given, we sample from it.
-            if rng is None:
-                rng = random.Random()
-            nprng = np.random.RandomState(rng.getrandbits(32))
-            prob_masses = distr.values()
-            prob_masses /= np.sum(prob_masses)
-            tag = nprng.choice(distr.keys(), p=prob_masses)
-            indices = self.duration_to_list_per_tag[tag][bin_key]
-        if len(indices) == 0:
-            return False, None
-        else:
-            if rng is None:
-                rng = random.Random()
-            # duration, file size and relative path from root
-            s = self.idx_to_record[rng.sample(indices, 1)[0]]
-            s = (s[0], s[1], os.path.join(self.audio_dir, s[2]))
-            return True, s
--- a/deep_speech_2/data_utils/augmentor/augmentation.py
+++ b/deep_speech_2/data_utils/augmentor/augmentation.py
@@ -6,11 +6,6 @@ from __future__ import print_function
 import json
 import random
 from data_utils.augmentor.volume_perturb import VolumePerturbAugmentor
-from data_utils.augmentor.resamler import ResamplerAugmentor
-from data_utils.augmentor.speed_perturb import SpeedPerturbatioAugmentor
-from data_utils.augmentor.online_bayesian_normalization import OnlineBayesianNormalizationAugmentor
-from data_utils.augmentor.Impulse_response import ImpulseResponseAugmentor
-from data_utils.augmentor.noise_speech import NoiseSpeechAugmentor
 class AugmentationPipeline(object):
@@ -81,15 +76,5 @@ class AugmentationPipeline(object):
        """Return an augmentation model by the type name, and pass in params."""
        if augmentor_type == "volume":
            return VolumePerturbAugmentor(self._rng, **params)
-        if augmentor_type == "resamle":
-            return ResamplerAugmentor(self._rng, **params)
-        if augmentor_type == "speed":
-            return SpeedPerturbatioAugmentor(self._rng, **params)
-        if augmentor_type == "online_bayesian_normalization":
-            return OnlineBayesianNormalizationAugmentor(self._rng, **params)
-        if augmentor_type == "Impulse_response":
-            return ImpulseResponseAugmentor(self._rng, **params)
-        if augmentor_type == "noise_speech":
-            return NoiseSpeechAugmentor(self._rng, **params)
        else:
            raise ValueError("Unknown augmentor type [%s]." % augmentor_type)
--- a/deep_speech_2/data_utils/augmentor/implus_response.py
+++ b/deep_speech_2/data_utils/augmentor/implus_response.py
-""" Impulse response"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from . import base
-from . import audio_database
-from data_utils.speech import SpeechSegment
-class ImpulseResponseAugmentor(base.AugmentorBase):
-    """ Instantiates an impulse response model
-    :param ir_dir: directory containing impulse responses
-    :type ir_dir: basestring
-    :param tags: optional parameter for specifying what
-            particular impulse responses to apply.
-    :type tags: list
-    :parm tag_distr: optional noise distribution
-    :type tag_distr: dict
-    """
-    def __init__(self, rng, ir_dir, index_file, tags=None, tag_distr=None):
-        # Define all required parameter maps here.
-        self.ir_dir = ir_dir
-        self.index_file = index_file
-        self.tags = tags
-        self.tag_distr = tag_distr
-        self.audio_index = audio_database.AudioIndex()
-        self.rng = rng
-    def _init_data(self):
-        """ Preloads stuff from disk in an attempt (e.g. list of files, etc)
-        to make later loading faster. If the data configuration remains the
-        same, this function does nothing.
-        """
-        self.audio_index.refresh_records_from_index_file(
-            self.ir_dir, self.index_file, self.tags)
-    def transform_audio(self, audio_segment):
-        """ Convolves the input audio with an impulse response.
-        :param audio_segment: input audio
-        :type audio_segment: AudioSegemnt
-        """
-        # This handles the cases where the data source or directories change.
-        self._init_data()
-        read_size = 0
-        tag_distr = self.tag_distr
-        if not self.audio_index.has_audio(tag_distr):
-            if tag_distr is None:
-                if not self.tags:
-                    raise RuntimeError("The ir index does not have audio "
-                                       "files to sample from.")
-                else:
-                    raise RuntimeError("The ir index does not have audio "
-                                       "files of the given tags to sample "
-                                       "from.")
-            else:
-                raise RuntimeError("The ir index does not have audio "
-                                   "files to match the target ir "
-                                   "distribution.")
-        else:
-            # Querying with a negative duration triggers the index to search
-            # from all impulse responses.
-            success, record = self.audio_index.sample_audio(
-                -1.0, rng=self.rng, distr=tag_distr)
-            if success is True:
-                _, read_size, ir_fname = record
-                ir_wav = SpeechSegment.from_file(ir_fname)
-                audio_segment.convolve(ir_wav, allow_resampling=True)
--- a/deep_speech_2/data_utils/augmentor/noise_speech.py
+++ b/deep_speech_2/data_utils/augmentor/noise_speech.py
-""" noise speech
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-import logging
-import numpy as np
-import os
-from collections import defaultdict
-from . import base
-from . import audio_database
-from data_utils.speech import SpeechSegment
-TURK = "turk"
-USE_AUDIO_DATABASE_SOURCES = frozenset(["freesound", "chime"])
-HALF_NOISE_LENGTH_MIN_THRESHOLD = 3.0
-FIND_NOISE_MAX_ATTEMPTS = 20
-logger = logging.getLogger(__name__)
-def get_first_smaller(items, value):
-    index = bisect.bisect_left(items, value) - 1
-    assert items[index] < value, \
-        'get_first_smaller failed! %d %d' % (items[index], value)
-    return items[index]
-def get_first_larger(items, value):
-    'Find leftmost value greater than value'
-    index = bisect.bisect_right(items, value)
-    assert index < len(items), \
-        "no noise bin exists for this audio length (%f)" % value
-    assert items[index] > value, \
-        'get_first_larger failed! %d %d' % (items[index], value)
-    return items[index]
-def _get_turk_noise_files(noise_dir, index_file):
-    """ Creates a map from duration => a list of noise filenames
-    :param noise_dir: Directory of noise files which contains
-        "noise-samples-list"
-    :type noise_dir: basestring
-    :param index_file: Noise list
-    :type index_file: basestring
-    returns:noise_files (defaultdict): A map of bins to noise files.
-        Each key is the duration, and the value is a list of noise
-        files binned to this duration. Each bin is 2 secs.
-    Note: noise-samples-list should contain one line per noise (wav) file
-        along with its duration in milliseconds
-    """
-    noise_files = defaultdict(list)
-    if not os.path.exists(index_file):
-        logger.error('No noise files were found at {}'.format(index_file))
-        return noise_files
-    num_noise_files = 0
-    rounded_durations = list(range(0, 65, 2))
-    with open(index_file, 'r') as fl:
-        for line in fl:
-            fname = os.path.join(noise_dir, line.strip().split()[0])
-            duration = float(line.strip().split()[1]) / 1000
-            # bin the noise files into length bins rounded by 2 sec
-            bin_id = get_first_smaller(rounded_durations, duration)
-            noise_files[bin_id].append(fname)
-            num_noise_files += 1
-    logger.info('Loaded {} turk noise files'.format(num_noise_files))
-    return noise_files
-class NoiseSpeechAugmentor(base.AugmentorBase):
-    """ Noise addition block
-    :param snr_min: minimum signal-to-noise ratio
-    :type snr_min: float
-    :param snr_max: maximum signal-to-noise ratio
-    :type snr_max: float
-    :param noise_dir: root of where noise files are stored
-    :type noise_fir: basestring
-    :param index_file: index of noises of interest in noise_dir
-    :type index_file: basestring
-    :param source: select one from
-        - turk
-        - freesound
-        - chime
-        Note that this field is no longer required for the freesound
-        and chime
-    :type source: string
-    :param tags: optional parameter for specifying what
-        particular noises we want to add. See above for the available tags.
-    :type tags: list
-    :param tag_distr: optional noise distribution
-    :type tag_distr: dict
-    """
-    def __init__(self,
-                 rng,
-                 snr_min,
-                 snr_max,
-                 noise_dir,
-                 source,
-                 allow_downsampling=None,
-                 index_file=None,
-                 tags=None,
-                 tag_distr=None):
-        # Define all required parameter maps here.
-        self.rng = rng
-        self.snr_min = snr_min
-        self.snr_max = snr_max
-        self.noise_dir = noise_dir
-        self.source = source
-        self.allow_downsampling = allow_downsampling
-        self.index_file = index_file
-        self.tags = tags
-        self.tag_distr = tag_distr
-        # When new noise sources are added, make sure to define the
-        # associated bookkeeping variables here.
-        self.turk_noise_files = []
-        self.turk_noise_dir = None
-        self.audio_index = audio_database.AudioIndex()
-    def _init_data(self):
-        """ Preloads stuff from disk in an attempt (e.g. list of files, etc)
-        to make later loading faster. If the data configuration remains the
-        same, this function does nothing.
-        """
-        noise_dir = self.noise_dir
-        index_file = self.index_file
-        source = self.source
-        if not index_file:
-            if source == TURK:
-                index_file = os.path.join(noise_dir, 'noise-samples-list')
-                logger.debug("index_file not provided; " + "defaulting to " +
-                             index_file)
-            else:
-                if source != "":
-                    assert source in USE_AUDIO_DATABASE_SOURCES, \
-                        "{} not supported by audio_database".format(source)
-                index_file = os.path.join(noise_dir,
-                                          "audio_index_commercial.txt")
-                logger.debug("index_file not provided; " + "defaulting to " +
-                             index_file)
-        if source == TURK:
-            if self.turk_noise_dir != noise_dir:
-                self.turk_noise_dir = noise_dir
-                self.turk_noise_files = _get_turk_noise_files(noise_dir,
-                                                              index_file)
-        # elif source == TODO_SUPPORT_NON_AUDIO_DATABASE_BASED_SOURCES:
-        else:
-            if source != "":
-                assert source in USE_AUDIO_DATABASE_SOURCES, \
-                    "{} not supported by audio_database".format(source)
-            self.audio_index.refresh_records_from_index_file(
-                self.noise_dir, index_file, self.tags)
-    def transform_audio(self, audio_segment):
-        """Adds walla noise
-        :param audio_segment: Input audio
-        :type audio_segment: SpeechSegment
-        """
-        # This handles the cases where the data source or directories change.
-        self._init_data
-        source = self.source
-        allow_downsampling = self.allow_downsampling
-        if source == TURK:
-            self._add_turk_noise(audio_segment, self.rng, allow_downsampling)
-        # elif source == TODO_SUPPORT_NON_AUDIO_DATABASE_BASED_SOURCES:
-        else:
-            self._add_noise(audio_segment, self.rng, allow_downsampling)
-    def _sample_snr(self):
-        """ Returns a float sampled in [`self.snr_min`, `self.snr_max`]
-        if both `self.snr_min` and `self.snr_max` are non-zero.
-        """
-        snr_min = self.snr_min
-        snr_max = self.snr_max
-        sampled_snr = self.rng.uniform(snr_min, snr_max)
-        return sampled_snr
-    def _add_turk_noise(self, audio_segment, allow_downsampling):
-        """ Adds a turk noise to the input audio.
-        :param audio_segment: input audio
-        :type audio_segment: audiosegment
-        :param allow_downsampling: indicates whether downsampling
-            is allowed
-        :type allow_downsampling: boolean 
-        """
-        read_size = 0
-        if len(self.turk_noise_files) > 0:
-            snr = self._sample_snr(self.rng)
-            # Draw the noise file randomly from noise files that are
-            # slightly longer than the utterance
-            noise_bins = sorted(self.turk_noise_files.keys())
-            # note some bins can be empty, so we can't just round up
-            # to the nearest 2-sec interval
-            rounded_duration = get_first_larger(noise_bins,
-                                                audio_segment.duration)
-            noise_fname = \
-                self.rng.sample(self.turk_noise_files[rounded_duration], 1)[0]
-            noise = SpeechSegment.from_wav_file(noise_fname)
-            logger.debug('noise_fname {}'.format(noise_fname))
-            logger.debug('snr {}'.format(snr))
-            read_size = len(noise) * 2
-            # May throw exceptions, but this is caught by
-            # AudioFeaturizer.get_audio_files.
-            audio_segment.add_noise(
-                noise, snr, rng=self.rng, allow_downsampling=allow_downsampling)
-    def _add_noise(self, audio_segment, allow_downsampling):
-        """ Adds a noise indexed in audio_database.AudioIndex.
-        :param audio_segment: input audio
-        :type audio_segment: SpeechSegment
-        :param allow_downsampling: indicates whether downsampling
-            is allowed
-        :type allow_downsampling: boolean
-        Returns:
-            (SpeechSegment, int)
-                - sound with turk noise added
-                - number of bytes read from disk
-        """
-        read_size = 0
-        tag_distr = self.tag_distr
-        if not self.audio_index.has_audio(tag_distr):
-            if tag_distr is None:
-                if not self.tags:
-                    raise RuntimeError("The noise index does not have audio "
-                                       "files to sample from.")
-                else:
-                    raise RuntimeError("The noise index does not have audio "
-                                       "files of the given tags to sample "
-                                       "from.")
-            else:
-                raise RuntimeError("The noise index does not have audio "
-                                   "files to match the target noise "
-                                   "distribution.")
-        else:
-            # Compute audio segment related statistics
-            audio_duration = audio_segment.duration
-            # Sample relevant augmentation parameters.
-            snr = self._sample_snr(self.rng)
-            # Perhaps, we may not have a sufficiently long noise, so we need
-            # to search iteratively.
-            min_duration = audio_duration + 0.25
-            for _ in range(FIND_NOISE_MAX_ATTEMPTS):
-                logger.debug("attempting to find noise of length "
-                             "at least {}".format(min_duration))
-                success, record = \
-                    self.audio_index.sample_audio(min_duration,
-                                                  rng=self.rng,
-                                                  distr=tag_distr)
-                if success is True:
-                    noise_duration, read_size, noise_fname = record
-                    # Assert after logging so we know
-                    # what caused augmentation to fail.
-                    logger.debug("noise_fname {}".format(noise_fname))
-                    logger.debug("snr {}".format(snr))
-                    assert noise_duration >= min_duration
-                    break
-                # Decrease the desired minimum duration linearly.
-                # If the value becomes smaller than some threshold,
-                # we half the value instead.
-                if min_duration > HALF_NOISE_LENGTH_MIN_THRESHOLD:
-                    min_duration -= 2.0
-                else:
-                    min_duration *= 0.5
-            if success is False:
-                logger.info("Failed to find a noise file")
-                return
-            diff_duration = audio_duration + 0.25 - noise_duration
-            if diff_duration >= 0.0:
-                # Here, the noise is shorter than the audio file, so
-                # we pad with zeros to make sure the noise sound is applied
-                # with a uniformly random shift.
-                noise = SpeechSegment.from_file(noise_fname)
-                noise = noise.pad_silence(diff_duration, sides="both")
-            else:
-                # The noise clip is at least ~25 ms longer than the audio
-                # segment here.
-                diff_duration = int(noise_duration * audio_segment.sample_rate) - \
-                    int(audio_duration * audio_segment.sample_rate) - \
-                    int(0.02 * audio_segment.sample_rate)
-                start = float(self.rng.randint(0, diff_duration)) / \
-                    audio.sample_rate
-                finish = min(start + audio_duration + 0.2, noise_duration)
-                noise = SpeechSegment.slice_from_file(noise_fname, start,
-                                                      finish)
-            if len(noise) < len(audio_segment):
-                # This is to ensure that the noise clip is at least as
-                # long as the audio segment.
-                num_samples_to_pad = len(audio_segment) - len(noise)
-                # Padding this amount of silence on both ends ensures that
-                # the placement of the noise clip is uniformly random.
-                silence = SpeechSegment(
-                    np.zeros(num_samples_to_pad), audio_segment.sample_rate)
-                noise = SpeechSegment.concatenate(silence, noise, silence)
-            audio_segment.add_noise(
-                noise, snr, rng=self.rng, allow_downsampling=allow_downsampling)
--- a/deep_speech_2/data_utils/augmentor/online_bayesian_normalization.py
+++ b/deep_speech_2/data_utils/augmentor/online_bayesian_normalization.py
-""" Online bayesian normalization
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from . import base
-class OnlineBayesianNormalizationAugmentor(base.AugmentorBase):
-    """ 
-    Instantiates an online bayesian normalization module.
-    :param target_db: Target RMS value in decibels
-            :type target_db: func[int->scalar]
-            :param prior_db: Prior RMS estimate in decibels
-            :type prior_db: func[int->scalar]
-            :param prior_samples: Prior strength in number of samples
-            :type prior_samples: func[int->scalar]
-            :param startup_delay: Start-up delay in seconds during
-                which normalization statistics is accrued.
-            :type starup_delay: func[int->scalar]
-    """
-    def __init__(self,
-                 rng,
-                 target_db,
-                 prior_db,
-                 prior_samples,
-                 startup_delay=base.parse_parameter_from(0.0)):
-        self.target_db = target_db
-        self.prior_db = prior_db
-        self.prior_samples = prior_samples
-        self.startup_delay = startup_delay
-        self.rng = rng
-    def transform_audio(self, audio_segment):
-        """
-        Normalizes the input audio using the online Bayesian approach.
-        :param audio_segment: input audio
-        :type audio_segment: SpeechSegment
-        :param iteration: current iteration
-        :type iteration: int
-        :param text: audio transcription
-        :type text: basestring
-        :param rng: RNG to use for augmentation
-        :type rng: random.Random
-        """
-        read_size = 0
-        target_db = self.target_db(iteration)
-        prior_db = self.prior_db(iteration)
-        prior_samples = self.prior_samples(iteration)
-        startup_delay = self.startup_delay(iteration)
-        audio.normalize_online_bayesian(
-            target_db, prior_db, prior_samples, startup_delay=startup_delay)
--- a/deep_speech_2/data_utils/augmentor/resampler.py
+++ b/deep_speech_2/data_utils/augmentor/resampler.py
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from . import base
-class ResamplerAugmentor(base.AugmentorBase):
-    """ Instantiates a resampler module.
-    :param new_sample_rate: New sample rate in Hz
-    :type new_sample_rate: func[int->scalar]
-    :param rng: Random generator object.
-    :type rng: random.Random
-    """
-    def __init__(self, rng, new_sample_rate):
-        self.new_sample_rate = new_sample_rate
-        self._rng = rng
-    def transform_audio(self, audio_segment):
-        """ Resamples the input audio to the target sample rate.
-        Note that this is an in-place transformation.
-        :param audio: input audio
-        :type audio: SpeechDLSegment
-        """
-        new_sample_rate = self.new_sample_rate
-        audio.resample(new_sample_rate)
\ No newline at end of file
--- a/deep_speech_2/data_utils/augmentor/speed_perturb.py
+++ b/deep_speech_2/data_utils/augmentor/speed_perturb.py
-"""Speed perturbation module for making ASR robust to different voice
-types (high pitched, low pitched, etc)
-Samples uniformly between speed_min and speed_max
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from . import base
-class SpeedPerturbatioAugmentor(base.AugmentorBase):
-    """ 
-    Instantiates a speed perturbation module.
-    See reference paper here:
-    http://www.danielpovey.com/files/2015_interspeech_augmentation.pdf
-    :param speed_min: Lower bound on new rate to sample
-    :type speed_min: func[int->scalar]
-    :param speed_max: Upper bound on new rate to sample
-    :type speed_max: func[int->scalar]
-    """
-    def __init__(self, rng, speed_min, speed_max):
-        if (speed_min < 0.9):
-            raise ValueError(
-                "Sampling speed below 0.9 can cause unnatural effects")
-        if (speed_min > 1.1):
-            raise ValueError(
-                "Sampling speed above 1.1 can cause unnatural effects")
-        self.speed_min = speed_min
-        self.speed_max = speed_max
-        self.rng = rng
-    def transform_audio(self, audio_segment):
-        """ 
-        Samples a new speed rate from the given range and
-        changes the speed of the given audio clip.
-        Note that this is an in-place transformation.
-        :param audio_segment: input audio
-        :type audio_segment: SpeechDLSegment
-        """
-        read_size = 0
-        speed_min = self.speed_min(iteration)
-        speed_max = self.speed_max(iteration)
-        sampled_speed = rng.uniform(speed_min, speed_max)
-        audio = audio.change_speed(sampled_speed)
--- a/deep_speech_2/data_utils/augmentor/volume_perturb.py
+++ b/deep_speech_2/data_utils/augmentor/volume_perturb.py
@@ -3,10 +3,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-from . import base
+from data_utils.augmentor.base import AugmentorBase
-class VolumePerturbAugmentor(base.AugmentorBase):
+class VolumePerturbAugmentor(AugmentorBase):
    """Augmentation model for adding random volume perturbation.
    This is used for multi-loudness training of PCEN. See

--- a/deep_speech_2/requirements.txt
+++ b/deep_speech_2/requirements.txt
 SoundFile==0.9.0.post1
 wget==3.2
+scikits.samplerate==0.3.3
+scipy==0.13.0b1