提交 42ba74ef 编写于 作者: chrisxu2014's avatar chrisxu2014

add audio part

上级 a84bdf64
此差异已折叠。
from __future__ import print_function
from collections import defaultdict
import bisect
import logging
import numpy as np
import os
import random
import sys
UNK_TAG = "<UNK>"


def stream_audio_index(fname, UNK=UNK_TAG):
    """Read an audio index file and yield one record at a time.

    Every line of the index is tab-separated and holds, in order: the
    audio id, the duration, the file size, the relative path and then
    zero or more tags.

    :param fname: audio index path
    :type fname: basestring
    :param UNK: tag assigned to records whose line carries no tags.
    :type UNK: basestring
    :raises AssertionError: if a line has fewer than four fields, a
        negative duration or a non-positive size. (These checks are
        `assert` statements, so they are skipped under ``python -O``.)

    Yields:
        idx, duration, size, relpath, tags (int, float, int, str, list(str)):
            audio file id, length of the audio in seconds, size in bytes,
            path relative to the root noise directory, list of tags
    """
    with open(fname) as audio_index_file:
        for line_no, line in enumerate(audio_index_file, 1):
            tok = line.strip().split("\t")
            # Report the path (not the file object) so the message is usable.
            assert len(tok) >= 4, \
                "Invalid line at line {} in file {}".format(line_no, fname)
            idx = int(tok[0])
            duration = float(tok[1])
            # Sometimes, the duration can round down to 0.0
            assert duration >= 0.0, \
                "Invalid duration at line {} in file {}".format(
                    line_no, fname)
            size = int(tok[2])
            assert size > 0, \
                "Invalid size at line {} in file {}".format(line_no, fname)
            relpath = tok[3]
            # Honour the caller-supplied placeholder instead of always
            # falling back to the module-level default.
            tags = tok[4:] if len(tok) > 4 else [UNK]
            yield idx, duration, size, relpath, tags
def truncate_float(val, ndigits=6):
    """Cut a floating-point value off after a number of decimal digits.

    The value is truncated toward zero, not rounded.

    :param val: input value.
    :type val: float
    :param ndigits: number of digits to keep after the decimal point.
    :type ndigits: int
    :return: truncated value
    :rtype: float
    """
    scale = 10.0**ndigits
    # int() discards the fractional part, i.e. everything past `ndigits`.
    kept = int(val * scale)
    return float(kept) / scale
def print_audio_index(idx, duration, size, relpath, tags, file=sys.stdout):
    """Write one audio record as a tab-separated line of the index file.

    The duration is truncated to six decimal digits before formatting.

    :param idx: Audio file id.
    :type idx: int
    :param duration: length of the audio in seconds
    :type duration: float
    :param size: size of the file in bytes
    :type size: int
    :param relpath: path relative to the root noise directory.
    :type relpath: basestring
    :param tags: list of tags, each written as an extra column
    :type tags: list(str)
    :param file: writable object that receives the record.
    :type file: file-like object (defaults to sys.stdout)
    """
    emit = file.write
    fixed_columns = "{}\t{:.6f}\t{}\t{}".format(
        idx, truncate_float(duration, ndigits=6), size, relpath)
    emit(fixed_columns)
    for label in tags:
        emit("\t{}".format(label))
    emit("\n")
class AudioIndex(object):
    """In-memory index of audio files that do not have annotations.

    This supports duration-based sampling and sampling from a target
    tag distribution.

    Each line in the index file consists of the following fields:
        (id (int), duration (float), size (int), relative path (str),
         list of tags ([str]))
    """

    def __init__(self):
        self.audio_dir = None
        # Must use the same attribute name that
        # `_load_all_records_from_disk` reads and writes.
        self.idx_fname = None
        self.tags = None
        self.bin_size = 2.0
        self.clear()

    def clear(self):
        """Drop every record and every derived lookup structure.

        The remembered source (`audio_dir`, `idx_fname`, `bin_size`) and
        the remembered tag list are left untouched.

        Returns:
            None
        """
        self.idx_to_record = {}
        # The set of indices corresponds to audio files whose duration is
        # greater than or equal to the key.
        self.duration_to_id_set = {}
        self.duration_to_id_set_per_tag = defaultdict(dict)
        self.duration_to_list = defaultdict(list)
        self.duration_to_list_per_tag = defaultdict(
            lambda: defaultdict(list))
        self.tag_to_id_set = defaultdict(set)
        self.shared_duration_bins = []
        self.id_set_complete = set()
        self.id_set = set()
        self.duration_bins = []
        self.min_duration = None

    def has_audio(self, distr=None):
        """Tell whether there is anything to sample from.

        :param distr: The target distribution of audio tags that we want to
            match. If this is not supplied, the function simply checks that
            there are some audio files.
        :type distr: dict
        :return: True if there are audio files; with `distr`, True only if
            every tag of `distr` is present in the per-tag index.
        :rtype: boolean
        """
        if distr is None:
            return len(self.id_set) > 0
        return all(tag in self.duration_to_list_per_tag for tag in distr)

    def _load_all_records_from_disk(self, audio_dir, idx_fname, bin_size):
        """Load all audio records from disk and bin them by duration.

        Once all the records are read, indices are built from these records
        by `_build_index_from_records` so that audio samples can be drawn
        efficiently.

        Updates:
            self.audio_dir (path): audio root directory
            self.idx_fname (path): audio database index filename
            self.bin_size (float): granularity of bins
            self.tags: reset to None so the tag-restricted index is rebuilt
            self.idx_to_record (dict): maps from the audio id to
                (duration, file_size, relative_path, tags)
            self.tag_to_id_set (dict): maps from the tag to
                the set of id's of audios that have this tag.
            self.id_set_complete (set): set of all audio id's in the index
            self.min_duration (float): minimum audio duration observed
            self.duration_bins (list): the lower bounds on the duration of
                audio files falling in each bin (ascending)
            self.duration_to_id_set (dict): contains (k, v) where v is the
                set of id's of audios whose lengths are longer than or
                equal to k (k is the duration lower bound of the bin).
            self.duration_to_id_set_per_tag (dict): maps a bin lower bound
                to a {tag: id set} dict with the same meaning, per tag.
            self.shared_duration_bins (list): list of sets where each set
                contains duration lower bounds whose audio id sets are the
                same object. A few extremely long files lead to many bins
                with an unchanged id set; those bins share one reference.

        :param audio_dir: audio root directory
        :param idx_fname: audio index file name
        :param bin_size: bin width; must be positive
        :raises ValueError: if `bin_size` is not positive
        :raises IOError: if the index file contains no record
        :return: whether the records were read from the disk. The
            assumption is that the index file and the audio files never
            change during training, so we only re-read when the directory,
            the index file path or the bin size changes.
        """
        if self.audio_dir == audio_dir and self.idx_fname == idx_fname and \
                self.bin_size == bin_size:
            # Nothing relevant has changed. No need to load the list again.
            return False
        if bin_size <= 0:
            # A non-positive step would make the bin-building loop below
            # spin forever.
            raise ValueError(
                "bin_size must be positive, got {}".format(bin_size))

        # Forget the previous source until the new one is fully loaded, so a
        # failed load is retried instead of being mistaken for a cache hit.
        self.clear()
        self.audio_dir = None
        self.idx_fname = None
        # The derived index was just wiped; force it to be rebuilt even if
        # the caller asks for the same tag list as before.
        self.tags = None

        # Read in the idx and compute the number of bins necessary
        rank = []
        min_duration = float('inf')
        max_duration = float('-inf')
        for idx, duration, file_size, relpath, tags in \
                stream_audio_index(idx_fname):
            self.idx_to_record[idx] = (duration, file_size, relpath, tags)
            max_duration = max(max_duration, duration)
            min_duration = min(min_duration, duration)
            rank.append((duration, idx))
            for tag in tags:
                self.tag_to_id_set[tag].add(idx)
        if len(rank) == 0:
            # file is empty
            raise IOError("Index file {} is empty".format(idx_fname))
        for tag in self.tag_to_id_set:
            self.id_set_complete |= self.tag_to_id_set[tag]
        dur = min_duration
        self.min_duration = min_duration
        while dur < max_duration + bin_size:
            self.duration_bins.append(dur)
            dur += bin_size

        # Sort in decreasing order of duration and populate
        # the cumulative indices lists.
        rank.sort(reverse=True)

        # These are indices for `rank` and used to keep track of whether
        # there are new records to add in the current bin.
        last = 0
        cur = 0

        # The set of audios falling in the previous bin; when no new audio
        # is found for the current bin, the reference to the last set is
        # stored again so as to conserve memory (some audios can be hours
        # long, which produces many bins with identical content).
        last_audio_set = set()

        # The same but for each tag so that we can pick audios based on
        # tags and also some user-specified tag distribution.
        last_audio_set_per_tag = defaultdict(set)

        # Set of bins sharing the same audio sets.
        shared = set()

        # Walk the bins from the largest lower bound to the smallest.
        for i in range(len(self.duration_bins) - 1, -1, -1):
            lower_bound = self.duration_bins[i]
            new_audio_idxs = set()
            new_audio_idxs_per_tag = defaultdict(set)
            while cur < len(rank) and rank[cur][0] >= lower_bound:
                idx = rank[cur][1]
                tags = self.idx_to_record[idx][3]
                new_audio_idxs.add(idx)
                for tag in tags:
                    new_audio_idxs_per_tag[tag].add(idx)
                cur += 1

            # This makes certain that the same set is shared across
            # different bins if no new indices are added.
            if cur == last:
                shared.add(lower_bound)
            else:
                # `|` builds new set objects, so bins recorded earlier keep
                # their own (smaller) sets.
                last_audio_set = last_audio_set | new_audio_idxs
                for tag in new_audio_idxs_per_tag:
                    last_audio_set_per_tag[tag] = \
                        last_audio_set_per_tag[tag] | \
                        new_audio_idxs_per_tag[tag]
                if len(shared) > 0:
                    self.shared_duration_bins.append(shared)
                shared = set([lower_bound])
                last = cur
            self.duration_to_id_set[lower_bound] = last_audio_set
            for tag in last_audio_set_per_tag:
                self.duration_to_id_set_per_tag[lower_bound][tag] = \
                    last_audio_set_per_tag[tag]

        # The last `shared` record isn't added to the `shared_duration_bins`.
        self.shared_duration_bins.append(shared)

        # The smallest bin starts at the minimum duration, so the loop above
        # must have consumed every entry of `rank`.
        assert cur == len(rank)

        # Remember where the audio index was most recently read from.
        self.audio_dir = audio_dir
        self.idx_fname = idx_fname
        self.bin_size = bin_size
        return True

    def _build_index_from_records(self, tag_list):
        """Build the sampling index restricted to the given tag list.

        :param tag_list: List of tags we are interested in sampling from;
            an empty list selects every audio.
        :type tag_list: list(str)

        Updates:
            self.tags (list): copy of the tag list the index was built for
            self.id_set (set): the set of all audio id's that can be sampled.
            self.duration_to_list (dict): maps from the duration lower bound
                to the sorted id's of audios longer than this duration.
            self.duration_to_list_per_tag (dict): maps from the tag to
                the same structure as self.duration_to_list. This is to
                support sampling from a target noise distribution. Note that
                this per-tag view is not restricted by `tag_list`.

        :return: whether the index was built from scratch
        """
        # Work on a copy so later in-place edits of the caller's list
        # cannot make the stored value compare equal by accident.
        requested = list(tag_list)
        if self.tags == requested:
            return False
        self.tags = requested

        if len(requested) == 0:
            self.id_set = self.id_set_complete
        else:
            self.id_set = set()
            for tag in requested:
                # `.get` avoids inserting unknown tags into the defaultdict.
                self.id_set |= self.tag_to_id_set.get(tag, set())

        # Start from empty lookups so nothing stale survives a rebuild.
        self.duration_to_list = defaultdict(list)
        self.duration_to_list_per_tag = defaultdict(
            lambda: defaultdict(list))

        # Next, we need to take a subset of the audio files
        for shared in self.shared_duration_bins:
            # All bins in `shared' have the same index sets
            # so we can intersect once and reuse the result for all of them.
            lb = next(iter(shared))
            # Sorted lists: `random.sample` needs a sequence (sets are
            # rejected on Python 3.11+) and a fixed order keeps seeded
            # sampling reproducible.
            intersected = sorted(self.id_set & self.duration_to_id_set[lb])
            ids_per_tag = self.duration_to_id_set_per_tag.get(lb, {})
            intersected_per_tag = {
                tag: sorted(self.tag_to_id_set[tag] & ids)
                for tag, ids in ids_per_tag.items()
            }
            for bin_key in shared:
                self.duration_to_list[bin_key] = intersected
                for tag in intersected_per_tag:
                    self.duration_to_list_per_tag[tag][bin_key] = \
                        intersected_per_tag[tag]
        assert len(self.duration_to_list) == len(self.duration_to_id_set)
        return True

    def refresh_records_from_index_file(self,
                                        audio_dir,
                                        idx_fname,
                                        tag_list,
                                        bin_size=2.0):
        """Load the index file and populate the records for the index.

        If the audio directory, index file name or bin size has changed, the
        whole index is reloaded from scratch. If only the tag_list is
        changed, then the desired index is built from the complete,
        in-memory record.

        :param audio_dir: audio directory
        :type audio_dir: basestring
        :param idx_fname: audio index file name
        :type idx_fname: basestring
        :param tag_list: list of tags we are interested in loading;
                         if empty or None, we load all.
        :type tag_list: list
        :param bin_size: optional argument for controlling the granularity
                         of duration bins
        :type bin_size: float
        """
        if tag_list is None:
            tag_list = []
        reloaded_records = self._load_all_records_from_disk(
            audio_dir, idx_fname, bin_size)
        if reloaded_records or self.tags != list(tag_list):
            self._build_index_from_records(tag_list)
            # This module defines no module-level logger, so fetch one here.
            logging.getLogger(__name__).info(
                'loaded {} audio files from {}'.format(
                    len(self.id_set), idx_fname))

    def sample_audio(self, duration, rng=None, distr=None):
        """Uniformly draw an audio record of at least the desired duration.

        :param duration: minimum desired audio duration; a negative value
            means "any duration" (the minimum observed duration is used).
        :type duration: float
        :param rng: random number generator; a fresh `random.Random()` is
            used when omitted.
        :type rng: random.Random
        :param distr: target distribution of audio tags, mapping tag to an
            (unnormalized) probability mass. If not provided, all audio
            files are sampled uniformly at random.
        :type distr: dict
        :returns: (success, record) where record is
            (duration, file_size, path) on success and None otherwise.
        """
        if not self.duration_bins:
            # Nothing has been loaded yet.
            return False, None
        if duration < 0.0:
            duration = self.min_duration
        # First bin whose lower bound is >= the requested duration; every
        # audio listed in it is therefore long enough.
        i = bisect.bisect_left(self.duration_bins, duration)
        if i == len(self.duration_bins):
            return False, None
        bin_key = self.duration_bins[i]
        if rng is None:
            rng = random.Random()
        if distr is None:
            indices = self.duration_to_list[bin_key]
        else:
            # If a desired audio distribution is given, we sample a tag
            # from it first.
            nprng = np.random.RandomState(rng.getrandbits(32))
            tag_names = list(distr.keys())
            prob_masses = np.asarray(
                [distr[name] for name in tag_names], dtype=float)
            prob_masses = prob_masses / np.sum(prob_masses)
            # Draw a position rather than the tag itself so the dict key
            # keeps its original Python type.
            tag = tag_names[int(nprng.choice(len(tag_names), p=prob_masses))]
            # Read-only lookups: do not let the defaultdicts grow entries
            # for tags/bins that have no audio.
            per_bin = self.duration_to_list_per_tag.get(tag)
            indices = per_bin.get(bin_key, []) if per_bin is not None else []
        if len(indices) == 0:
            return False, None
        # duration, file size and relative path from root
        record = self.idx_to_record[rng.sample(indices, 1)[0]]
        return True, (record[0], record[1],
                      os.path.join(self.audio_dir, record[2]))
......@@ -6,11 +6,6 @@ from __future__ import print_function
import json
import random
from data_utils.augmentor.volume_perturb import VolumePerturbAugmentor
from data_utils.augmentor.resamler import ResamplerAugmentor
from data_utils.augmentor.speed_perturb import SpeedPerturbatioAugmentor
from data_utils.augmentor.online_bayesian_normalization import OnlineBayesianNormalizationAugmentor
from data_utils.augmentor.Impulse_response import ImpulseResponseAugmentor
from data_utils.augmentor.noise_speech import NoiseSpeechAugmentor
class AugmentationPipeline(object):
......@@ -81,15 +76,5 @@ class AugmentationPipeline(object):
"""Return an augmentation model by the type name, and pass in params."""
if augmentor_type == "volume":
return VolumePerturbAugmentor(self._rng, **params)
if augmentor_type == "resamle":
return ResamplerAugmentor(self._rng, **params)
if augmentor_type == "speed":
return SpeedPerturbatioAugmentor(self._rng, **params)
if augmentor_type == "online_bayesian_normalization":
return OnlineBayesianNormalizationAugmentor(self._rng, **params)
if augmentor_type == "Impulse_response":
return ImpulseResponseAugmentor(self._rng, **params)
if augmentor_type == "noise_speech":
return NoiseSpeechAugmentor(self._rng, **params)
else:
raise ValueError("Unknown augmentor type [%s]." % augmentor_type)
""" Impulse response"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from . import base
from . import audio_database
from data_utils.speech import SpeechSegment
class ImpulseResponseAugmentor(base.AugmentorBase):
    """Augmentor that convolves audio with a sampled impulse response.

    :param rng: random generator used for sampling.
    :type rng: random.Random
    :param ir_dir: directory containing impulse responses
    :type ir_dir: basestring
    :param index_file: index file handed to the audio index together
                       with `ir_dir`
    :type index_file: basestring
    :param tags: optional parameter for specifying what
                 particular impulse responses to apply.
    :type tags: list
    :param tag_distr: optional impulse response tag distribution
    :type tag_distr: dict
    """

    def __init__(self, rng, ir_dir, index_file, tags=None, tag_distr=None):
        self.rng = rng
        # Where the impulse responses live and which of them to use.
        self.ir_dir = ir_dir
        self.index_file = index_file
        self.tags = tags
        self.tag_distr = tag_distr
        self.audio_index = audio_database.AudioIndex()

    def _init_data(self):
        """Ask the audio index to refresh itself from the current
        directory, index file and tags. The index decides on its own
        whether anything has to be re-read.
        """
        index = self.audio_index
        index.refresh_records_from_index_file(self.ir_dir, self.index_file,
                                              self.tags)

    def transform_audio(self, audio_segment):
        """Convolve the input audio with a sampled impulse response.

        :param audio_segment: input audio, modified through its
                              `convolve` method
        :type audio_segment: AudioSegment
        :raises RuntimeError: if the index has nothing to sample from for
                              the configured tags / tag distribution.
        """
        # This handles the cases where the data source or directories change.
        self._init_data()
        distr = self.tag_distr
        if not self.audio_index.has_audio(distr):
            if distr is not None:
                raise RuntimeError("The ir index does not have audio "
                                   "files to match the target ir "
                                   "distribution.")
            if self.tags:
                raise RuntimeError("The ir index does not have audio "
                                   "files of the given tags to sample "
                                   "from.")
            raise RuntimeError("The ir index does not have audio "
                               "files to sample from.")

        # Querying with a negative duration triggers the index to search
        # from all impulse responses.
        success, record = self.audio_index.sample_audio(
            -1.0, rng=self.rng, distr=distr)
        if success is not True:
            # Nothing could be sampled; leave the audio untouched.
            return
        ir_fname = record[2]
        ir_wav = SpeechSegment.from_file(ir_fname)
        audio_segment.convolve(ir_wav, allow_resampling=True)
""" noise speech
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import bisect
import logging
import os
from collections import defaultdict

import numpy as np

from data_utils.speech import SpeechSegment

from . import audio_database
from . import base
# Value of `source` that selects the turk-specific noise code path.
TURK = "turk"
# Non-empty `source` values accepted for the AudioIndex-backed code path.
USE_AUDIO_DATABASE_SOURCES = frozenset(["freesound", "chime"])
# While searching for a noise clip, a requested minimum duration above this
# value is decreased by 2.0 per failed attempt; otherwise it is halved.
HALF_NOISE_LENGTH_MIN_THRESHOLD = 3.0
# Maximum number of sampling attempts made when searching for a noise clip.
FIND_NOISE_MAX_ATTEMPTS = 20
# Module-level logger.
logger = logging.getLogger(__name__)
def get_first_smaller(items, value):
    """Return the rightmost element of `items` strictly smaller than `value`.

    :param items: values sorted in ascending order
    :type items: list
    :param value: value to compare against
    :return: the largest item that is smaller than `value`
    :raises AssertionError: if no item is smaller than `value` (including
        when `items` is empty). Raised explicitly, rather than through an
        `assert` statement, so the check also runs under ``python -O``.
    """
    index = bisect.bisect_left(items, value) - 1
    if index < 0:
        # Without this guard, index -1 would silently wrap around to the
        # last element.
        raise AssertionError(
            'get_first_smaller failed! no item is smaller than %s' % (value, ))
    if not items[index] < value:
        # Only reachable when `items` is not sorted.
        raise AssertionError(
            'get_first_smaller failed! %d %d' % (items[index], value))
    return items[index]
def get_first_larger(items, value):
    """Return the leftmost element of `items` strictly greater than `value`.

    :param items: values sorted in ascending order
    :type items: list
    :param value: value to compare against
    :return: the smallest item that is greater than `value`
    :raises AssertionError: if no item is greater than `value`. Raised
        explicitly, rather than through an `assert` statement, so the
        check also runs under ``python -O`` instead of degrading into an
        IndexError.
    """
    index = bisect.bisect_right(items, value)
    if index >= len(items):
        raise AssertionError(
            "no noise bin exists for this audio length (%f)" % value)
    if not items[index] > value:
        # Only reachable when `items` is not sorted.
        raise AssertionError(
            'get_first_larger failed! %d %d' % (items[index], value))
    return items[index]
def _get_turk_noise_files(noise_dir, index_file):
    """Create a map from duration bin => list of noise filenames.

    :param noise_dir: Directory of noise files; every file name read from
                      the index is joined onto it.
    :type noise_dir: basestring
    :param index_file: Noise list. One whitespace-separated line per noise
                       (wav) file: the file name followed by its duration
                       in milliseconds. Blank lines are ignored.
    :type index_file: basestring
    :return: A map of bins to noise files. Each key is a bin lower bound in
             seconds (multiples of 2 from 0 to 64) and the value is the
             list of noise files whose duration falls just above it. The
             map is empty when `index_file` does not exist.
    :rtype: defaultdict(list)
    :raises AssertionError: propagated from `get_first_smaller` when a
                            duration has no smaller bin (e.g. duration 0).
    """
    noise_files = defaultdict(list)
    if not os.path.exists(index_file):
        logger.error('No noise files were found at {}'.format(index_file))
        return noise_files
    num_noise_files = 0
    rounded_durations = list(range(0, 65, 2))
    with open(index_file, 'r') as fl:
        for line in fl:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            fname = os.path.join(noise_dir, fields[0])
            # The index stores milliseconds; bins are expressed in seconds.
            duration = float(fields[1]) / 1000
            # bin the noise files into length bins rounded by 2 sec
            bin_id = get_first_smaller(rounded_durations, duration)
            noise_files[bin_id].append(fname)
            num_noise_files += 1
    logger.info('Loaded {} turk noise files'.format(num_noise_files))
    return noise_files
class NoiseSpeechAugmentor(base.AugmentorBase):
    """Noise addition block.

    :param rng: random generator used for every sampling decision.
    :type rng: random.Random
    :param snr_min: minimum signal-to-noise ratio
    :type snr_min: float
    :param snr_max: maximum signal-to-noise ratio
    :type snr_max: float
    :param noise_dir: root of where noise files are stored
    :type noise_dir: basestring
    :param source: select one from
                   - turk
                   - freesound
                   - chime
                   An empty string is also accepted and uses the
                   audio-index code path.
    :type source: string
    :param allow_downsampling: forwarded to `add_noise` of the audio
                               segment.
    :type allow_downsampling: boolean
    :param index_file: index of noises of interest in noise_dir; a default
                       inside `noise_dir` is used when omitted.
    :type index_file: basestring
    :param tags: optional parameter for specifying what
                 particular noises we want to add.
    :type tags: list
    :param tag_distr: optional noise distribution
    :type tag_distr: dict
    """

    def __init__(self,
                 rng,
                 snr_min,
                 snr_max,
                 noise_dir,
                 source,
                 allow_downsampling=None,
                 index_file=None,
                 tags=None,
                 tag_distr=None):
        # Define all required parameter maps here.
        self.rng = rng
        self.snr_min = snr_min
        self.snr_max = snr_max
        self.noise_dir = noise_dir
        self.source = source
        self.allow_downsampling = allow_downsampling
        self.index_file = index_file
        self.tags = tags
        self.tag_distr = tag_distr

        # When new noise sources are added, make sure to define the
        # associated bookkeeping variables here.
        # Maps a duration bin to a list of file names (see
        # `_get_turk_noise_files`), hence a dict rather than a list.
        self.turk_noise_files = {}
        self.turk_noise_dir = None
        self.audio_index = audio_database.AudioIndex()

    def _init_data(self):
        """Preload file lists from disk to make later sampling faster.

        For the turk source the list is re-read only when `noise_dir`
        changes; for the other sources the audio index decides by itself
        whether a reload is needed.
        """
        noise_dir = self.noise_dir
        index_file = self.index_file
        source = self.source

        if not index_file:
            if source == TURK:
                index_file = os.path.join(noise_dir, 'noise-samples-list')
                logger.debug("index_file not provided; " + "defaulting to " +
                             index_file)
            else:
                if source != "":
                    assert source in USE_AUDIO_DATABASE_SOURCES, \
                        "{} not supported by audio_database".format(source)
                index_file = os.path.join(noise_dir,
                                          "audio_index_commercial.txt")
                logger.debug("index_file not provided; " + "defaulting to " +
                             index_file)

        if source == TURK:
            if self.turk_noise_dir != noise_dir:
                self.turk_noise_dir = noise_dir
                self.turk_noise_files = _get_turk_noise_files(noise_dir,
                                                              index_file)
        # elif source == TODO_SUPPORT_NON_AUDIO_DATABASE_BASED_SOURCES:
        else:
            if source != "":
                assert source in USE_AUDIO_DATABASE_SOURCES, \
                    "{} not supported by audio_database".format(source)
            self.audio_index.refresh_records_from_index_file(
                self.noise_dir, index_file, self.tags)

    def transform_audio(self, audio_segment):
        """Add noise from the configured source to the audio.

        :param audio_segment: Input audio, modified through its
                              `add_noise` method
        :type audio_segment: SpeechSegment
        """
        # This handles the cases where the data source or directories
        # change. (It must actually be called, not merely referenced.)
        self._init_data()
        allow_downsampling = self.allow_downsampling
        if self.source == TURK:
            self._add_turk_noise(audio_segment, allow_downsampling)
        # elif source == TODO_SUPPORT_NON_AUDIO_DATABASE_BASED_SOURCES:
        else:
            self._add_noise(audio_segment, allow_downsampling)

    def _sample_snr(self):
        """Return a float drawn uniformly between `self.snr_min` and
        `self.snr_max` using `self.rng`.
        """
        return self.rng.uniform(self.snr_min, self.snr_max)

    def _add_turk_noise(self, audio_segment, allow_downsampling):
        """Add a turk noise to the input audio.

        Does nothing when no turk noise files are loaded.

        :param audio_segment: input audio
        :type audio_segment: SpeechSegment
        :param allow_downsampling: indicates whether downsampling
                                   is allowed
        :type allow_downsampling: boolean
        :raises AssertionError: propagated from `get_first_larger` when no
                                noise bin is longer than the audio.
        """
        if len(self.turk_noise_files) == 0:
            return
        snr = self._sample_snr()
        # Draw the noise file randomly from noise files that are
        # slightly longer than the utterance
        noise_bins = sorted(self.turk_noise_files.keys())
        # note some bins can be empty, so we can't just round up
        # to the nearest 2-sec interval
        rounded_duration = get_first_larger(noise_bins,
                                            audio_segment.duration)
        noise_fname = \
            self.rng.sample(self.turk_noise_files[rounded_duration], 1)[0]
        # NOTE(review): `from_file` is what the other loaders in this
        # package call; the previous `from_wav_file` appears nowhere else.
        # Confirm against SpeechSegment.
        noise = SpeechSegment.from_file(noise_fname)
        logger.debug('noise_fname {}'.format(noise_fname))
        logger.debug('snr {}'.format(snr))
        # May throw exceptions; presumably handled by the caller.
        audio_segment.add_noise(
            noise, snr, rng=self.rng, allow_downsampling=allow_downsampling)

    def _add_noise(self, audio_segment, allow_downsampling):
        """Add a noise indexed in audio_database.AudioIndex.

        A noise clip slightly longer than the audio is searched for; when
        none is found the required length is relaxed step by step, and a
        shorter clip is padded with silence. If no clip can be sampled at
        all, the audio is left untouched.

        :param audio_segment: input audio
        :type audio_segment: SpeechSegment
        :param allow_downsampling: indicates whether downsampling
                                   is allowed
        :type allow_downsampling: boolean
        :raises RuntimeError: if the noise index has nothing to sample from
                              for the configured tags / tag distribution.
        """
        tag_distr = self.tag_distr
        if not self.audio_index.has_audio(tag_distr):
            if tag_distr is not None:
                raise RuntimeError("The noise index does not have audio "
                                   "files to match the target noise "
                                   "distribution.")
            if self.tags:
                raise RuntimeError("The noise index does not have audio "
                                   "files of the given tags to sample "
                                   "from.")
            raise RuntimeError("The noise index does not have audio "
                               "files to sample from.")

        # Compute audio segment related statistics
        audio_duration = audio_segment.duration

        # Sample relevant augmentation parameters.
        snr = self._sample_snr()

        # Perhaps, we may not have a sufficiently long noise, so we need
        # to search iteratively.
        min_duration = audio_duration + 0.25
        success = False
        for _ in range(FIND_NOISE_MAX_ATTEMPTS):
            logger.debug("attempting to find noise of length "
                         "at least {}".format(min_duration))

            success, record = \
                self.audio_index.sample_audio(min_duration,
                                              rng=self.rng,
                                              distr=tag_distr)

            if success is True:
                noise_duration, read_size, noise_fname = record

                # Assert after logging so we know
                # what caused augmentation to fail.
                logger.debug("noise_fname {}".format(noise_fname))
                logger.debug("snr {}".format(snr))
                assert noise_duration >= min_duration
                break

            # Decrease the desired minimum duration linearly.
            # If the value becomes smaller than some threshold,
            # we half the value instead.
            if min_duration > HALF_NOISE_LENGTH_MIN_THRESHOLD:
                min_duration -= 2.0
            else:
                min_duration *= 0.5

        if not success:
            logger.info("Failed to find a noise file")
            return

        diff_duration = audio_duration + 0.25 - noise_duration
        if diff_duration >= 0.0:
            # Here, the noise is shorter than the audio file, so
            # we pad with zeros to make sure the noise sound is applied
            # with a uniformly random shift.
            noise = SpeechSegment.from_file(noise_fname)
            noise = noise.pad_silence(diff_duration, sides="both")
        else:
            # The noise clip is longer than the audio segment plus the
            # 0.25 s margin here, so cut a random window out of it.
            sample_rate = audio_segment.sample_rate
            diff_duration = int(noise_duration * sample_rate) - \
                int(audio_duration * sample_rate) - \
                int(0.02 * sample_rate)
            start = float(self.rng.randint(0, diff_duration)) / sample_rate
            finish = min(start + audio_duration + 0.2, noise_duration)
            noise = SpeechSegment.slice_from_file(noise_fname, start, finish)

        if len(noise) < len(audio_segment):
            # This is to ensure that the noise clip is at least as
            # long as the audio segment.
            num_samples_to_pad = len(audio_segment) - len(noise)
            # Padding this amount of silence on both ends ensures that
            # the placement of the noise clip is uniformly random.
            silence = SpeechSegment(
                np.zeros(num_samples_to_pad), audio_segment.sample_rate)
            noise = SpeechSegment.concatenate(silence, noise, silence)

        audio_segment.add_noise(
            noise, snr, rng=self.rng, allow_downsampling=allow_downsampling)
""" Online bayesian normalization
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from . import base
class OnlineBayesianNormalizationAugmentor(base.AugmentorBase):
    """Augmentor applying online Bayesian normalization.

    Each of the four settings may be either a plain scalar or a callable
    taking the iteration number and returning a scalar.

    :param rng: Random generator object (stored, not used by this class).
    :type rng: random.Random
    :param target_db: Target RMS value in decibels
    :type target_db: scalar|func[int->scalar]
    :param prior_db: Prior RMS estimate in decibels
    :type prior_db: scalar|func[int->scalar]
    :param prior_samples: Prior strength in number of samples
    :type prior_samples: scalar|func[int->scalar]
    :param startup_delay: Start-up delay in seconds during
                          which normalization statistics is accrued.
    :type startup_delay: scalar|func[int->scalar]
    """

    def __init__(self,
                 rng,
                 target_db,
                 prior_db,
                 prior_samples,
                 startup_delay=base.parse_parameter_from(0.0)):
        self.target_db = target_db
        self.prior_db = prior_db
        self.prior_samples = prior_samples
        # NOTE(review): `base.parse_parameter_from` presumably wraps the
        # constant into an iteration->value callable; confirm in `base`.
        self.startup_delay = startup_delay
        self.rng = rng

    @staticmethod
    def _resolve(setting, iteration):
        """Evaluate a setting that is either a scalar or a schedule."""
        return setting(iteration) if callable(setting) else setting

    def transform_audio(self, audio_segment, iteration=0):
        """Normalize the input audio using the online Bayesian approach.

        :param audio_segment: input audio, modified through its
                              `normalize_online_bayesian` method
        :type audio_segment: SpeechSegment
        :param iteration: iteration number handed to settings that are
                          callables; optional so the one-argument call
                          used by other augmentors keeps working.
        :type iteration: int
        """
        target_db = self._resolve(self.target_db, iteration)
        prior_db = self._resolve(self.prior_db, iteration)
        prior_samples = self._resolve(self.prior_samples, iteration)
        startup_delay = self._resolve(self.startup_delay, iteration)
        audio_segment.normalize_online_bayesian(
            target_db, prior_db, prior_samples, startup_delay=startup_delay)
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from . import base
class ResamplerAugmentor(base.AugmentorBase):
    """Augmentor that resamples audio to a fixed sample rate.

    :param rng: Random generator object (stored, not used by this class).
    :type rng: random.Random
    :param new_sample_rate: New sample rate in Hz; passed unchanged to
                            the segment's `resample` method.
    :type new_sample_rate: scalar
    """

    def __init__(self, rng, new_sample_rate):
        self.new_sample_rate = new_sample_rate
        self._rng = rng

    def transform_audio(self, audio_segment):
        """Resample the input audio to the target sample rate.

        Note that this is an in-place transformation.

        :param audio_segment: input audio
        :type audio_segment: SpeechSegment
        """
        # Operate on the argument; the previously used name `audio` was
        # never defined.
        audio_segment.resample(self.new_sample_rate)
\ No newline at end of file
"""Speed perturbation module for making ASR robust to different voice
types (high pitched, low pitched, etc)
Samples uniformly between speed_min and speed_max
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from . import base
class SpeedPerturbatioAugmentor(base.AugmentorBase):
    """Augmentor that randomly changes the speed of audio.

    See reference paper here:
    http://www.danielpovey.com/files/2015_interspeech_augmentation.pdf

    :param rng: Random generator object used to sample the speed rate.
    :type rng: random.Random
    :param speed_min: Lower bound on new rate to sample; must be >= 0.9
    :type speed_min: float
    :param speed_max: Upper bound on new rate to sample; must be <= 1.1
    :type speed_max: float
    :raises ValueError: if a bound lies outside [0.9, 1.1].
    """

    def __init__(self, rng, speed_min, speed_max):
        if speed_min < 0.9:
            raise ValueError(
                "Sampling speed below 0.9 can cause unnatural effects")
        # The upper limit applies to the upper bound, not to `speed_min`.
        if speed_max > 1.1:
            raise ValueError(
                "Sampling speed above 1.1 can cause unnatural effects")
        self.speed_min = speed_min
        self.speed_max = speed_max
        self.rng = rng

    def transform_audio(self, audio_segment):
        """Sample a speed rate from the configured range and change the
        speed of the given audio clip.

        Note that this is an in-place transformation.

        :param audio_segment: input audio
        :type audio_segment: SpeechSegment
        """
        # The bounds are plain numbers (they are compared against floats in
        # __init__), so they are used directly rather than called.
        sampled_speed = self.rng.uniform(self.speed_min, self.speed_max)
        audio_segment.change_speed(sampled_speed)
......@@ -3,10 +3,10 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from . import base
from data_utils.augmentor.base import AugmentorBase
class VolumePerturbAugmentor(base.AugmentorBase):
class VolumePerturbAugmentor(AugmentorBase):
"""Augmentation model for adding random volume perturbation.
This is used for multi-loudness training of PCEN. See
......
SoundFile==0.9.0.post1
wget==3.2
scikits.samplerate==0.3.3
scipy==0.13.0b1
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册