Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
models
提交
42ba74ef
M
models
项目概览
PaddlePaddle
/
models
大约 1 年 前同步成功
通知
222
Star
6828
Fork
2962
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
602
列表
看板
标记
里程碑
合并请求
255
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
M
models
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
602
Issue
602
列表
看板
标记
里程碑
合并请求
255
合并请求
255
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
42ba74ef
编写于
6月 16, 2017
作者:
chrisxu2014
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add audio part
上级
a84bdf64
变更
10
显示空白变更内容
内联
并排
Showing
10 changed file
with
215 addition
and
1198 deletion
+215
-1198
deep_speech_2/data_utils/audio.py
deep_speech_2/data_utils/audio.py
+211
-246
deep_speech_2/data_utils/augmentor/audio_database.py
deep_speech_2/data_utils/augmentor/audio_database.py
+0
-401
deep_speech_2/data_utils/augmentor/augmentation.py
deep_speech_2/data_utils/augmentor/augmentation.py
+0
-15
deep_speech_2/data_utils/augmentor/implus_response.py
deep_speech_2/data_utils/augmentor/implus_response.py
+0
-76
deep_speech_2/data_utils/augmentor/noise_speech.py
deep_speech_2/data_utils/augmentor/noise_speech.py
+0
-318
deep_speech_2/data_utils/augmentor/online_bayesian_normalization.py
...h_2/data_utils/augmentor/online_bayesian_normalization.py
+0
-57
deep_speech_2/data_utils/augmentor/resampler.py
deep_speech_2/data_utils/augmentor/resampler.py
+0
-30
deep_speech_2/data_utils/augmentor/speed_perturb.py
deep_speech_2/data_utils/augmentor/speed_perturb.py
+0
-53
deep_speech_2/data_utils/augmentor/volume_perturb.py
deep_speech_2/data_utils/augmentor/volume_perturb.py
+2
-2
deep_speech_2/requirements.txt
deep_speech_2/requirements.txt
+2
-0
未找到文件。
deep_speech_2/data_utils/audio.py
浏览文件 @
42ba74ef
...
@@ -8,6 +8,7 @@ import io
...
@@ -8,6 +8,7 @@ import io
import
soundfile
import
soundfile
import
scikits.samplerate
import
scikits.samplerate
from
scipy
import
signal
from
scipy
import
signal
import
random
class
AudioSegment
(
object
):
class
AudioSegment
(
object
):
...
@@ -46,6 +47,32 @@ class AudioSegment(object):
...
@@ -46,6 +47,32 @@ class AudioSegment(object):
"""Return whether two objects are unequal."""
"""Return whether two objects are unequal."""
return
not
self
.
__eq__
(
other
)
return
not
self
.
__eq__
(
other
)
def
__len__
(
self
):
"""Returns length of segment in samples."""
return
self
.
num_samples
def
__add__
(
self
,
other
):
"""Add samples from another segment to those of this segment and return
a new segment (sample-wise addition, not segment concatenation).
:param other: Segment containing samples to be
added in.
:type other: AudioSegment
:return: New segment containing resulting samples.
:rtype: AudioSegment
:raise TypeError: If sample rates of segments don't match,
or if length of segments don't match.
"""
if
type
(
self
)
!=
type
(
other
):
raise
TypeError
(
"Cannot add segment of different type: {}"
.
format
(
type
(
other
)))
if
self
.
_sample_rate
!=
other
.
_sample_rate
:
raise
TypeError
(
"Sample rates must match to add segments."
)
if
len
(
self
.
_samples
)
!=
len
(
other
.
_samples
):
raise
TypeError
(
"Segment lengths must match to add segments."
)
samples
=
self
.
samples
+
other
.
samples
return
type
(
self
)(
samples
,
sample_rate
=
self
.
_sample_rate
)
def
__str__
(
self
):
def
__str__
(
self
):
"""Return human-readable representation of segment."""
"""Return human-readable representation of segment."""
return
(
"%s: num_samples=%d, sample_rate=%d, duration=%.2fsec, "
return
(
"%s: num_samples=%d, sample_rate=%d, duration=%.2fsec, "
...
@@ -64,69 +91,6 @@ class AudioSegment(object):
...
@@ -64,69 +91,6 @@ class AudioSegment(object):
samples
,
sample_rate
=
soundfile
.
read
(
file
,
dtype
=
'float32'
)
samples
,
sample_rate
=
soundfile
.
read
(
file
,
dtype
=
'float32'
)
return
cls
(
samples
,
sample_rate
)
return
cls
(
samples
,
sample_rate
)
@
classmethod
def
slice_from_file
(
cls
,
fname
,
start
=
None
,
end
=
None
):
"""
Loads a small section of an audio without having to load
the entire file into the memory which can be incredibly wasteful.
:param fname: input audio file name
:type fname: basestring
:param start: start time in seconds (supported granularity is ms)
If start is negative, it wraps around from the end. If not
provided, this function reads from the very beginning.
:type start: float
:param end: end time in seconds (supported granularity is ms)
If end is negative, it wraps around from the end. If not
provided, the default behavior is to read to the end of the
file.
:type end: float
:return:the specified slice of input audio in the audio.AudioSegment
format.
"""
sndfile
=
soundfile
.
SoundFile
(
fname
)
sample_rate
=
sndfile
.
samplerate
if
sndfile
.
channels
!=
1
:
raise
TypeError
(
"{} has more than 1 channel."
.
format
(
fname
))
duration
=
float
(
len
(
sndfile
))
/
sample_rate
if
start
is
None
:
start
=
0.0
if
end
is
None
:
end
=
duration
if
start
<
0.0
:
start
+=
duration
if
end
<
0.0
:
end
+=
duration
if
start
<
0.0
:
raise
IndexError
(
"The slice start position ({} s) is out of "
"bounds. Filename: {}"
.
format
(
start
,
fname
))
if
end
<
0.0
:
raise
IndexError
(
"The slice end position ({} s) is out of bounds "
"Filename: {}"
.
format
(
end
,
fname
))
if
start
>
end
:
raise
IndexError
(
"The slice start position ({} s) is later than "
"the slice end position ({} s)."
.
format
(
start
,
end
))
if
end
>
duration
:
raise
ValueError
(
"The slice end time ({} s) is out of "
"bounds (> {} s) Filename: {}"
.
format
(
end
,
duration
,
fname
))
start_frame
=
int
(
start
*
sample_rate
)
end_frame
=
int
(
end
*
sample_rate
)
sndfile
.
seek
(
start_frame
)
data
=
sndfile
.
read
(
frames
=
end_frame
-
start_frame
,
dtype
=
'float32'
)
return
cls
(
data
,
sample_rate
)
@
classmethod
@
classmethod
def
from_bytes
(
cls
,
bytes
):
def
from_bytes
(
cls
,
bytes
):
"""Create audio segment from a byte string containing audio samples.
"""Create audio segment from a byte string containing audio samples.
...
@@ -140,43 +104,30 @@ class AudioSegment(object):
...
@@ -140,43 +104,30 @@ class AudioSegment(object):
io
.
BytesIO
(
bytes
),
dtype
=
'float32'
)
io
.
BytesIO
(
bytes
),
dtype
=
'float32'
)
return
cls
(
samples
,
sample_rate
)
return
cls
(
samples
,
sample_rate
)
@
classmethod
def
concatenate
(
self
,
*
segments
):
def
make_silence
(
cls
,
duration
,
sample_rate
):
"""Creates a silent audio segment of the given duration and
sample rate.
:param duration: length of silence in seconds
:type duration: scalar
:param sample_rate: sample rate
:type sample_rate: scalar
:returns: silence of the given duration
:rtype: AudioSegment
"""
samples
=
np
.
zeros
(
int
(
float
(
duration
)
*
sample_rate
))
return
cls
(
samples
,
sample_rate
)
@
classmethod
def
concatenate
(
cls
,
*
segments
):
"""Concatenate an arbitrary number of audio segments together.
"""Concatenate an arbitrary number of audio segments together.
:param *segments: input audio segments
:param *segments: Input audio segments
:type *segments: [AudioSegment]
:type *segments: AudioSegment
:return: Audio segment instance.
:rtype: AudioSegment
:raises ValueError: If number of segments is zero, or if sample_rate
does not match between two audio segments
:raises TypeError: If any item of segments is not an AudioSegment instance
"""
"""
# Perform basic sanity-checks.
# Perform basic sanity-checks.
N
=
len
(
segments
)
if
len
(
segments
)
==
0
:
if
N
==
0
:
raise
ValueError
(
"No audio segments are given to concatenate."
)
raise
ValueError
(
"No audio segments are given to concatenate."
)
sample_rate
=
segments
[
0
].
_sample_rate
sample_rate
=
segments
[
0
].
_sample_rate
for
seg
ment
in
segments
:
for
seg
in
segments
:
if
sample_rate
!=
seg
ment
.
_sample_rate
:
if
sample_rate
!=
seg
.
_sample_rate
:
raise
ValueError
(
"Can't concatenate segments with "
raise
ValueError
(
"Can't concatenate segments with "
"different sample rates"
)
"different sample rates"
)
if
type
(
seg
ment
)
is
not
cls
:
if
type
(
seg
)
is
not
type
(
self
)
:
raise
TypeError
(
"Only audio segments of the same type "
raise
TypeError
(
"Only audio segments of the same type "
"instance can be concatenated."
)
"instance can be concatenated."
)
samples
=
np
.
concatenate
([
seg
.
samples
for
seg
in
segments
])
samples
=
np
.
concatenate
([
seg
.
samples
for
seg
in
segments
])
return
cls
(
samples
,
sample_rate
)
return
type
(
self
)
(
samples
,
sample_rate
)
def
to_wav_file
(
self
,
filepath
,
dtype
=
'float32'
):
def
to_wav_file
(
self
,
filepath
,
dtype
=
'float32'
):
"""Save audio segment to disk as wav file.
"""Save audio segment to disk as wav file.
...
@@ -203,6 +154,65 @@ class AudioSegment(object):
...
@@ -203,6 +154,65 @@ class AudioSegment(object):
format
=
'WAV'
,
format
=
'WAV'
,
subtype
=
subtype_map
[
dtype
])
subtype
=
subtype_map
[
dtype
])
def
slice_from_file
(
self
,
file
,
start
=
None
,
end
=
None
):
"""Loads a small section of an audio without having to load
the entire file into the memory which can be incredibly wasteful.
:param file: Input audio filepath
:type file: basestring
:param start: Start time in seconds. If start is negative, it wraps
around from the end. If not provided, this function
reads from the very beginning.
:type start: float
:param end: End time in seconds. If end is negative, it wraps around
from the end. If not provided, the default behavior is
to read to the end of the file.
:type end: float
:return: The specified slice of input audio in the audio.AudioSegment format.
:rtype: AudioSegment
:raises ValueError: If the position is invalid, or if the time is out of bounds.
"""
sndfile
=
soundfile
.
SoundFile
(
file
)
sample_rate
=
sndfile
.
samplerate
duration
=
float
(
len
(
sndfile
))
/
sample_rate
start
=
0.
if
start
is
None
else
start
end
=
0.
if
end
is
None
else
end
if
start
<
0.0
:
start
+=
duration
if
end
<
0.0
:
end
+=
duration
if
start
<
0.0
:
raise
ValueError
(
"The slice start position (%f s) is out of "
"bounds. Filename: %s"
%
(
start
,
file
))
if
end
<
0.0
:
raise
ValueError
(
"The slice end position (%f s) is out of bounds "
"Filename: %s"
%
(
end
,
file
))
if
start
>
end
:
raise
ValueError
(
"The slice start position (%f s) is later than "
"the slice end position (%f s)."
%
(
start
,
end
))
if
end
>
duration
:
raise
ValueError
(
"The slice end time (%f s) is out of bounds "
"(> %f s) Filename: %s"
%
(
end
,
duration
,
file
))
start_frame
=
int
(
start
*
sample_rate
)
end_frame
=
int
(
end
*
sample_rate
)
sndfile
.
seek
(
start_frame
)
data
=
sndfile
.
read
(
frames
=
end_frame
-
start_frame
,
dtype
=
'float32'
)
return
type
(
self
)(
data
,
sample_rate
)
def
make_silence
(
self
,
duration
,
sample_rate
):
"""Creates a silent audio segment of the given duration and
sample rate.
:param duration: Length of silence in seconds
:type duration: float
:param sample_rate: Sample rate
:type sample_rate: float
:return: Silence of the given duration
:rtype: AudioSegment
"""
samples
=
np
.
zeros
(
int
(
duration
*
sample_rate
))
return
type
(
self
)(
samples
,
sample_rate
)
def
to_bytes
(
self
,
dtype
=
'float32'
):
def
to_bytes
(
self
,
dtype
=
'float32'
):
"""Create a byte string containing the audio content.
"""Create a byte string containing the audio content.
...
@@ -247,52 +257,49 @@ class AudioSegment(object):
...
@@ -247,52 +257,49 @@ class AudioSegment(object):
self
.
_samples
=
np
.
interp
(
new_indices
,
old_indices
,
self
.
_samples
)
self
.
_samples
=
np
.
interp
(
new_indices
,
old_indices
,
self
.
_samples
)
def
normalize
(
self
,
target_db
=-
20
,
max_gain_db
=
300.0
):
def
normalize
(
self
,
target_db
=-
20
,
max_gain_db
=
300.0
):
"""Normalize audio to desired RMS value in decibels.
"""Normalize audio to
be
desired RMS value in decibels.
Note that this is an in-place transformation.
Note that this is an in-place transformation.
:param target_db: Target RMS value in decibels.
This value
:param target_db: Target RMS value in decibels.
This value should
should
be less than 0.0 as 0.0 is full-scale audio.
be less than 0.0 as 0.0 is full-scale audio.
:type target_db: float
, optional
:type target_db: float
:param max_gain_db: Max amount of gain in dB that can be applied
:param max_gain_db: Max amount of gain in dB that can be applied
for
for normalization.
This is to prevent nans when attempting
normalization.
This is to prevent nans when attempting
to normalize a signal consisting of all zeros.
to normalize a signal consisting of all zeros.
:type max_gain_db: float, optional
:type max_gain_db: float
:raises ValueError: If the required gain to normalize the segment to
:raises NormalizationWarning: if the required gain to normalize the
the target_db value exceeds max_gain_db.
segment to the target_db value exceeds max_gain_db.
"""
"""
gain
=
target_db
-
self
.
rms_db
gain
=
target_db
-
self
.
rms_db
if
gain
>
max_gain_db
:
if
gain
>
max_gain_db
:
raise
ValueError
(
raise
ValueError
(
"Unable to normalize segment to {} dB because it has an RMS "
"Unable to normalize segment to %f dB because it has an RMS "
"value of {} dB and the difference exceeds max_gain_db ({} dB)"
"value of %f dB and the difference exceeds max_gain_db (%f dB)"
.
format
(
target_db
,
self
.
rms_db
,
max_gain_db
))
%
(
target_db
,
self
.
rms_db
,
max_gain_db
))
gain
=
min
(
max_gain_db
,
target_db
-
self
.
rms_db
)
self
.
apply_gain
(
min
(
max_gain_db
,
target_db
-
self
.
rms_db
))
self
.
apply_gain
(
gain
)
def
normalize_online_bayesian
(
self
,
def
normalize_online_bayesian
(
self
,
target_db
,
target_db
,
prior_db
,
prior_db
,
prior_samples
,
prior_samples
,
startup_delay
=
0.0
):
startup_delay
=
0.0
):
"""
"""Normalize audio using a production-compatible online/causal algorithm.
Normalize audio using a production-compatible online/causal algorithm.
This uses an exponential likelihood and gamma prior to make online estimates
This uses an exponential likelihood and gamma prior to make
of the RMS even when there are very few samples.
online estimates of the RMS even when there are very few samples.
Note that this is an in-place transformation.
Note that this is an in-place transformation.
:param target_db: Target RMS value in decibels
:param target_db: Target RMS value in decibels
:type target_db:
scalar
:type target_db:
float
:param prior_db: Prior RMS estimate in decibels
:param prior_db: Prior RMS estimate in decibels
:type prior_db:
scalar
:type prior_db:
float
:param prior_samples: Prior strength in number of samples
:param prior_samples: Prior strength in number of samples
:type prior_samples:
scalar
:type prior_samples:
float
:param startup_delay: Default
: 0.0 s. If provided, this
:param startup_delay: Default
0.0 s. If provided, this function will accrue
function will accrue statistics for the first startup_delay
statistics for the first startup_delay seconds before
seconds before
applying online normalization.
applying online normalization.
:type startup_delay:
scalar
:type startup_delay:
float
"""
"""
# Estimate total RMS online
# Estimate total RMS online
startup_sample_idx
=
min
(
self
.
num_samples
-
1
,
startup_sample_idx
=
min
(
self
.
num_samples
-
1
,
...
@@ -309,52 +316,18 @@ class AudioSegment(object):
...
@@ -309,52 +316,18 @@ class AudioSegment(object):
mean_squared_estimate
=
((
cumsum_of_squares
+
prior_sum_of_squares
)
/
mean_squared_estimate
=
((
cumsum_of_squares
+
prior_sum_of_squares
)
/
(
sample_count
+
prior_samples
))
(
sample_count
+
prior_samples
))
rms_estimate_db
=
10
*
np
.
log10
(
mean_squared_estimate
)
rms_estimate_db
=
10
*
np
.
log10
(
mean_squared_estimate
)
# Compute required time-varying gain
# Compute required time-varying gain
gain_db
=
target_db
-
rms_estimate_db
gain_db
=
target_db
-
rms_estimate_db
# Apply gain to new segment
self
.
apply_gain
(
gain_db
)
def
normalize_ewma
(
self
,
target_db
,
decay_rate
,
startup_delay
,
rms_eps
=
1e-6
,
max_gain_db
=
300.0
):
startup_sample_idx
=
min
(
self
.
num_samples
-
1
,
int
(
self
.
sample_rate
*
startup_delay
))
mean_sq
=
self
.
samples
**
2
if
startup_sample_idx
>
0
:
mean_sq
[:
startup_sample_idx
]
=
\
np
.
sum
(
mean_sq
[:
startup_sample_idx
])
/
startup_sample_idx
idx_start
=
max
(
0
,
startup_sample_idx
-
1
)
initial_condition
=
mean_sq
[
idx_start
]
*
decay_rate
mean_sq
[
idx_start
:]
=
lfilter
(
[
1.0
-
decay_rate
],
[
1.0
,
-
decay_rate
],
mean_sq
[
idx_start
:],
axis
=
0
,
zi
=
[
initial_condition
])[
0
]
rms_estimate_db
=
10.0
*
np
.
log10
(
mean_sq
+
rms_eps
)
gain_db
=
target_db
-
rms_estimate_db
if
np
.
any
(
gain_db
>
max_gain_db
):
warnings
.
warn
(
"Unable to normalize segment to {} dB because it has an RMS "
"value of {} dB and the difference exceeds max_gain_db ({} dB)"
.
format
(
target_db
,
self
.
rms_db
,
max_gain_db
),
NormalizationWarning
)
gain_db
=
np
.
minimum
(
gain_db
,
max_gain_db
)
self
.
apply_gain
(
gain_db
)
self
.
apply_gain
(
gain_db
)
def
resample
(
self
,
target_sample_rate
,
quality
=
'sinc_medium'
):
def
resample
(
self
,
target_sample_rate
,
quality
=
'sinc_medium'
):
"""Resample audio and return new AudioSegment.
"""Resample audio segment. This resamples the audio to a new
This resamples the audio to a new sample rate and returns a brand
sample rate.
new AudioSegment. The existing AudioSegment is unchanged.
Note that this is an in-place transformation.
Note that this is an in-place transformation.
:param
new_sample_rate: t
arget sample rate
:param
target_sample_rate: T
arget sample rate
:type
new_sample_rate: scalar
:type
target_sample_rate: int
:param quality: One of {'sinc_fastest', 'sinc_medium', 'sinc_best'}.
:param quality: One of {'sinc_fastest', 'sinc_medium', 'sinc_best'}.
Sets resampling speed/quality tradeoff.
Sets resampling speed/quality tradeoff.
See http://www.mega-nerd.com/SRC/api_misc.html#Converters
See http://www.mega-nerd.com/SRC/api_misc.html#Converters
...
@@ -364,33 +337,33 @@ class AudioSegment(object):
...
@@ -364,33 +337,33 @@ class AudioSegment(object):
new_samples
=
scikits
.
samplerate
.
resample
(
new_samples
=
scikits
.
samplerate
.
resample
(
self
.
_samples
,
r
=
resample_ratio
,
type
=
quality
)
self
.
_samples
,
r
=
resample_ratio
,
type
=
quality
)
self
.
_samples
=
new_samples
self
.
_samples
=
new_samples
self
.
_sample_rate
=
new
_sample_rate
self
.
_sample_rate
=
target
_sample_rate
def
pad_silence
(
self
,
duration
,
sides
=
'both'
):
def
pad_silence
(
self
,
duration
,
sides
=
'both'
):
"""Pads this audio sample with a period of silence.
"""Pads this audio sample with a period of silence.
Note that this is an in-place transformation.
Note that this is an in-place transformation.
:param duration:
l
ength of silence in seconds to pad
:param duration:
L
ength of silence in seconds to pad
:type duration: float
:type duration: float
:param sides:
:param sides:
Position for padding
'beginning' - adds silence in the beginning
'beginning' - adds silence in the beginning
'end' - adds silence in the end
'end' - adds silence in the end
'both' - adds silence in both the beginning and the end.
'both' - adds silence in both the beginning and the end.
:type sides: basestring
:type sides: str
:raises ValueError: If the value of sides is not supported.
"""
"""
if
duration
==
0.0
:
if
duration
==
0.0
:
return
self
return
self
cls
=
type
(
self
)
silence
=
self
.
make_silence
(
duration
,
self
.
_sample_rate
)
silence
=
cls
.
make_silence
(
duration
,
self
.
_sample_rate
)
if
sides
==
"beginning"
:
if
sides
==
"beginning"
:
padded
=
cls
.
concatenate
(
silence
,
self
)
padded
=
self
.
concatenate
(
silence
,
self
)
elif
sides
==
"end"
:
elif
sides
==
"end"
:
padded
=
cls
.
concatenate
(
self
,
silence
)
padded
=
self
.
concatenate
(
self
,
silence
)
elif
sides
==
"both"
:
elif
sides
==
"both"
:
padded
=
cls
.
concatenate
(
silence
,
self
,
silence
)
padded
=
self
.
concatenate
(
silence
,
self
,
silence
)
else
:
else
:
raise
ValueError
(
"Unknown value for the kwarg
'sides'"
)
raise
ValueError
(
"Unknown value for the kwarg
%s"
%
sides
)
self
.
_samples
=
padded
.
_samples
self
.
_samples
=
padded
.
_samples
self
.
_sample_rate
=
padded
.
_sample_rate
self
.
_sample_rate
=
padded
.
_sample_rate
...
@@ -399,87 +372,82 @@ class AudioSegment(object):
...
@@ -399,87 +372,82 @@ class AudioSegment(object):
:param start_sec: Beginning of subsegment in seconds,
:param start_sec: Beginning of subsegment in seconds,
(beginning of segment if None).
(beginning of segment if None).
:type start_sec:
scalar
:type start_sec:
float
:param end_sec: End of subsegment in seconds,
:param end_sec: End of subsegment in seconds,
(end of segment if None).
(end of segment if None).
:type end_sec: scalar
:type end_sec: float
:return: New AudioSegment containing specified subsegment.
:return: New AudioSegment containing specified
:rtype: AudioSegment
subsegment.
:rtype: AudioSegment
"""
"""
# Default boundaries
start_sec
=
0.0
if
start_sec
is
None
else
start_sec
if
start_sec
is
None
:
end_sec
=
self
.
duration
if
end_sec
is
None
else
end_sec
start_sec
=
0.0
if
end_sec
is
None
:
end_sec
=
self
.
duration
# negative boundaries are relative to end of segment
# negative boundaries are relative to end of segment
if
start_sec
<
0.0
:
if
start_sec
<
0.0
:
start_sec
=
self
.
duration
+
start_sec
start_sec
=
self
.
duration
+
start_sec
if
end_sec
<
0.0
:
if
end_sec
<
0.0
:
end_sec
=
self
.
duration
+
end_sec
end_sec
=
self
.
duration
+
end_sec
start_sample
=
int
(
round
(
start_sec
*
self
.
_sample_rate
))
start_sample
=
int
(
round
(
start_sec
*
self
.
_sample_rate
))
end_sample
=
int
(
round
(
end_sec
*
self
.
_sample_rate
))
end_sample
=
int
(
round
(
end_sec
*
self
.
_sample_rate
))
samples
=
self
.
_samples
[
start_sample
:
end_sample
]
samples
=
self
.
_samples
[
start_sample
:
end_sample
]
return
type
(
self
)(
samples
,
sample_rate
=
self
.
_sample_rate
)
return
type
(
self
)(
samples
,
sample_rate
=
self
.
_sample_rate
)
def
random_subsegment
(
self
,
subsegment_length
,
rng
=
None
):
def
random_subsegment
(
self
,
subsegment_length
,
rng
=
None
):
"""
"""Return a random subsegment of a specified length in seconds.
Return a random subsegment of a specified length in seconds.
:param subsegment_length: Subsegment length in seconds.
:param subsegment_length: Subsegment length in seconds.
:type subsegment_length:
scalar
:type subsegment_length:
float
:param rng: Random number generator state
:param rng: Random number generator state
:type rng: random.Random [optional]
:type rng: random.Random
:return: New AudioSegment containing random subsegment
of original segment
:return:clip (SpeechDLSegment): New SpeechDLSegmen containing random
:rtype: AudioSegment
subsegment of original segment.
:raises ValueError: If the length of subsegment is greater than the original
segment.
"""
"""
if
rng
is
None
:
rng
=
random
.
Random
()
if
rng
is
None
else
rng
rng
=
random
.
Random
()
if
subsegment_length
>
self
.
duration
:
if
subsegment_length
>
self
.
duration
:
raise
ValueError
(
"Length of subsegment must not be greater "
raise
ValueError
(
"Length of subsegment must not be greater "
"than original segment."
)
"than original segment."
)
start_time
=
rng
.
uniform
(
0.0
,
self
.
duration
-
subsegment_length
)
start_time
=
rng
.
uniform
(
0.0
,
self
.
duration
-
subsegment_length
)
return
self
.
subsegment
(
start_time
,
start_time
+
subsegment_length
)
return
self
.
subsegment
(
start_time
,
start_time
+
subsegment_length
)
def
convolve
(
self
,
i
r
,
allow_resampling
=
False
):
def
convolve
(
self
,
i
mpulse_segment
,
allow_resample
=
False
):
"""Convolve this audio segment with the given filter.
"""Convolve this audio segment with the given filter.
:param ir: impulse response
Note that this is an in-place transformation.
:type ir: AudioSegment
:param allow_resampling: indicates whether resampling is allowed
when the ir has a different sample rate from this signal.
:type allow_resampling: boolean
"""
if
allow_resampling
and
self
.
sample_rate
!=
ir
.
sample_rate
:
ir
=
ir
.
resample
(
self
.
sample_rate
)
if
self
.
sample_rate
!=
ir
.
sample_rate
:
raise
ValueError
(
"Impulse response sample rate ({}Hz) is "
"equal to base signal sample rate ({}Hz)."
.
format
(
ir
.
sample_rate
,
self
.
sample_rate
))
samples
=
signal
.
fftconvolve
(
self
.
samples
,
ir
.
samples
,
"full"
)
:param impulse_segment: Impulse response segments.
:type impulse_segment: AudioSegment
:param allow_resample: indicates whether resampling is allowed when
the impulse_segment has a different sample
rate from this signal.
:type allow_resample: boolean
:raises ValueError: If the sample rate does not match between the two
audio segments and resample is not allowed.
"""
if
allow_resample
and
self
.
sample_rate
!=
impulse_segment
.
sample_rate
:
impulse_segment
=
impulse_segment
.
resample
(
self
.
sample_rate
)
if
self
.
sample_rate
!=
impulse_segment
.
sample_rate
:
raise
ValueError
(
"Impulse segment's sample rate (%d Hz) is not"
"equal to base signal sample rate (%d Hz)."
%
(
impulse_segment
.
sample_rate
,
self
.
sample_rate
))
samples
=
signal
.
fftconvolve
(
self
.
samples
,
impulse_segment
.
samples
,
"full"
)
self
.
_samples
=
samples
self
.
_samples
=
samples
def
convolve_and_normalize
(
self
,
i
r
,
allow_resample
=
False
):
def
convolve_and_normalize
(
self
,
i
mpulse_segment
,
allow_resample
=
False
):
"""Convolve and normalize the resulting audio segment so that it
"""Convolve and normalize the resulting audio segment so that it
has the same average power as the input signal.
has the same average power as the input signal.
:param i
r: impulse response
:param i
mpulse_segment: Impulse response segments.
:type i
r
: AudioSegment
:type i
mpulse_segment
: AudioSegment
:param allow_resampl
ing: indicates whether resampling is allowed
:param allow_resampl
e: indicates whether resampling is allowed when
when the ir
has a different sample rate from this signal.
the impulse_segment
has a different sample rate from this signal.
:type allow_resampl
ing
: boolean
:type allow_resampl
e
: boolean
"""
"""
self
.
convolve
(
ir
,
allow_resampling
=
allow_resampling
)
target_db
=
self
.
rms_db
self
.
normalize
(
target_db
=
self
.
rms_db
)
self
.
convolve
(
impulse_segment
,
allow_resample
=
allow_resample
)
self
.
normalize
(
target_db
)
def
add_noise
(
self
,
def
add_noise
(
self
,
noise
,
noise
,
...
@@ -492,36 +460,33 @@ class AudioSegment(object):
...
@@ -492,36 +460,33 @@ class AudioSegment(object):
of matching length is sampled from it and used instead.
of matching length is sampled from it and used instead.
:param noise: Noise signal to add.
:param noise: Noise signal to add.
:type noise:
SpeechDL
Segment
:type noise:
Audio
Segment
:param snr_dB: Signal-to-Noise Ratio, in decibels.
:param snr_dB: Signal-to-Noise Ratio, in decibels.
:type snr_dB:
scalar
:type snr_dB:
float
:param allow_downsampling: whether to allow the noise signal
:param allow_downsampling: whether to allow the noise signal
to be downsampled
to be downsampled
to match the base signal sample rate.
to match the base signal sample rate.
:type allow_downsampling: boolean
:type allow_downsampling: boolean
:param max_gain_db: Maximum amount of gain to apply to noise
:param max_gain_db: Maximum amount of gain to apply to noise
signal before
signal before adding it in. This is to prevent attempting
adding it in. This is to prevent attempting to apply infinite
to apply infinite
gain to a zero signal.
gain to a zero signal.
:type max_gain_db:
scalar
:type max_gain_db:
float
:param rng: Random number generator state.
:param rng: Random number generator state.
:type rng: random.Random
:type rng: random.Random
:raises ValueError: If the sample rate does not match between the two audio segments
Returns:
and resample is not allowed, or if the duration of noise segments
SpeechDLSegment: signal with noise added
.
is shorter than original audio segments
.
"""
"""
if
rng
is
None
:
rng
=
random
.
Random
()
if
rng
is
None
else
rng
rng
=
random
.
Random
()
if
allow_downsampling
and
noise
.
sample_rate
>
self
.
sample_rate
:
if
allow_downsampling
and
noise
.
sample_rate
>
self
.
sample_rate
:
noise
=
noise
.
resample
(
self
.
sample_rate
)
noise
=
noise
.
resample
(
self
.
sample_rate
)
if
noise
.
sample_rate
!=
self
.
sample_rate
:
if
noise
.
sample_rate
!=
self
.
sample_rate
:
raise
ValueError
(
"Noise sample rate (
{}
Hz) is not equal to "
raise
ValueError
(
"Noise sample rate (
%d
Hz) is not equal to "
"base signal sample rate (
{}Hz)."
"base signal sample rate (
%d Hz)."
%
.
format
(
noise
.
sample_rate
,
self
.
sample_rate
))
(
noise
.
sample_rate
,
self
.
sample_rate
))
if
noise
.
duration
<
self
.
duration
:
if
noise
.
duration
<
self
.
duration
:
raise
ValueError
(
"Noise signal (
{}
sec) must be at "
raise
ValueError
(
"Noise signal (
%f
sec) must be at "
"least as long as base signal (
{} sec)."
"least as long as base signal (
%f sec)."
%
.
format
(
noise
.
duration
,
self
.
duration
))
(
noise
.
duration
,
self
.
duration
))
noise_gain_db
=
self
.
rms_db
-
noise
.
rms_db
-
snr_dB
noise_gain_db
=
self
.
rms_db
-
noise
.
rms_db
-
snr_dB
noise_gain_db
=
min
(
max_gain_db
,
noise_gain_db
)
noise_gain_db
=
min
(
max_gain_db
,
noise_gain_db
)
noise_subsegment
=
noise
.
random_subsegment
(
self
.
duration
,
rng
=
rng
)
noise_subsegment
=
noise
.
random_subsegment
(
self
.
duration
,
rng
=
rng
)
...
@@ -529,6 +494,12 @@ class AudioSegment(object):
...
@@ -529,6 +494,12 @@ class AudioSegment(object):
self
.
_samples
=
output
.
_samples
self
.
_samples
=
output
.
_samples
self
.
_sample_rate
=
output
.
_sample_rate
self
.
_sample_rate
=
output
.
_sample_rate
def
tranform_noise
(
self
,
noise_subsegment
,
noise_gain_db
):
""" transform noise file
"""
return
type
(
self
)(
noise_subsegment
.
_samples
*
(
10.
**
(
noise_gain_db
/
20.
)),
noise_subsegment
.
_sample_rate
)
@
property
@
property
def
samples
(
self
):
def
samples
(
self
):
"""Return audio samples.
"""Return audio samples.
...
@@ -618,9 +589,3 @@ class AudioSegment(object):
...
@@ -618,9 +589,3 @@ class AudioSegment(object):
else
:
else
:
raise
TypeError
(
"Unsupported sample type: %s."
%
samples
.
dtype
)
raise
TypeError
(
"Unsupported sample type: %s."
%
samples
.
dtype
)
return
output_samples
.
astype
(
dtype
)
return
output_samples
.
astype
(
dtype
)
def tranform_noise(self, noise_subsegment, noise_gain_db):
    """Return a gain-adjusted copy of a noise segment.

    The samples of ``noise_subsegment`` are multiplied by the linear
    amplitude factor ``10 ** (noise_gain_db / 20)`` and wrapped in a new
    instance of ``type(self)`` together with the sample rate of
    ``noise_subsegment``.

    :param noise_subsegment: Object exposing ``_samples`` and
                             ``_sample_rate``.
    :param noise_gain_db: Gain to apply, in decibels.
    :return: New instance of ``type(self)`` holding the scaled samples.
    """
    return type(self)(noise_subsegment._samples * (10.**(noise_gain_db / 20.)),
                      noise_subsegment._sample_rate)
deep_speech_2/data_utils/augmentor/audio_database.py
已删除
100755 → 0
浏览文件 @
a84bdf64
from
__future__
import
print_function
from
collections
import
defaultdict
import
bisect
import
logging
import
numpy
as
np
import
os
import
random
import
sys
UNK_TAG
=
"<UNK>"
def stream_audio_index(fname, UNK=UNK_TAG):
    """Read an audio index file and yield one record per line.

    Each line holds tab-separated fields: id, duration, size, relative
    path, followed by zero or more tags.

    :param fname: Audio index path.
    :type fname: basestring
    :param UNK: Tag assigned to records that carry no tags of their own.
    :type UNK: basestring
    :return: Generator of ``(idx, duration, size, relpath, tags)`` tuples
             typed ``(int, float, int, str, list(str))``.
    :raises AssertionError: If a line has fewer than four fields, a
                            negative duration or a non-positive size.
    """
    with open(fname) as audio_index_file:
        for i, line in enumerate(audio_index_file):
            line_no = i + 1
            tok = line.strip().split("\t")
            # Report the path, not the file object, so messages are readable.
            assert len(tok) >= 4, \
                "Invalid line at line {} in file {}".format(line_no, fname)
            idx = int(tok[0])
            duration = float(tok[1])
            # Sometimes, the duration can round down to 0.0
            assert duration >= 0.0, \
                "Invalid duration at line {} in file {}".format(line_no,
                                                                 fname)
            size = int(tok[2])
            assert size > 0, \
                "Invalid size at line {} in file {}".format(line_no, fname)
            relpath = tok[3]
            # Honour the caller-supplied UNK token rather than the module
            # constant, which the original used unconditionally.
            tags = tok[4:] if len(tok) > 4 else [UNK]
            yield idx, duration, size, relpath, tags
def truncate_float(val, ndigits=6):
    """Truncate a float to a fixed number of digits after the decimal point.

    The value is cut toward zero, not rounded.

    :param val: Input value.
    :type val: float
    :param ndigits: Desired number of digits after the decimal point.
    :type ndigits: int
    :return: Truncated value.
    :rtype: float
    """
    scale = 10.0 ** ndigits
    # int() drops the fractional part, i.e. truncates toward zero.
    whole_units = int(val * scale)
    return float(whole_units) / scale
def print_audio_index(idx, duration, size, relpath, tags, file=sys.stdout):
    """Write one audio record as a tab-separated line.

    The duration is truncated to six decimal places before formatting; each
    tag is appended as an additional tab-separated field and the line is
    terminated with a newline.

    :param idx: Audio file id.
    :type idx: int
    :param duration: Length of the audio in seconds.
    :type duration: float
    :param size: Size of the file in bytes.
    :type size: int
    :param relpath: Relative path w.r.t. the root noise directory.
    :type relpath: basestring
    :param tags: List of tags.
    :type tags: list(str)
    :param file: Writable stream receiving the record.
    """
    emit = file.write
    clipped_duration = truncate_float(duration, ndigits=6)
    emit("{}\t{:.6f}\t{}\t{}".format(idx, clipped_duration, size, relpath))
    for label in tags:
        emit("\t{}".format(label))
    emit("\n")
class AudioIndex(object):
    """In-memory index of audio files that do not have annotations.

    This supports duration-based sampling and sampling from a target
    distribution of tags.

    Each line in the index file consists of the following fields:
        (id (int), duration (float), size (int), relative path (str),
        list of tags ([str]))
    """

    def __init__(self):
        self.audio_dir = None
        # Name actually read and written by _load_all_records_from_disk.
        self.idx_fname = None
        # Historic spelling, kept in sync for backward compatibility.
        self.index_fname = None
        self.tags = None
        self.bin_size = 2.0
        self.clear()

    def clear(self):
        """Clear all records and derived lookup tables.

        :return: None
        """
        self.idx_to_record = {}
        # Maps a duration lower bound to the ids of audio files whose
        # duration is greater than or equal to that bound.
        self.duration_to_id_set = {}
        self.duration_to_id_set_per_tag = defaultdict(dict)
        self.duration_to_list = defaultdict(list)
        self.duration_to_list_per_tag = defaultdict(
            lambda: defaultdict(list))
        self.tag_to_id_set = defaultdict(set)
        self.shared_duration_bins = []
        self.id_set_complete = set()
        self.id_set = set()
        self.duration_bins = []
        # Defined up front so sample_audio() works before any load.
        self.min_duration = 0.0

    def has_audio(self, distr=None):
        """Check whether there is audio to sample from.

        :param distr: Target distribution of audio tags to match. If not
                      supplied, only checks that some audio is indexed.
        :type distr: dict
        :return: True if there are audio files (for every tag in ``distr``
                 when it is given).
        :rtype: bool
        """
        if distr is None:
            return len(self.id_set) > 0
        return all(tag in self.duration_to_list_per_tag for tag in distr)

    def _load_all_records_from_disk(self, audio_dir, idx_fname, bin_size):
        """Load all audio records and group them into duration bins.

        Updates:
            self.audio_dir, self.idx_fname, self.bin_size: last loaded
                configuration.
            self.idx_to_record (dict): audio id ->
                (duration, file_size, relative_path, tags)
            self.tag_to_id_set (dict): tag -> set of audio ids.
            self.id_set_complete (set): all audio ids in the index file.
            self.min_duration (float): minimum duration observed.
            self.duration_bins (list): ascending duration lower bounds.
            self.duration_to_id_set (dict): lower bound k -> ids of audios
                whose duration is >= k.
            self.duration_to_id_set_per_tag (dict): lower bound -> tag ->
                id set, same meaning as above restricted to one tag.
            self.shared_duration_bins (list): list of sets of lower bounds
                whose id sets are identical, so a few very long files do
                not force many distinct sets.

        :param audio_dir: Audio root directory.
        :param idx_fname: Audio index file path.
        :param bin_size: Width of a duration bin, in seconds.
        :return: True if the records were (re)read from disk, False if the
                 configuration is unchanged and nothing was done.
        :raises ValueError: If ``bin_size`` is not positive.
        :raises IOError: If the index file contains no records.
        """
        if (self.audio_dir == audio_dir and self.idx_fname == idx_fname and
                self.bin_size == bin_size):
            # Same directory, index file and granularity: keep the records.
            return False
        if bin_size <= 0:
            # A non-positive width would make the bin loop below endless.
            raise ValueError("bin_size must be positive, got {}".format(
                bin_size))

        # Remember where the audio index is most recently read from.
        self.audio_dir = audio_dir
        self.idx_fname = idx_fname
        self.index_fname = idx_fname
        self.bin_size = bin_size

        self.clear()
        # clear() dropped the sampling lists, so the next
        # _build_index_from_records() must rebuild even for the same tags.
        self.tags = None

        rank = []
        min_duration = float('inf')
        max_duration = float('-inf')
        for idx, duration, file_size, relpath, tags in \
                stream_audio_index(idx_fname):
            self.idx_to_record[idx] = (duration, file_size, relpath, tags)
            max_duration = max(max_duration, duration)
            min_duration = min(min_duration, duration)
            rank.append((duration, idx))
            for tag in tags:
                self.tag_to_id_set[tag].add(idx)
        if len(rank) == 0:
            # file is empty
            raise IOError("Index file {} is empty".format(idx_fname))
        for tag in self.tag_to_id_set:
            self.id_set_complete |= self.tag_to_id_set[tag]

        dur = min_duration
        self.min_duration = min_duration
        while dur < max_duration + bin_size:
            self.duration_bins.append(dur)
            dur += bin_size

        # Sort in decreasing order of duration and populate the cumulative
        # id sets from the longest bin down to the shortest.
        rank.sort(reverse=True)

        # Positions in `rank`: `cur` is the next unprocessed record, `last`
        # is where `cur` stood when the id set last changed.
        last = 0
        cur = 0
        # Cumulative id set of the previous bin. When a bin adds no new
        # audio the same set object is reused to conserve memory.
        last_audio_set = set()
        last_audio_set_per_tag = defaultdict(set)
        # Lower bounds currently sharing one id set.
        shared = set()
        for i in range(len(self.duration_bins) - 1, -1, -1):
            lower_bound = self.duration_bins[i]
            new_audio_idxs = set()
            new_audio_idxs_per_tag = defaultdict(set)
            while cur < len(rank) and rank[cur][0] >= lower_bound:
                idx = rank[cur][1]
                new_audio_idxs.add(idx)
                for tag in self.idx_to_record[idx][3]:
                    new_audio_idxs_per_tag[tag].add(idx)
                cur += 1
            if cur == last:
                # Nothing new in this bin: it shares the previous id set.
                shared.add(lower_bound)
            else:
                last_audio_set = last_audio_set | new_audio_idxs
                for tag in new_audio_idxs_per_tag:
                    last_audio_set_per_tag[tag] = \
                        last_audio_set_per_tag[tag] | \
                        new_audio_idxs_per_tag[tag]
                if len(shared) > 0:
                    self.shared_duration_bins.append(shared)
                shared = set([lower_bound])
                last = cur
            self.duration_to_id_set[lower_bound] = last_audio_set
            for tag in last_audio_set_per_tag:
                self.duration_to_id_set_per_tag[lower_bound][tag] = \
                    last_audio_set_per_tag[tag]
        # The last `shared` group is not appended inside the loop.
        self.shared_duration_bins.append(shared)
        # Every record must have been assigned to a bin.
        assert cur == len(rank)
        return True

    def _build_index_from_records(self, tag_list):
        """Build the sampling index restricted to the given tag list.

        Updates:
            self.id_set (set): all audio ids that can be sampled.
            self.duration_to_list (dict): duration lower bound -> list of
                ids of audios at least that long.
            self.duration_to_list_per_tag (dict): tag -> duration lower
                bound -> list of ids; supports sampling from a target tag
                distribution.

        :param tag_list: Tags to sample from; empty means all audio.
        :type tag_list: list(str)
        :return: True if the index was rebuilt, False if ``tag_list``
                 equals the tags of the current index.
        """
        if self.tags == tag_list:
            return False
        self.tags = tag_list

        # Start from empty tables so entries of a previous tag selection
        # cannot leak into this one.
        self.duration_to_list = defaultdict(list)
        self.duration_to_list_per_tag = defaultdict(
            lambda: defaultdict(list))

        if len(tag_list) == 0:
            self.id_set = self.id_set_complete
        else:
            self.id_set = set()
            for tag in tag_list:
                self.id_set |= self.tag_to_id_set[tag]

        for shared in self.shared_duration_bins:
            # All bins in `shared` have the same id sets, so intersect once
            # and point every bin at the same lists.
            lb = next(iter(shared))
            intersected = list(self.id_set & self.duration_to_id_set[lb])
            ids_per_tag = self.duration_to_id_set_per_tag[lb]
            # Lists (not sets) because random.Random.sample needs a
            # sequence.
            intersected_per_tag = {
                tag: list(self.tag_to_id_set[tag] & ids)
                for tag, ids in ids_per_tag.items()
            }
            for bin_key in shared:
                self.duration_to_list[bin_key] = intersected
                for tag in intersected_per_tag:
                    self.duration_to_list_per_tag[tag][bin_key] = \
                        intersected_per_tag[tag]
        assert len(self.duration_to_list) == len(self.duration_to_id_set)
        return True

    def refresh_records_from_index_file(self,
                                        audio_dir,
                                        idx_fname,
                                        tag_list,
                                        bin_size=2.0):
        """Load the index file and (re)build the sampling index if needed.

        If the audio directory, index file name or bin size changed, the
        records are reloaded from disk. If only ``tag_list`` changed, the
        index is rebuilt from the in-memory records.

        :param audio_dir: Audio directory.
        :type audio_dir: basestring
        :param idx_fname: Audio index file name.
        :type idx_fname: basestring
        :param tag_list: Tags to load; empty or None loads all.
        :type tag_list: list
        :param bin_size: Granularity of duration bins.
        :type bin_size: float
        """
        if tag_list is None:
            tag_list = []
        reloaded_records = self._load_all_records_from_disk(
            audio_dir, idx_fname, bin_size)
        if reloaded_records or self.tags != tag_list:
            self._build_index_from_records(tag_list)
            # This module defines no `logger`, so fetch one here.
            logging.getLogger(__name__).info(
                'loaded {} audio files from {}'.format(
                    len(self.id_set), idx_fname))

    def sample_audio(self, duration, rng=None, distr=None):
        """Uniformly draw an audio record of at least the desired duration.

        :param duration: Minimum desired audio duration; a negative value
                         selects from all indexed audio.
        :type duration: float
        :param rng: Random number generator; a fresh one is created when
                    omitted.
        :type rng: random.Random
        :param distr: Target distribution mapping tag -> weight. If not
                      provided, all eligible files are sampled uniformly.
        :type distr: dict
        :return: ``(success, record)`` where record is
                 ``(duration, file_size, path)`` on success and None
                 otherwise.
        """
        if duration < 0.0:
            duration = self.min_duration
        i = bisect.bisect_left(self.duration_bins, duration)
        if i == len(self.duration_bins):
            return False, None
        bin_key = self.duration_bins[i]
        if rng is None:
            rng = random.Random()

        if distr is None:
            # .get() avoids inserting empty entries into the defaultdict.
            indices = self.duration_to_list.get(bin_key, [])
        else:
            # Draw a tag according to the requested distribution.
            nprng = np.random.RandomState(rng.getrandbits(32))
            tags = list(distr.keys())
            # Float array: avoids integer floor division and works with
            # dict views.
            prob_masses = np.asarray([distr[tag] for tag in tags],
                                     dtype=float)
            prob_masses /= np.sum(prob_masses)
            tag = tags[int(nprng.choice(len(tags), p=prob_masses))]
            indices = self.duration_to_list_per_tag.get(tag, {}).get(
                bin_key, [])

        if len(indices) == 0:
            return False, None
        # duration, file size and relative path from root
        s = self.idx_to_record[rng.sample(indices, 1)[0]]
        s = (s[0], s[1], os.path.join(self.audio_dir, s[2]))
        return True, s
deep_speech_2/data_utils/augmentor/augmentation.py
浏览文件 @
42ba74ef
...
@@ -6,11 +6,6 @@ from __future__ import print_function
...
@@ -6,11 +6,6 @@ from __future__ import print_function
import
json
import
json
import
random
import
random
from
data_utils.augmentor.volume_perturb
import
VolumePerturbAugmentor
from
data_utils.augmentor.volume_perturb
import
VolumePerturbAugmentor
from
data_utils.augmentor.resamler
import
ResamplerAugmentor
from
data_utils.augmentor.speed_perturb
import
SpeedPerturbatioAugmentor
from
data_utils.augmentor.online_bayesian_normalization
import
OnlineBayesianNormalizationAugmentor
from
data_utils.augmentor.Impulse_response
import
ImpulseResponseAugmentor
from
data_utils.augmentor.noise_speech
import
NoiseSpeechAugmentor
class
AugmentationPipeline
(
object
):
class
AugmentationPipeline
(
object
):
...
@@ -81,15 +76,5 @@ class AugmentationPipeline(object):
...
@@ -81,15 +76,5 @@ class AugmentationPipeline(object):
"""Return an augmentation model by the type name, and pass in params."""
"""Return an augmentation model by the type name, and pass in params."""
if
augmentor_type
==
"volume"
:
if
augmentor_type
==
"volume"
:
return
VolumePerturbAugmentor
(
self
.
_rng
,
**
params
)
return
VolumePerturbAugmentor
(
self
.
_rng
,
**
params
)
if
augmentor_type
==
"resamle"
:
return
ResamplerAugmentor
(
self
.
_rng
,
**
params
)
if
augmentor_type
==
"speed"
:
return
SpeedPerturbatioAugmentor
(
self
.
_rng
,
**
params
)
if
augmentor_type
==
"online_bayesian_normalization"
:
return
OnlineBayesianNormalizationAugmentor
(
self
.
_rng
,
**
params
)
if
augmentor_type
==
"Impulse_response"
:
return
ImpulseResponseAugmentor
(
self
.
_rng
,
**
params
)
if
augmentor_type
==
"noise_speech"
:
return
NoiseSpeechAugmentor
(
self
.
_rng
,
**
params
)
else
:
else
:
raise
ValueError
(
"Unknown augmentor type [%s]."
%
augmentor_type
)
raise
ValueError
(
"Unknown augmentor type [%s]."
%
augmentor_type
)
deep_speech_2/data_utils/augmentor/implus_response.py
已删除
100755 → 0
浏览文件 @
a84bdf64
""" Impulse response"""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
from
.
import
base
from
.
import
audio_database
from
data_utils.speech
import
SpeechSegment
class ImpulseResponseAugmentor(base.AugmentorBase):
    """Augmentor that convolves audio with a sampled impulse response.

    :param rng: Random generator handed to the audio index when sampling.
    :param ir_dir: Directory containing impulse responses.
    :type ir_dir: basestring
    :param index_file: Index file describing the impulse responses.
    :type index_file: basestring
    :param tags: Optional parameter for specifying what particular impulse
                 responses to apply.
    :type tags: list
    :param tag_distr: Optional tag distribution used when sampling.
    :type tag_distr: dict
    """

    def __init__(self, rng, ir_dir, index_file, tags=None, tag_distr=None):
        self.rng = rng
        self.ir_dir = ir_dir
        self.index_file = index_file
        self.tags = tags
        self.tag_distr = tag_distr
        self.audio_index = audio_database.AudioIndex()

    def _init_data(self):
        """Refresh the audio index from the configured directory and file.

        Delegates to ``AudioIndex.refresh_records_from_index_file`` with the
        stored directory, index file and tags.
        """
        self.audio_index.refresh_records_from_index_file(
            self.ir_dir, self.index_file, self.tags)

    def transform_audio(self, audio_segment):
        """Convolve the input audio with a sampled impulse response.

        :param audio_segment: Input audio; its ``convolve`` method is called
                              with the loaded impulse response.
        :raises RuntimeError: If the index has no audio matching the
                              configured tags or distribution.
        """
        # This handles the cases where the data source or directories change.
        self._init_data()
        distribution = self.tag_distr

        if not self.audio_index.has_audio(distribution):
            if distribution is not None:
                raise RuntimeError("The ir index does not have audio "
                                   "files to match the target ir "
                                   "distribution.")
            if self.tags:
                raise RuntimeError("The ir index does not have audio "
                                   "files of the given tags to sample "
                                   "from.")
            raise RuntimeError("The ir index does not have audio "
                               "files to sample from.")

        # Querying with a negative duration triggers the index to search
        # from all impulse responses.
        success, record = self.audio_index.sample_audio(
            -1.0, rng=self.rng, distr=distribution)
        if success is True:
            _, _, ir_path = record
            impulse = SpeechSegment.from_file(ir_path)
            audio_segment.convolve(impulse, allow_resampling=True)
deep_speech_2/data_utils/augmentor/noise_speech.py
已删除
100755 → 0
浏览文件 @
a84bdf64
""" noise speech
"""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
logging
import
numpy
as
np
import
os
from
collections
import
defaultdict
from
.
import
base
from
.
import
audio_database
from
data_utils.speech
import
SpeechSegment
TURK
=
"turk"
USE_AUDIO_DATABASE_SOURCES
=
frozenset
([
"freesound"
,
"chime"
])
HALF_NOISE_LENGTH_MIN_THRESHOLD
=
3.0
FIND_NOISE_MAX_ATTEMPTS
=
20
logger
=
logging
.
getLogger
(
__name__
)
def get_first_smaller(items, value):
    """Return the rightmost element of sorted ``items`` that is < ``value``.

    :param items: Sequence sorted in ascending order.
    :param value: Upper bound (exclusive).
    :return: The largest element strictly smaller than ``value``.
    :raises AssertionError: If no element is strictly smaller than
                            ``value``.
    """
    # Local import: this module never imports bisect at the top level.
    import bisect

    index = bisect.bisect_left(items, value) - 1
    # index == -1 means nothing is smaller; without this check items[-1]
    # would silently wrap around to the last element.
    assert index >= 0, \
        'get_first_smaller failed! no item smaller than %s' % value
    assert items[index] < value, \
        'get_first_smaller failed! %d %d' % (items[index], value)
    return items[index]
def get_first_larger(items, value):
    """Return the leftmost element of sorted ``items`` that is > ``value``.

    :param items: Sequence sorted in ascending order.
    :param value: Lower bound (exclusive).
    :return: The smallest element strictly greater than ``value``.
    :raises AssertionError: If no element is strictly greater than
                            ``value``.
    """
    # Local import: this module never imports bisect at the top level.
    import bisect

    index = bisect.bisect_right(items, value)
    assert index < len(items), \
        "no noise bin exists for this audio length (%f)" % value
    assert items[index] > value, \
        'get_first_larger failed! %d %d' % (items[index], value)
    return items[index]
def _get_turk_noise_files(noise_dir, index_file):
    """Create a map from duration bin to a list of noise filenames.

    Each non-blank line of ``index_file`` is read as a whitespace-separated
    record whose first field is a file name relative to ``noise_dir`` and
    whose second field is a duration in milliseconds. Files are grouped
    under the largest even second count (0, 2, ..., 64) strictly below
    their duration.

    :param noise_dir: Directory that the listed file names are relative to.
    :type noise_dir: basestring
    :param index_file: Noise list path.
    :type index_file: basestring
    :return: Map of bin lower bound (seconds) to the noise files in that
             bin; empty if ``index_file`` does not exist.
    :rtype: defaultdict
    """
    noise_files = defaultdict(list)
    if not os.path.exists(index_file):
        logger.error('No noise files were found at {}'.format(index_file))
        return noise_files
    num_noise_files = 0
    rounded_durations = list(range(0, 65, 2))
    with open(index_file, 'r') as fl:
        for line in fl:
            # Split once and reuse both fields.
            fields = line.split()
            if not fields:
                # A blank line (e.g. trailing newline) is not a record.
                continue
            fname = os.path.join(noise_dir, fields[0])
            # Index stores milliseconds; bins are expressed in seconds.
            duration = float(fields[1]) / 1000
            # bin the noise files into length bins rounded by 2 sec
            bin_id = get_first_smaller(rounded_durations, duration)
            noise_files[bin_id].append(fname)
            num_noise_files += 1
    logger.info('Loaded {} turk noise files'.format(num_noise_files))
    return noise_files
class NoiseSpeechAugmentor(base.AugmentorBase):
    """Noise addition block.

    :param rng: Random generator used for all sampling.
    :param snr_min: Minimum signal-to-noise ratio.
    :type snr_min: float
    :param snr_max: Maximum signal-to-noise ratio.
    :type snr_max: float
    :param noise_dir: Root of where noise files are stored.
    :type noise_dir: basestring
    :param source: One of "turk", "freesound", "chime" or the empty
                   string; anything other than "turk" is served from the
                   audio database index.
    :type source: string
    :param allow_downsampling: Forwarded to ``add_noise`` of the audio
                               segment.
    :param index_file: Index of noises of interest in ``noise_dir``; a
                       source-dependent default is used when omitted.
    :type index_file: basestring
    :param tags: Optional parameter for specifying what particular noises
                 to add.
    :type tags: list
    :param tag_distr: Optional noise distribution.
    :type tag_distr: dict
    """

    def __init__(self,
                 rng,
                 snr_min,
                 snr_max,
                 noise_dir,
                 source,
                 allow_downsampling=None,
                 index_file=None,
                 tags=None,
                 tag_distr=None):
        self.rng = rng
        self.snr_min = snr_min
        self.snr_max = snr_max
        self.noise_dir = noise_dir
        self.source = source
        self.allow_downsampling = allow_downsampling
        self.index_file = index_file
        self.tags = tags
        self.tag_distr = tag_distr

        # When new noise sources are added, make sure to define the
        # associated bookkeeping variables here.
        # Mapping of duration bin -> file names, same shape as the value
        # returned by _get_turk_noise_files.
        self.turk_noise_files = defaultdict(list)
        self.turk_noise_dir = None
        self.audio_index = audio_database.AudioIndex()

    def _init_data(self):
        """Load the file lists needed for sampling noise.

        Turk file lists are reloaded only when the noise directory changes;
        for other sources the audio index is refreshed on every call.

        :raises AssertionError: If ``source`` is non-empty, not "turk" and
                                not supported by the audio database.
        """
        noise_dir = self.noise_dir
        index_file = self.index_file
        source = self.source
        if not index_file:
            if source == TURK:
                index_file = os.path.join(noise_dir, 'noise-samples-list')
            else:
                if source != "":
                    assert source in USE_AUDIO_DATABASE_SOURCES, \
                        "{} not supported by audio_database".format(source)
                index_file = os.path.join(noise_dir,
                                          "audio_index_commercial.txt")
            logger.debug("index_file not provided; defaulting to %s",
                         index_file)

        if source == TURK:
            if self.turk_noise_dir != noise_dir:
                self.turk_noise_dir = noise_dir
                self.turk_noise_files = _get_turk_noise_files(noise_dir,
                                                              index_file)
        # elif source == TODO_SUPPORT_NON_AUDIO_DATABASE_BASED_SOURCES:
        else:
            if source != "":
                assert source in USE_AUDIO_DATABASE_SOURCES, \
                    "{} not supported by audio_database".format(source)
            self.audio_index.refresh_records_from_index_file(
                self.noise_dir, index_file, self.tags)

    def transform_audio(self, audio_segment):
        """Add noise to the input audio.

        :param audio_segment: Input audio.
        :type audio_segment: SpeechSegment
        """
        # This handles the cases where the data source or directories
        # change. The original referenced the method without calling it.
        self._init_data()
        allow_downsampling = self.allow_downsampling
        if self.source == TURK:
            self._add_turk_noise(audio_segment, allow_downsampling)
        # elif source == TODO_SUPPORT_NON_AUDIO_DATABASE_BASED_SOURCES:
        else:
            self._add_noise(audio_segment, allow_downsampling)

    def _sample_snr(self):
        """Return a float drawn from ``self.rng.uniform(snr_min, snr_max)``.
        """
        return self.rng.uniform(self.snr_min, self.snr_max)

    def _add_turk_noise(self, audio_segment, allow_downsampling):
        """Add a turk noise to the input audio.

        Does nothing when no turk noise files are loaded.

        :param audio_segment: Input audio.
        :type audio_segment: SpeechSegment
        :param allow_downsampling: Indicates whether downsampling is
                                   allowed.
        :type allow_downsampling: boolean
        :raises AssertionError: If no noise bin is longer than the audio.
        """
        if len(self.turk_noise_files) == 0:
            return
        snr = self._sample_snr()
        # Draw the noise file randomly from noise files that are
        # slightly longer than the utterance
        noise_bins = sorted(self.turk_noise_files.keys())
        # note some bins can be empty, so we can't just round up
        # to the nearest 2-sec interval
        rounded_duration = get_first_larger(noise_bins,
                                            audio_segment.duration)
        noise_fname = self.rng.sample(
            self.turk_noise_files[rounded_duration], 1)[0]
        # NOTE(review): switched from `from_wav_file` to `from_file`, the
        # loader used by every other call site here - confirm against
        # SpeechSegment.
        noise = SpeechSegment.from_file(noise_fname)
        logger.debug('noise_fname {}'.format(noise_fname))
        logger.debug('snr {}'.format(snr))
        # May raise; errors propagate to the caller.
        audio_segment.add_noise(
            noise, snr, rng=self.rng, allow_downsampling=allow_downsampling)

    def _add_noise(self, audio_segment, allow_downsampling):
        """Add a noise indexed in audio_database.AudioIndex, in place.

        :param audio_segment: Input audio.
        :type audio_segment: SpeechSegment
        :param allow_downsampling: Indicates whether downsampling is
                                   allowed.
        :type allow_downsampling: boolean
        :return: None. The noise is mixed into ``audio_segment``; nothing is
                 done if no noise file could be found.
        :raises RuntimeError: If the index has no audio matching the
                              configured tags or distribution.
        """
        tag_distr = self.tag_distr
        if not self.audio_index.has_audio(tag_distr):
            if tag_distr is not None:
                raise RuntimeError("The noise index does not have audio "
                                   "files to match the target noise "
                                   "distribution.")
            if self.tags:
                raise RuntimeError("The noise index does not have audio "
                                   "files of the given tags to sample "
                                   "from.")
            raise RuntimeError("The noise index does not have audio "
                               "files to sample from.")

        # Compute audio segment related statistics
        audio_duration = audio_segment.duration
        sample_rate = audio_segment.sample_rate

        # Sample relevant augmentation parameters.
        snr = self._sample_snr()

        # Perhaps, we may not have a sufficiently long noise, so we need
        # to search iteratively.
        min_duration = audio_duration + 0.25
        success = False
        for _ in range(FIND_NOISE_MAX_ATTEMPTS):
            logger.debug("attempting to find noise of length "
                         "at least {}".format(min_duration))
            success, record = self.audio_index.sample_audio(
                min_duration, rng=self.rng, distr=tag_distr)
            if success is True:
                noise_duration, _, noise_fname = record
                # Assert after logging so we know
                # what caused augmentation to fail.
                logger.debug("noise_fname {}".format(noise_fname))
                logger.debug("snr {}".format(snr))
                assert noise_duration >= min_duration
                break
            # Decrease the desired minimum duration linearly.
            # If the value becomes smaller than some threshold,
            # we half the value instead.
            if min_duration > HALF_NOISE_LENGTH_MIN_THRESHOLD:
                min_duration -= 2.0
            else:
                min_duration *= 0.5

        if success is False:
            logger.info("Failed to find a noise file")
            return

        diff_duration = audio_duration + 0.25 - noise_duration
        if diff_duration >= 0.0:
            # Here, the noise is shorter than the audio file, so
            # we pad with zeros to make sure the noise sound is applied
            # with a uniformly random shift.
            noise = SpeechSegment.from_file(noise_fname)
            padded = noise.pad_silence(diff_duration, sides="both")
            # NOTE(review): if pad_silence pads in place and returns None,
            # keep the original object - confirm against SpeechSegment.
            if padded is not None:
                noise = padded
        else:
            # The noise clip is more than 0.25 sec longer than the audio
            # segment here.
            max_start_sample = int(noise_duration * sample_rate) - \
                int(audio_duration * sample_rate) - \
                int(0.02 * sample_rate)
            # The original divided by the undefined name `audio`.
            start = float(self.rng.randint(0, max_start_sample)) / \
                sample_rate
            finish = min(start + audio_duration + 0.2, noise_duration)
            noise = SpeechSegment.slice_from_file(noise_fname, start,
                                                  finish)

        if len(noise) < len(audio_segment):
            # This is to ensure that the noise clip is at least as
            # long as the audio segment.
            num_samples_to_pad = len(audio_segment) - len(noise)
            # Padding this amount of silence on both ends ensures that
            # the placement of the noise clip is uniformly random.
            silence = SpeechSegment(
                np.zeros(num_samples_to_pad), audio_segment.sample_rate)
            noise = SpeechSegment.concatenate(silence, noise, silence)

        audio_segment.add_noise(
            noise, snr, rng=self.rng, allow_downsampling=allow_downsampling)
deep_speech_2/data_utils/augmentor/online_bayesian_normalization.py
已删除
100755 → 0
浏览文件 @
a84bdf64
""" Online bayesian normalization
"""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
from
.
import
base
class OnlineBayesianNormalizationAugmentor(base.AugmentorBase):
    """Online bayesian normalization module.

    Every parameter other than ``rng`` is a callable that is invoked with
    the iteration number and returns the scalar to use.

    :param rng: Random generator object (stored, not used by this class).
    :param target_db: Target RMS value in decibels.
    :type target_db: func[int->scalar]
    :param prior_db: Prior RMS estimate in decibels.
    :type prior_db: func[int->scalar]
    :param prior_samples: Prior strength in number of samples.
    :type prior_samples: func[int->scalar]
    :param startup_delay: Start-up delay in seconds during which
                          normalization statistics is accrued.
    :type startup_delay: func[int->scalar]
    """

    def __init__(self,
                 rng,
                 target_db,
                 prior_db,
                 prior_samples,
                 startup_delay=base.parse_parameter_from(0.0)):
        self.target_db = target_db
        self.prior_db = prior_db
        self.prior_samples = prior_samples
        self.startup_delay = startup_delay
        self.rng = rng

    def transform_audio(self, audio_segment, iteration=0):
        """Normalize the input audio using the online Bayesian approach.

        :param audio_segment: Input audio; its
                              ``normalize_online_bayesian`` method is
                              called.
        :type audio_segment: SpeechSegment
        :param iteration: Value passed to the parameter callables. The
                          original body read an undefined ``iteration``
                          name; this optional argument keeps one-argument
                          callers working.
                          NOTE(review): confirm 0 is a sensible default.
        :type iteration: int
        """
        target_db = self.target_db(iteration)
        prior_db = self.prior_db(iteration)
        prior_samples = self.prior_samples(iteration)
        startup_delay = self.startup_delay(iteration)
        # Operate on the argument; the original used the undefined name
        # `audio`.
        audio_segment.normalize_online_bayesian(
            target_db, prior_db, prior_samples, startup_delay=startup_delay)
deep_speech_2/data_utils/augmentor/resampler.py
已删除
100755 → 0
浏览文件 @
a84bdf64
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
from
.
import
base
class ResamplerAugmentor(base.AugmentorBase):
    """Resampler module.

    :param rng: Random generator object (stored, not used by this class).
    :type rng: random.Random
    :param new_sample_rate: New sample rate in Hz; passed unchanged to
                            ``resample`` of the audio segment.
    """

    def __init__(self, rng, new_sample_rate):
        self.new_sample_rate = new_sample_rate
        self._rng = rng

    def transform_audio(self, audio_segment):
        """Resample the input audio to the target sample rate.

        :param audio_segment: Input audio; its ``resample`` method is
                              called with ``new_sample_rate``.
        """
        # Operate on the argument; the original used the undefined name
        # `audio`, which raised NameError on every call.
        audio_segment.resample(self.new_sample_rate)
\ No newline at end of file
deep_speech_2/data_utils/augmentor/speed_perturb.py
已删除
100755 → 0
浏览文件 @
a84bdf64
"""Speed perturbation module for making ASR robust to different voice
types (high pitched, low pitched, etc)
Samples uniformly between speed_min and speed_max
"""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
from
.
import
base
class SpeedPerturbatioAugmentor(base.AugmentorBase):
    """Speed perturbation module.

    See reference paper here:
    http://www.danielpovey.com/files/2015_interspeech_augmentation.pdf

    Bounds may be given as plain numbers or as callables taking the
    iteration number. Numeric bounds are validated at construction time.

    :param rng: Random generator used to sample the speed rate.
    :param speed_min: Lower bound on new rate to sample.
    :type speed_min: scalar or func[int->scalar]
    :param speed_max: Upper bound on new rate to sample.
    :type speed_max: scalar or func[int->scalar]
    :raises ValueError: If a numeric ``speed_min`` is below 0.9 or a
                        numeric ``speed_max`` is above 1.1.
    """

    def __init__(self, rng, speed_min, speed_max):
        if not callable(speed_min) and speed_min < 0.9:
            raise ValueError(
                "Sampling speed below 0.9 can cause unnatural effects")
        # The original tested speed_min here, so the upper bound was never
        # actually validated.
        if not callable(speed_max) and speed_max > 1.1:
            raise ValueError(
                "Sampling speed above 1.1 can cause unnatural effects")
        self.speed_min = speed_min
        self.speed_max = speed_max
        self.rng = rng

    @staticmethod
    def _resolve(bound, iteration):
        """Return ``bound(iteration)`` for callables, else ``bound``."""
        return bound(iteration) if callable(bound) else bound

    def transform_audio(self, audio_segment, iteration=0):
        """Sample a speed rate and change the speed of the audio clip.

        :param audio_segment: Input audio; its ``change_speed`` method is
                              called with the sampled rate.
        :param iteration: Value passed to callable bounds. The original
                          body read an undefined ``iteration`` name; this
                          optional argument keeps one-argument callers
                          working.
        :type iteration: int
        """
        speed_min = self._resolve(self.speed_min, iteration)
        speed_max = self._resolve(self.speed_max, iteration)
        # Use the instance generator; the original used a bare `rng`.
        sampled_speed = self.rng.uniform(speed_min, speed_max)
        # NOTE(review): presumably change_speed modifies the segment in
        # place - confirm. The original rebound an undefined local `audio`,
        # which could not affect the argument.
        audio_segment.change_speed(sampled_speed)
deep_speech_2/data_utils/augmentor/volume_perturb.py
浏览文件 @
42ba74ef
...
@@ -3,10 +3,10 @@ from __future__ import absolute_import
...
@@ -3,10 +3,10 @@ from __future__ import absolute_import
from
__future__
import
division
from
__future__
import
division
from
__future__
import
print_function
from
__future__
import
print_function
from
.
import
b
ase
from
data_utils.augmentor.base
import
AugmentorB
ase
class
VolumePerturbAugmentor
(
base
.
AugmentorBase
):
class
VolumePerturbAugmentor
(
AugmentorBase
):
"""Augmentation model for adding random volume perturbation.
"""Augmentation model for adding random volume perturbation.
This is used for multi-loudness training of PCEN. See
This is used for multi-loudness training of PCEN. See
...
...
deep_speech_2/requirements.txt
浏览文件 @
42ba74ef
SoundFile==0.9.0.post1
SoundFile==0.9.0.post1
wget==3.2
wget==3.2
scikits.samplerate==0.3.3
scipy==0.13.0b1
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录