Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
mrywhh
Real-Time-Voice-Cloning
提交
a0c4f750
R
Real-Time-Voice-Cloning
项目概览
mrywhh
/
Real-Time-Voice-Cloning
落后 Fork 源项目 12 个版本
从无法访问的项目Fork
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
R
Real-Time-Voice-Cloning
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
前往新版Gitcode,体验更适合开发者的 AI 搜索 >>
提交
a0c4f750
编写于
6月 02, 2019
作者:
C
Corentin Jemine
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Implemented noise removal based on noise profile
上级
c4e753ff
变更
3
隐藏空白更改
内联
并排
Showing
3 changed files
with
286 additions
and
8 deletions
+286
-8
sv2tts/LICENSE.txt
sv2tts/LICENSE.txt
+1
-0
sv2tts/synthesizer/preprocess.py
sv2tts/synthesizer/preprocess.py
+16
-8
sv2tts/utils/logmmse.py
sv2tts/utils/logmmse.py
+269
-0
未找到文件。
sv2tts/LICENSE.txt
浏览文件 @
a0c4f750
...
...
@@ -3,6 +3,7 @@ MIT License
Modified & original work Copyright (c) 2019 Corentin Jemine (https://github.com/CorentinJ)
Original work Copyright (c) 2018 Rayhane Mama (https://github.com/Rayhane-mamah)
Original work Copyright (c) 2019 fatchord (https://github.com/fatchord)
Original work Copyright (c) 2015 braindead (https://github.com/braindead)
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
...
...
sv2tts/synthesizer/preprocess.py
浏览文件 @
a0c4f750
from
synthesizer
import
audio
from
multiprocessing.pool
import
Pool
from
synthesizer
import
audio
from
functools
import
partial
from
itertools
import
chain
from
encoder
import
inference
as
encoder
from
pathlib
import
Path
from
utils
import
logmmse
from
tqdm
import
tqdm
import
numpy
as
np
import
librosa
...
...
@@ -89,14 +90,22 @@ def split_on_silences(wav_fpath, words, end_times, hparams):
start_times
=
np
.
array
([
0.0
]
+
end_times
[:
-
1
])
end_times
=
np
.
array
(
end_times
)
assert
len
(
words
)
==
len
(
end_times
)
==
len
(
start_times
)
assert
words
[
0
]
==
''
and
words
[
-
1
]
==
''
assert
words
[
0
]
==
""
and
words
[
-
1
]
==
""
#
Break the sentence on
pauses that are too long
mask
=
(
words
==
''
)
&
(
end_times
-
start_times
>=
hparams
.
silence_min_duration_split
)
#
Find
pauses that are too long
mask
=
(
words
==
""
)
&
(
end_times
-
start_times
>=
hparams
.
silence_min_duration_split
)
mask
[
0
]
=
mask
[
-
1
]
=
True
breaks
=
np
.
where
(
mask
)[
0
]
# Profile the noise from the silences and perform noise reduction on the waveform
silence_times
=
[[
start_times
[
i
],
end_times
[
i
]]
for
i
in
breaks
]
silence_times
=
(
np
.
array
(
silence_times
)
*
hparams
.
sample_rate
).
astype
(
np
.
int
)
noisy_wav
=
np
.
concatenate
([
wav
[
stime
[
0
]:
stime
[
1
]]
for
stime
in
silence_times
])
if
len
(
noisy_wav
)
>
hparams
.
sample_rate
*
0.02
:
profile
=
logmmse
.
profile_noise
(
noisy_wav
,
hparams
.
sample_rate
)
wav
=
logmmse
.
denoise
(
wav
,
profile
,
eta
=
0
)
# Re-attach segments that are too short
breaks
=
np
.
where
(
mask
)[
0
]
segments
=
list
(
zip
(
breaks
[:
-
1
],
breaks
[
1
:]))
segment_durations
=
[
start_times
[
end
]
-
end_times
[
start
]
for
start
,
end
in
segments
]
i
=
0
...
...
@@ -124,11 +133,10 @@ def split_on_silences(wav_fpath, words, end_times, hparams):
segment_times
=
[[
end_times
[
start
],
start_times
[
end
]]
for
start
,
end
in
segments
]
segment_times
=
(
np
.
array
(
segment_times
)
*
hparams
.
sample_rate
).
astype
(
np
.
int
)
wavs
=
[
wav
[
segment_time
[
0
]:
segment_time
[
1
]]
for
segment_time
in
segment_times
]
texts
=
[
' '
.
join
(
words
[
start
+
1
:
end
]).
replace
(
" "
,
" "
)
for
start
,
end
in
segments
]
texts
=
[
" "
.
join
(
words
[
start
+
1
:
end
]).
replace
(
" "
,
" "
)
for
start
,
end
in
segments
]
# # DEBUG: play the audio segments
# # DEBUG: play the audio segments
(run with -n=1)
# import sounddevice as sd
# print("From %s" % audio_fpath)
# if len(wavs) > 1:
# print("This sentence was split in %d segments:" % len(wavs))
# else:
...
...
sv2tts/utils/logmmse.py
0 → 100644
浏览文件 @
a0c4f750
# The MIT License (MIT)
#
# Copyright (c) 2015 braindead
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
#
# This code was extracted from the logmmse package (https://pypi.org/project/logmmse/) and I
# simply modified the interface to meet my needs.
import
numpy
as
np
import
math
from
scipy.special
import
expn
from
collections
import
namedtuple
# Make every floating-point error (divide, over, under, invalid) raise
# immediately so numerical problems in the denoising math are not silent.
np.seterr(all="raise")

# Immutable bundle of noise characteristics: produced by profile_noise()
# and consumed by denoise().
NoiseProfile = namedtuple(
    "NoiseProfile",
    ["sampling_rate", "window_size", "len1", "len2", "win", "n_fft", "noise_mu2"],
)
def profile_noise(noise, sampling_rate, window_size=0):
    """
    Creates a profile of the noise in a given waveform.

    :param noise: a waveform containing noise ONLY, as a numpy array of floats or ints.
    :param sampling_rate: the sampling rate of the audio
    :param window_size: the size of the window the logmmse algorithm operates on. A default value
    will be picked if left as 0.
    :return: a NoiseProfile object
    """
    noise, dtype = to_float(noise)
    # Add the epsilon out of place: to_float() returns the input array itself
    # when it is already float64, so an in-place "+=" here would mutate the
    # caller's data.
    noise = noise + np.finfo(np.float64).eps

    # Default analysis window: 20 ms of audio, forced to an even sample count
    # so it splits cleanly into two half-overlapping segments.
    if window_size == 0:
        window_size = int(math.floor(0.02 * sampling_rate))
    if window_size % 2 == 1:
        window_size = window_size + 1

    # 50% overlap between consecutive windows.
    perc = 50
    len1 = int(math.floor(window_size * perc / 100))
    len2 = int(window_size - len1)

    # Hanning window scaled so overlap-added frames preserve unit gain.
    win = np.hanning(window_size)
    win = win * len2 / np.sum(win)
    n_fft = 2 * window_size

    # Average magnitude spectrum over every complete window of the noise;
    # its square is the noise power estimate used by denoise().
    noise_mean = np.zeros(n_fft)
    n_frames = len(noise) // window_size
    for j in range(0, window_size * n_frames, window_size):
        noise_mean += np.absolute(np.fft.fft(win * noise[j:j + window_size], n_fft, axis=0))
    noise_mu2 = (noise_mean / n_frames) ** 2

    return NoiseProfile(sampling_rate, window_size, len1, len2, win, n_fft, noise_mu2)
def denoise(wav, noise_profile: NoiseProfile, eta=0.15):
    """
    Cleans the noise from a speech waveform given a noise profile. The waveform must have the
    same sampling rate as the one used to create the noise profile.

    :param wav: a speech waveform as a numpy array of floats or ints.
    :param noise_profile: a NoiseProfile object that was created from a similar (or a segment of
    the same) waveform.
    :param eta: voice threshold for noise update. While the voice activation detection value is
    below this threshold, the noise profile will be continuously updated throughout the audio.
    Set to 0 to disable updating the noise profile.
    :return: the clean wav as a numpy array of floats or ints of the same length.
    """
    wav, dtype = to_float(wav)
    # Out-of-place add: to_float() returns the caller's own array when it is
    # already float64, and an in-place "+=" would modify it as a side effect.
    wav = wav + np.finfo(np.float64).eps
    p = noise_profile

    # Number of half-window hops that fit entirely inside the waveform.
    nframes = int(math.floor(len(wav) / p.len2) - math.floor(p.window_size / p.len2))
    x_final = np.zeros(nframes * p.len2)

    # Smoothing constants of the log-MMSE estimator: aa smooths the a priori
    # SNR across frames, mu smooths the running noise update, ksi_min floors
    # the a priori SNR at -25 dB.
    aa = 0.98
    mu = 0.98
    ksi_min = 10 ** (-25 / 10)

    x_old = np.zeros(p.len1)
    xk_prev = np.zeros(p.len1)
    noise_mu2 = p.noise_mu2
    for k in range(0, nframes * p.len2, p.len2):
        insign = p.win * wav[k:k + p.window_size]

        spec = np.fft.fft(insign, p.n_fft, axis=0)
        sig = np.absolute(spec)
        sig2 = sig ** 2

        # A posteriori SNR, clamped to avoid overflow in the gain math.
        gammak = np.minimum(sig2 / noise_mu2, 40)

        # Decision-directed estimate of the a priori SNR.
        if xk_prev.all() == 0:
            ksi = aa + (1 - aa) * np.maximum(gammak - 1, 0)
        else:
            ksi = aa * xk_prev / noise_mu2 + (1 - aa) * np.maximum(gammak - 1, 0)
            ksi = np.maximum(ksi_min, ksi)

        # Voice activity detection: when the frame looks like noise, keep
        # refining the running noise estimate.
        log_sigma_k = gammak * ksi / (1 + ksi) - np.log(1 + ksi)
        vad_decision = np.sum(log_sigma_k) / p.window_size
        if vad_decision < eta:
            noise_mu2 = mu * noise_mu2 + (1 - mu) * sig2

        # Log-MMSE gain function applied to the spectrum.
        a = ksi / (1 + ksi)
        vk = a * gammak
        ei_vk = 0.5 * expn(1, vk)
        hw = a * np.exp(ei_vk)
        sig = sig * hw
        xk_prev = sig ** 2
        xi_w = np.fft.ifft(hw * spec, p.n_fft, axis=0)
        xi_w = np.real(xi_w)

        # Overlap-add reconstruction of the time-domain signal.
        x_final[k:k + p.len2] = x_old + xi_w[0:p.len1]
        x_old = xi_w[p.len1:p.window_size]

    # Convert back to the caller's dtype and pad the tail (lost to the
    # incomplete final window) so the output length matches the input.
    output = from_float(x_final, dtype)
    output = np.pad(output, (0, len(wav) - len(output)), mode="constant")
    return output
## This is the original code
# def mono_logmmse(data, sampling_rate, initial_noise=6, window_size=0, noise_threshold=0.15):
# data, dtype = to_float(data)
# data += np.finfo(np.float64).eps
#
# num_frames = len(data)
# chunk_size = int(np.floor(60 * sampling_rate))
# m_output = np.array([], dtype=dtype)
# saved_params = None
# frames_read = 0
# while frames_read < num_frames:
# frames = num_frames - frames_read if frames_read + chunk_size > num_frames else chunk_size
# signal = data[frames_read:frames_read + frames]
# frames_read = frames_read + frames
# _output, saved_params = _logmmse(signal, sampling_rate, initial_noise, window_size,
# noise_threshold, saved_params)
# m_output = np.concatenate((m_output, from_float(_output, dtype)))
# return np.array(m_output).T
#
#
# def _logmmse(x, sampling_rate, noise_frames=6, slen=0, eta=0.15, saved_params=None):
# if slen == 0:
# slen = int(math.floor(0.02 * sampling_rate))
#
# if slen % 2 == 1:
# slen = slen + 1
#
# perc = 50
# len1 = int(math.floor(slen * perc / 100))
# len2 = int(slen - len1)
#
# win = np.hanning(slen)
# win = win * len2 / np.sum(win)
# n_fft = 2 * slen
#
# x_old = np.zeros(len1)
# xk_prev = np.zeros(len1)
# nframes = int(math.floor(len(x) / len2) - math.floor(slen / len2))
# xfinal = np.zeros(nframes * len2)
#
# if saved_params is None:
# noise_mean = np.zeros(n_fft)
# for j in range(0, slen * noise_frames, slen):
# noise_mean = noise_mean + np.absolute(np.fft.fft(win * x[j:j + slen], n_fft, axis=0))
# noise_mu2 = noise_mean / noise_frames ** 2
# else:
# noise_mu2 = saved_params['noise_mu2']
# xk_prev = saved_params['Xk_prev']
# x_old = saved_params['x_old']
#
# aa = 0.98
# mu = 0.98
# ksi_min = 10 ** (-25 / 10)
#
# for k in range(0, nframes * len2, len2):
# insign = win * x[k:k + slen]
#
# spec = np.fft.fft(insign, n_fft, axis=0)
# sig = np.absolute(spec)
# sig2 = sig ** 2
#
# gammak = np.minimum(sig2 / noise_mu2, 40)
#
# if xk_prev.all() == 0:
# ksi = aa + (1 - aa) * np.maximum(gammak - 1, 0)
# else:
# ksi = aa * xk_prev / noise_mu2 + (1 - aa) * np.maximum(gammak - 1, 0)
# ksi = np.maximum(ksi_min, ksi)
#
# log_sigma_k = gammak * ksi/(1 + ksi) - np.log(1 + ksi)
# vad_decision = np.sum(log_sigma_k) / slen
# if vad_decision < eta:
# noise_mu2 = mu * noise_mu2 + (1 - mu) * sig2
#
# a = ksi / (1 + ksi)
# vk = a * gammak
# ei_vk = 0.5 * expn(1, vk)
# hw = a * np.exp(ei_vk)
# sig = sig * hw
# xk_prev = sig ** 2
# xi_w = np.fft.ifft(hw * spec, n_fft, axis=0)
# xi_w = np.real(xi_w)
#
# xfinal[k:k + len2] = x_old + xi_w[0:len1]
# x_old = xi_w[len1:slen]
#
# return xfinal, {'noise_mu2': noise_mu2, 'Xk_prev': xk_prev, 'x_old': x_old}
def to_float(_input):
    """
    Converts a waveform to float64 with samples nominally in [-1, 1),
    remembering the original dtype so from_float() can invert the conversion.

    :param _input: a numpy array of dtype float64, float32, uint8, int16 or int32
    :return: a (float64 array, original dtype) tuple. Note: for float64 input
    the array returned is the input object itself, not a copy.
    :raises ValueError: if the input dtype is not supported
    """
    if _input.dtype == np.float64:
        return _input, _input.dtype
    elif _input.dtype == np.float32:
        return _input.astype(np.float64), _input.dtype
    elif _input.dtype == np.uint8:
        # Convert to float BEFORE subtracting the 128 offset: subtracting in
        # uint8 arithmetic wraps around for samples below 128 and silently
        # flips their sign.
        return (_input.astype(np.float64) - 128) / 128., _input.dtype
    elif _input.dtype == np.int16:
        return _input / 32768., _input.dtype
    elif _input.dtype == np.int32:
        return _input / 2147483648., _input.dtype
    raise ValueError('Unsupported wave file format')
def from_float(_input, dtype):
    """
    Converts a float64 waveform back to the given dtype, rescaling integer
    targets to their full range. Inverse of to_float().

    :param _input: a float64 numpy array with samples nominally in [-1, 1)
    :param dtype: the target numpy dtype (float64, float32, uint8, int16 or int32)
    :return: the waveform as a numpy array of the requested dtype
    :raises ValueError: if the requested dtype is not supported
    """
    if dtype == np.float64:
        # Return a bare array like every other branch. The previous version
        # returned an (array, dtype) tuple here, which broke callers (e.g.
        # np.pad in denoise()) that expect an array.
        return _input
    elif dtype == np.float32:
        return _input.astype(np.float32)
    elif dtype == np.uint8:
        return ((_input * 128) + 128).astype(np.uint8)
    elif dtype == np.int16:
        return (_input * 32768).astype(np.int16)
    elif dtype == np.int32:
        # (Leftover debug print removed.)
        return (_input * 2147483648).astype(np.int32)
    raise ValueError('Unsupported wave file format')
if __name__ == '__main__':
    # Ad-hoc manual check: denoise one LibriSpeech utterance and play the
    # result. Requires sounddevice, librosa and a local copy of the dataset
    # at the hard-coded path below.
    import sounddevice as sd
    import librosa

    fpath = r"E:\Datasets\LibriSpeech\train-clean-360\23\124439\23-124439-0003.flac"
    wav, sr = librosa.load(fpath)

    # Build the noise profile from hand-picked non-speech regions of this
    # particular file. (A dead "noise = wav[:10000]" assignment that was
    # immediately overwritten has been removed.)
    noise = np.concatenate((wav[:10000], wav[47000:65000], wav[90000:140000]))
    profile = profile_noise(noise, sr)
    wav = denoise(wav, profile)

    sd.play(wav, sr)
    sd.wait()
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录