Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
mrywhh
Real-Time-Voice-Cloning
提交
0396cdc3
R
Real-Time-Voice-Cloning
项目概览
mrywhh
/
Real-Time-Voice-Cloning
落后 Fork 源项目 12 个版本
从无法访问的项目Fork
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
R
Real-Time-Voice-Cloning
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
0396cdc3
编写于
4月 13, 2019
作者:
C
Corentin Jemine
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Reimplemented the preprocessing routine for the encoder
上级
605c4681
变更
3
隐藏空白更改
内联
并排
Showing
3 changed file
with
120 addition
and
162 deletion
+120
-162
SV2TTS/encoder/config.py
SV2TTS/encoder/config.py
+4
-9
SV2TTS/encoder/preprocess.py
SV2TTS/encoder/preprocess.py
+108
-146
SV2TTS/encoder_preprocess.py
SV2TTS/encoder_preprocess.py
+8
-7
未找到文件。
SV2TTS/
global_
config.py
→
SV2TTS/
encoder/
config.py
浏览文件 @
0396cdc3
import
torch
librispeech_datasets
=
{
"train"
:
{
"clean"
:
[
"LibriSpeech/train-clean-100"
,
"LibriSpeech/train-clean-360"
],
...
...
@@ -45,10 +43,7 @@ other_datasets = [
]
anglophone_nationalites
=
[
"australia"
,
"canada"
,
"ireland"
,
"uk"
,
"usa"
]
clean_data_root
=
"E://Datasets//SpeakerEncoder"
all_datasets
=
librispeech_datasets
+
voxceleb_datasets
demo_datasets_root
=
"E://Datasets"
model_dir
=
fileio
.
join
(
project_root
,
"saved_models"
)
device
=
torch
.
device
(
"cuda"
if
torch
.
cuda
.
is_available
()
else
"cpu"
)
#
# model_dir = fileio.join(project_root, "saved_models")
#
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
SV2TTS/encoder/preprocess.py
浏览文件 @
0396cdc3
from
pathos.multiprocessing
import
ThreadPool
from
encoder.params_data
import
*
from
encoder.config
import
librispeech_datasets
,
anglophone_nationalites
from
datetime
import
datetime
from
encoder
import
audio
from
pathlib
import
Path
from
tqdm
import
tqdm
import
numpy
as
np
import
sys
import
os
class
DatasetLog
:
...
...
@@ -12,7 +13,7 @@ class DatasetLog:
Registers metadata about the dataset in a text file.
"""
def
__init__
(
self
,
root
,
name
):
self
.
text_file
=
open
(
fileio
.
join
(
root
,
"log_%s.txt"
%
name
),
'w'
)
self
.
text_file
=
open
(
Path
(
root
,
"Log_%s.txt"
%
name
.
replace
(
"/"
,
"_"
)),
"w"
)
self
.
sample_data
=
dict
()
start_time
=
str
(
datetime
.
now
().
strftime
(
"%A %d %B %Y at %H:%M"
))
...
...
@@ -23,7 +24,7 @@ class DatasetLog:
def
_log_params
(
self
):
from
encoder
import
params_data
self
.
write_line
(
"Parameter values:"
)
for
param_name
in
(
p
for
p
in
dir
(
params_data
)
if
not
p
.
startswith
(
'__'
)):
for
param_name
in
(
p
for
p
in
dir
(
params_data
)
if
not
p
.
startswith
(
"__"
)):
value
=
getattr
(
params_data
,
param_name
)
self
.
write_line
(
"
\t
%s: %s"
%
(
param_name
,
value
))
self
.
write_line
(
"-----"
)
...
...
@@ -47,166 +48,127 @@ class DatasetLog:
end_time
=
str
(
datetime
.
now
().
strftime
(
"%A %d %B %Y at %H:%M"
))
self
.
write_line
(
"Finished on %s"
%
end_time
)
self
.
text_file
.
close
()
def
preprocess_librispeech
(
n_speakers
=
None
,
n_utterances
=
None
):
fileio
.
ensure_dir
(
clean_data_root
)
for
dataset_name
in
librispeech_datasets
:
dataset_root
=
fileio
.
join
(
librispeech_root
,
dataset_name
)
out_dir
=
fileio
.
ensure_dir
(
fileio
.
join
(
clean_data_root
,
dataset_name
))
logger
=
DatasetLog
(
clean_data_root
,
dataset_name
)
# Get the speaker directories
speaker_ids
=
fileio
.
listdir
(
dataset_root
,
numerical_sorting
=
True
)[:
n_speakers
]
print
(
"Librispeech: Preprocessing data for %d speakers."
%
len
(
speaker_ids
))
# Function to preprocess utterances for one speaker
def
preprocess_speaker
(
speaker_id
):
print
(
"Starting speaker %s"
%
speaker_id
)
speaker_name
=
"LibriSpeech_%s_%s"
%
(
dataset_name
,
speaker_id
)
speaker_in_dir
=
fileio
.
join
(
dataset_root
,
speaker_id
)
speaker_out_dir
=
fileio
.
ensure_dir
(
fileio
.
join
(
out_dir
,
speaker_name
))
fileio
.
resetdir
(
speaker_out_dir
)
sources_file
=
open
(
fileio
.
join
(
speaker_out_dir
,
"sources.txt"
),
'w'
)
fpaths
=
fileio
.
get_files
(
speaker_in_dir
,
r
"\.flac"
,
recursive
=
True
)[:
n_utterances
]
for
i
,
in_fpath
in
enumerate
(
fpaths
):
# Load and preprocess the waveform
wave
=
audio
.
load
(
in_fpath
)
wave
=
audio
.
preprocess_wave
(
wave
)
# Create and save the mel spectrogram
frames
=
audio
.
wave_to_mel_filterbank
(
wave
)
if
len
(
frames
)
<
partial_utterance_n_frames
:
continue
fname
=
fileio
.
leaf
(
in_fpath
).
replace
(
".flac"
,
".npy"
)
out_fpath
=
fileio
.
join
(
speaker_out_dir
,
fname
)
np
.
save
(
out_fpath
,
frames
)
logger
.
add_sample
(
duration
=
len
(
wave
)
/
sampling_rate
)
sources_file
.
write
(
"%s %s
\n
"
%
(
fname
,
in_fpath
))
sources_file
.
close
()
print
(
"Speaker %s done!"
%
speaker_id
)
# Process the utterances for each speaker
with
ThreadPool
(
8
)
as
pool
:
list
(
pool
.
imap
(
preprocess_speaker
,
speaker_ids
))
logger
.
finalize
()
def _init_preprocess_dataset(dataset_name, datasets_root, out_dir) -> (Path, DatasetLog):
    """
    Locates a dataset on disk and opens the log used while preprocessing it.

    :param dataset_name: name of the dataset, joined to datasets_root to find its directory.
    :param datasets_root: directory under which the dataset is looked up.
    :param out_dir: directory passed to DatasetLog as the root for the log.
    :return: the dataset directory and a new DatasetLog, or (None, None) when the dataset
    directory does not exist (a message is printed in that case).
    """
    dataset_root = datasets_root.joinpath(dataset_name)

    # Happy path first: the directory is there, so a log can be opened for it
    if dataset_root.exists():
        dataset_log = DatasetLog(out_dir, dataset_name)
        return dataset_root, dataset_log

    # Nothing to preprocess, let the caller know through a pair of None
    print("Couldn\'t find %s, skipping this dataset." % dataset_root)
    return None, None
def
preprocess_voxceleb1
(
n_speakers
=
None
,
n_utterances
=
None
):
fileio
.
ensure_dir
(
clean_data_root
)
dataset_name
=
"voxceleb1"
out_dir
=
fileio
.
ensure_dir
(
fileio
.
join
(
clean_data_root
,
dataset_name
))
logger
=
DatasetLog
(
clean_data_root
,
dataset_name
)
def
_preprocess_speaker_dirs
(
speaker_dirs
,
dataset_name
,
datasets_root
,
out_dir
,
extension
,
skip_existing
,
logger
):
speaker_dirs
=
speaker_dirs
[:
10
]
# TODO
print
(
"%s: Preprocessing data for %d speakers."
%
(
dataset_name
,
len
(
speaker_dirs
)))
# Get the contents of the meta file
metadata
=
fileio
.
read_all_lines
(
fileio
.
join
(
voxceleb1_root
,
"vox1_meta.csv"
))[
1
:]
metadata_fields
=
[
line
.
split
(
'
\t
'
)
for
line
in
metadata
]
# Select the ID and the nationality, filter out non-anglophone speakers
nationalities
=
{
line
[
0
]:
line
[
3
]
for
line
in
metadata_fields
}
speaker_ids
=
[
speaker_id
for
speaker_id
,
nationality
in
nationalities
.
items
()
if
nationality
.
lower
()
in
anglophone_nationalites
]
speaker_ids
=
speaker_ids
[:
n_speakers
]
print
(
"VoxCeleb1: using samples from %d (assumed anglophone) speakers out of %d."
%
(
len
(
speaker_ids
),
len
(
nationalities
)))
# Get the speaker directories
speakers_root
=
fileio
.
join
(
voxceleb1_root
,
"wav"
)
disk_speaker_ids
=
fileio
.
listdir
(
speakers_root
)
speaker_ids_len
=
len
(
speaker_ids
)
speaker_ids
=
list
(
filter
(
lambda
s_id
:
s_id
in
disk_speaker_ids
,
speaker_ids
))
print
(
"Found %d speakers on the disk, %d missing (this is normal)."
%
(
len
(
speaker_ids
),
speaker_ids_len
-
len
(
speaker_ids
)))
print
(
"Preprocessing data for %d speakers."
%
len
(
speaker_ids
))
# Function to preprocess utterances for one speaker
def
preprocess_speaker
(
speaker_id
):
print
(
"Starting speaker %s"
%
speaker_id
)
speaker_name
=
"VoxCeleb1_%s"
%
speaker_id
speaker_in_dir
=
fileio
.
join
(
speakers_root
,
speaker_id
)
speaker_out_dir
=
fileio
.
ensure_dir
(
fileio
.
join
(
out_dir
,
speaker_name
))
fileio
.
resetdir
(
speaker_out_dir
)
sources_file
=
open
(
fileio
.
join
(
speaker_out_dir
,
"sources.txt"
),
'w'
)
def
preprocess_speaker
(
speaker_dir
:
Path
):
# Give a name to the speaker that includes its dataset
speaker_name
=
"_"
.
join
(
speaker_dir
.
relative_to
(
datasets_root
).
parts
)
# Create an output directory with that name, as well as a txt file containing a
# reference to each source file.
speaker_out_dir
=
out_dir
.
joinpath
(
speaker_name
)
speaker_out_dir
.
mkdir
(
exist_ok
=
True
)
sources_fpath
=
speaker_out_dir
.
joinpath
(
"sources.txt"
)
fpaths
=
fileio
.
get_files
(
speaker_in_dir
,
r
"\.wav"
,
recursive
=
True
)[:
n_utterances
]
for
i
,
in_fpath
in
enumerate
(
fpaths
):
# There's a possibility that the preprocessing was interrupted earlier, check if
# there already is a sources file.
if
sources_fpath
.
exists
():
with
sources_fpath
.
open
(
"r"
)
as
sources_file
:
existing_fnames
=
{
line
.
split
(
":"
)[
0
]
for
line
in
sources_file
}
else
:
existing_fnames
=
{}
# Gather all audio files for that speaker recursively
sources_file
=
sources_fpath
.
open
(
"a"
)
for
in_fpath
in
speaker_dir
.
glob
(
"**/*.%s"
%
extension
):
# Check if the target output file already exists
out_fname
=
"_"
.
join
(
in_fpath
.
relative_to
(
speaker_dir
).
parts
)
out_fname
=
out_fname
.
replace
(
".%s"
%
extension
,
".npy"
)
if
skip_existing
and
out_fname
in
existing_fnames
:
continue
# Load and preprocess the waveform
wave
=
audio
.
load
(
in_fpath
)
wave
=
audio
.
preprocess_wave
(
wave
)
wav
=
audio
.
load
(
in_fpath
)
wav
=
audio
.
preprocess_wave
(
wav
)
if
len
(
wav
)
==
0
:
continue
# Create
and save the mel spectrogram
frames
=
audio
.
wave_to_mel_filterbank
(
wav
e
)
# Create
the mel spectrogram, discard those that are too short
frames
=
audio
.
wave_to_mel_filterbank
(
wav
)
if
len
(
frames
)
<
partial_utterance_n_frames
:
continue
video_id
=
fileio
.
leaf
(
fileio
.
leafdir
(
in_fpath
))
fname
=
video_id
+
'_'
+
fileio
.
leaf
(
in_fpath
).
replace
(
".wav"
,
".npy"
)
out_fpath
=
fileio
.
join
(
speaker_out_dir
,
fname
)
np
.
save
(
out_fpath
,
frames
)
logger
.
add_sample
(
duration
=
len
(
wave
)
/
sampling_rate
)
sources_file
.
write
(
"%s %s
\n
"
%
(
fname
,
in_fpath
))
out_fpath
=
speaker_out_dir
.
joinpath
(
out_fname
)
np
.
save
(
out_fpath
,
frames
)
logger
.
add_sample
(
duration
=
len
(
wav
)
/
sampling_rate
)
sources_file
.
write
(
"%s:%s
\n
"
%
(
out_fname
,
in_fpath
))
sources_file
.
close
()
print
(
"Speaker %s done!"
%
speaker_id
)
# Process the utterances for each speaker
with
ThreadPool
(
8
)
as
pool
:
list
(
pool
.
imap
(
preprocess_speaker
,
sorted
(
speaker_ids
)))
list
(
tqdm
(
pool
.
imap
(
preprocess_speaker
,
speaker_dirs
),
desc
=
dataset_name
,
unit
=
" speakers done"
))
logger
.
finalize
()
print
(
"Done preprocessing %s.
\n
"
%
dataset_name
)
def
preprocess_voxceleb2
(
n_speakers
=
None
,
n_utterances
=
None
):
fileio
.
ensure_dir
(
clean_data_root
)
dataset_name
=
"voxceleb2"
out_dir
=
fileio
.
ensure_dir
(
fileio
.
join
(
clean_data_root
,
dataset_name
))
logger
=
DatasetLog
(
clean_data_root
,
dataset_name
)
def preprocess_librispeech(datasets_root: Path, out_dir: Path, skip_existing=False):
    """
    Preprocesses every LibriSpeech subset listed under librispeech_datasets["train"]["other"].

    For each subset, its directory is resolved under datasets_root, the direct children of that
    directory are taken as the speaker directories, and they are handed to
    _preprocess_speaker_dirs with the "flac" extension.

    :param datasets_root: directory that contains the raw datasets.
    :param out_dir: directory passed on for the preprocessed output and the dataset log.
    :param skip_existing: forwarded to _preprocess_speaker_dirs.
    """
    for dataset_name in librispeech_datasets["train"]["other"]:
        # Initialize the preprocessing
        dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
        if not dataset_root:
            # A missing subset only skips itself. Returning here would silently drop every
            # subset that comes after it in the list.
            continue

        # Preprocess all speakers
        speaker_dirs = list(dataset_root.glob("*"))
        _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "flac",
                                 skip_existing, logger)
def
preprocess_voxceleb1
(
datasets_root
:
Path
,
out_dir
:
Path
,
skip_existing
=
False
):
# Initialize the preprocessing
dataset_name
=
"VoxCeleb1"
dataset_root
,
logger
=
_init_preprocess_dataset
(
dataset_name
,
datasets_root
,
out_dir
)
if
not
dataset_root
:
return
# Get the contents of the meta file
with
dataset_root
.
joinpath
(
"vox1_meta.csv"
).
open
(
"r"
)
as
metafile
:
metadata
=
[
line
.
split
(
"
\t
"
)
for
line
in
metafile
][
1
:]
# Get the speaker directories
speakers_root
=
fileio
.
join
(
voxceleb2_root
,
"dev"
,
"aac"
)
speaker_ids
=
fileio
.
listdir
(
speakers_root
)[:
n_speakers
]
print
(
"Preprocessing data for %d speakers."
%
len
(
speaker_ids
))
# Select the ID and the nationality, filter out non-anglophone speakers
nationalities
=
{
line
[
0
]:
line
[
3
]
for
line
in
metadata
}
keep_speaker_ids
=
[
speaker_id
for
speaker_id
,
nationality
in
nationalities
.
items
()
if
nationality
.
lower
()
in
anglophone_nationalites
]
print
(
"VoxCeleb1: using samples from %d (assumed anglophone) speakers out of %d."
%
(
len
(
keep_speaker_ids
),
len
(
nationalities
)))
# Function to preprocess utterances for one speaker
def
preprocess_speaker
(
speaker_id
):
print
(
"Starting speaker %s"
%
speaker_id
)
speaker_name
=
"VoxCeleb2_%s"
%
speaker_id
speaker_in_dir
=
fileio
.
join
(
speakers_root
,
speaker_id
)
speaker_out_dir
=
fileio
.
ensure_dir
(
fileio
.
join
(
out_dir
,
speaker_name
))
fileio
.
resetdir
(
speaker_out_dir
)
sources_file
=
open
(
fileio
.
join
(
speaker_out_dir
,
"sources.txt"
),
'w'
)
fpaths
=
fileio
.
get_files
(
speaker_in_dir
,
r
"\.m4a"
,
recursive
=
True
)[:
n_utterances
]
for
i
,
in_fpath
in
enumerate
(
fpaths
):
# Load and preprocess the waveform
wave
=
audio
.
load
(
in_fpath
)
wave
=
audio
.
preprocess_wave
(
wave
)
if
len
(
wave
)
==
0
:
print
(
'Warning: audio file %s is entirely silent after processing.'
%
in_fpath
,
file
=
sys
.
stderr
)
continue
# Create and save the mel spectrogram
frames
=
audio
.
wave_to_mel_filterbank
(
wave
)
if
len
(
frames
)
<
partial_utterance_n_frames
:
continue
video_id
=
fileio
.
leaf
(
fileio
.
leafdir
(
in_fpath
))
fname
=
video_id
+
'_'
+
fileio
.
leaf
(
in_fpath
).
replace
(
".m4a"
,
".npy"
)
out_fpath
=
fileio
.
join
(
speaker_out_dir
,
fname
)
np
.
save
(
out_fpath
,
frames
)
logger
.
add_sample
(
duration
=
len
(
wave
)
/
sampling_rate
)
sources_file
.
write
(
"%s %s
\n
"
%
(
fname
,
in_fpath
))
sources_file
.
close
()
print
(
"Speaker %s done!"
%
speaker_id
)
# Get the speaker directories for anglophone speakers only
speaker_dirs
=
dataset_root
.
joinpath
(
"wav"
).
glob
(
"*"
)
speaker_dirs
=
[
speaker_dir
for
speaker_dir
in
speaker_dirs
if
speaker_dir
.
name
in
keep_speaker_ids
]
print
(
"Found %d anglophone speakers on the disk, %d missing (this is normal)."
%
(
len
(
speaker_dirs
),
len
(
keep_speaker_ids
)
-
len
(
speaker_dirs
)))
# Preprocess all speakers
_preprocess_speaker_dirs
(
speaker_dirs
,
dataset_name
,
datasets_root
,
out_dir
,
"wav"
,
skip_existing
,
logger
)
def
preprocess_voxceleb2
(
datasets_root
:
Path
,
out_dir
:
Path
,
skip_existing
=
False
):
# Initialize the preprocessing
dataset_name
=
"VoxCeleb2"
dataset_root
,
logger
=
_init_preprocess_dataset
(
dataset_name
,
datasets_root
,
out_dir
)
if
not
dataset_root
:
return
# Process the utterances for each speaker
with
ThreadPool
(
8
)
as
pool
:
list
(
pool
.
imap
(
preprocess_speaker
,
speaker_ids
))
logger
.
finalize
()
# Get the speaker directories
# Preprocess all speakers
speaker_dirs
=
list
(
dataset_root
.
joinpath
(
"dev"
,
"aac"
).
glob
(
"*"
))
_preprocess_speaker_dirs
(
speaker_dirs
,
dataset_name
,
datasets_root
,
out_dir
,
"m4a"
,
skip_existing
,
logger
)
SV2TTS/encoder_preprocess.py
浏览文件 @
0396cdc3
from
encoder.preprocess
import
preprocess_librispeech
,
preprocess_voxceleb1
,
preprocess_voxceleb2
from
pathlib
import
Path
import
argparse
import
os
if
__name__
==
"__main__"
:
parser
=
argparse
.
ArgumentParser
(
...
...
@@ -23,17 +23,18 @@ if __name__ == "__main__":
args
=
vars
(
parser
.
parse_args
())
# Reformat the arguments
args
[
"datasets_root"
]
=
Path
(
args
[
"datasets_root"
])
args
[
"datasets"
]
=
args
[
"datasets"
].
split
(
","
)
if
not
hasattr
(
args
,
"out_dir"
):
args
[
"out_dir"
]
=
os
.
path
.
join
(
args
[
"datasets_root"
],
"SV2TTS"
,
"encoder"
)
os
.
makedirs
(
args
[
"out_dir"
],
exist_ok
=
True
)
args
[
"
datasets"
]
=
args
[
"datasets"
].
split
(
","
)
args
[
"out_dir"
]
=
Path
(
args
[
"datasets_root"
],
"SV2TTS"
,
"encoder"
)
args
[
"out_dir"
]
=
Path
(
args
[
"out_dir"
]
)
args
[
"
out_dir"
].
mkdir
(
exist_ok
=
True
)
# Preprocess the datasets
preprocess_func
=
{
"librispeech_other"
:
preprocess_librispeech
()
,
"voxceleb1"
:
preprocess_voxceleb1
()
,
"voxceleb2"
:
preprocess_voxceleb2
()
,
"librispeech_other"
:
preprocess_librispeech
,
"voxceleb1"
:
preprocess_voxceleb1
,
"voxceleb2"
:
preprocess_voxceleb2
,
}
for
dataset
in
args
.
pop
(
"datasets"
):
print
(
"Preprocessing %s"
%
dataset
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录