Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
3568bb62
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 1 年 前同步成功
通知
207
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
未验证
提交
3568bb62
编写于
1月 06, 2022
作者:
小湉湉
提交者:
GitHub
1月 06, 2022
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #1278 from jerryuhoo/develop
[TTS] add multi-speaker support for finetuning hifigan vocoder
上级
e69abc92
318cc9e5
变更
5
隐藏空白更改
内联
并排
Showing
5 changed files
with
76 additions
and
102 deletions
+76
-102
examples/csmsc/voc3/finetune.sh
examples/csmsc/voc3/finetune.sh
+4
-2
examples/csmsc/voc5/finetune.sh
examples/csmsc/voc5/finetune.sh
+4
-2
examples/csmsc/voc5/local/link_wav.py
examples/csmsc/voc5/local/link_wav.py
+0
-85
paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py
paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py
+64
-9
utils/link_wav.py
utils/link_wav.py
+4
-4
未找到文件。
examples/csmsc/voc3/finetune.sh
浏览文件 @
3568bb62
...
...
@@ -15,11 +15,13 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--fastspeech2-stat
=
fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy
\
--dur-file
=
durations.txt
\
--output-dir
=
dump_finetune
\
--phones-dict
=
fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
--phones-dict
=
fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
\
--dataset
=
baker
\
--rootdir
=
~/datasets/BZNSYP/
fi
if
[
${
stage
}
-le
1
]
&&
[
${
stop_stage
}
-ge
1
]
;
then
python3
l
ocal
/l
ink_wav.py
\
python3 link_wav.py
\
--old-dump-dir
=
dump
\
--dump-dir
=
dump_finetune
fi
...
...
examples/csmsc/voc5/finetune.sh
浏览文件 @
3568bb62
...
...
@@ -15,11 +15,13 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--fastspeech2-stat
=
fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy
\
--dur-file
=
durations.txt
\
--output-dir
=
dump_finetune
\
--phones-dict
=
fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
--phones-dict
=
fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
\
--dataset
=
baker
\
--rootdir
=
~/datasets/BZNSYP/
fi
if
[
${
stage
}
-le
1
]
&&
[
${
stop_stage
}
-ge
1
]
;
then
python3
l
ocal
/l
ink_wav.py
\
python3 link_wav.py
\
--old-dump-dir
=
dump
\
--dump-dir
=
dump_finetune
fi
...
...
examples/csmsc/voc5/local/link_wav.py
已删除
100644 → 0
浏览文件 @
e69abc92
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
argparse
import
os
from
operator
import
itemgetter
from
pathlib
import
Path
import
jsonlines
import
numpy
as
np
def main():
    """Link ground-truth waveforms into a finetune dump and write its metadata.

    Command-line arguments:
        --old-dump-dir: existing dump directory holding
            ``<sub>/raw/<utt_id>_wave.npy`` for each split.
        --dump-dir: finetune dump directory holding
            ``<sub>/raw/<utt_id>_feats.npy`` for each split.

    For every split in ``train``, ``dev`` and ``test``, each ``*_feats.npy``
    file found under ``<dump-dir>/<sub>/raw`` gets a symlink to the matching
    waveform of the old dump, and ``<sub>/raw/metadata.jsonl`` is written with
    one record per utterance, sorted by ``utt_id``.

    Raises:
        NotADirectoryError: if ``--old-dump-dir`` is not a directory.
        ValueError: if both options resolve to the same directory.
    """
    feats_suffix = "_feats.npy"

    # parse config and args
    parser = argparse.ArgumentParser(
        description="Preprocess audio and then extract features .")
    parser.add_argument(
        "--old-dump-dir",
        type=str,
        # Required: a missing value used to reach Path(None) and crash with
        # an unhelpful TypeError.
        required=True,
        help="directory to dump feature files.")
    parser.add_argument(
        "--dump-dir",
        type=str,
        required=True,
        help="directory to finetune dump feature files.")
    args = parser.parse_args()

    # use absolute paths
    old_dump_dir = Path(args.old_dump_dir).expanduser().resolve()
    dump_dir = Path(args.dump_dir).expanduser().resolve()
    dump_dir.mkdir(parents=True, exist_ok=True)

    # Explicit checks instead of ``assert``, which is stripped under ``-O``.
    if not old_dump_dir.is_dir():
        raise NotADirectoryError(
            "--old-dump-dir is not a directory: {}".format(old_dump_dir))
    if old_dump_dir == dump_dir:
        # Linking a dump onto itself would replace the real waveforms.
        raise ValueError("--old-dump-dir and --dump-dir must differ")

    for sub in ("train", "dev", "test"):
        # Symlink the *_wave.npy files of old_dump_dir into the matching
        # location under dump_dir.
        output_dir = dump_dir / sub
        output_dir.mkdir(parents=True, exist_ok=True)
        raw_dir = output_dir / "raw"
        old_raw_dir = old_dump_dir / sub / "raw"

        results = []
        for name in os.listdir(raw_dir):
            # Only generated features (e.g. 003918_feats.npy) are inputs; on a
            # re-run the directory also holds the wave links and
            # metadata.jsonl written by the previous run.
            if not name.endswith(feats_suffix):
                continue
            # Strip the suffix rather than splitting on "_", so utterance ids
            # that themselves contain underscores stay intact.
            utt_id = name[:-len(feats_suffix)]
            wave_name = utt_id + "_wave.npy"

            mel_path = raw_dir / name
            wav_src = old_raw_dir / wave_name
            wav_path = raw_dir / wave_name

            gen_mel = np.load(mel_path)
            wav = np.load(wav_src)

            # Replace a link left by an earlier run so the script can be
            # re-executed; a regular file in the way still raises.
            if wav_path.is_symlink():
                wav_path.unlink()
            os.symlink(wav_src, wav_path)

            results.append({
                "utt_id": utt_id,
                "num_samples": wav.shape[0],
                "num_frames": gen_mel.shape[0],
                "feats": str(mel_path),
                "wave": str(wav_path),
            })

        results.sort(key=itemgetter("utt_id"))

        with jsonlines.open(raw_dir / "metadata.jsonl", 'w') as writer:
            for item in results:
                writer.write(item)
# Script entry point: run the linker only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py
浏览文件 @
3568bb62
...
...
@@ -21,6 +21,8 @@ import numpy as np
import
paddle
import
yaml
from
yacs.config
import
CfgNode
from
tqdm
import
tqdm
import
os
from
paddlespeech.t2s.datasets.preprocess_utils
import
get_phn_dur
from
paddlespeech.t2s.datasets.preprocess_utils
import
merge_silence
...
...
@@ -30,6 +32,8 @@ from paddlespeech.t2s.modules.normalizer import ZScore
def
evaluate
(
args
,
fastspeech2_config
):
rootdir
=
Path
(
args
.
rootdir
).
expanduser
()
assert
rootdir
.
is_dir
()
# construct dataset for evaluation
with
open
(
args
.
phones_dict
,
"r"
)
as
f
:
...
...
@@ -41,9 +45,16 @@ def evaluate(args, fastspeech2_config):
for
phn
,
id
in
phn_id
:
phone_dict
[
phn
]
=
int
(
id
)
if
args
.
speaker_dict
:
with
open
(
args
.
speaker_dict
,
'rt'
)
as
f
:
spk_id_list
=
[
line
.
strip
().
split
()
for
line
in
f
.
readlines
()]
spk_num
=
len
(
spk_id_list
)
else
:
spk_num
=
None
odim
=
fastspeech2_config
.
n_mels
model
=
FastSpeech2
(
idim
=
vocab_size
,
odim
=
odim
,
**
fastspeech2_config
[
"model"
])
idim
=
vocab_size
,
odim
=
odim
,
**
fastspeech2_config
[
"model"
]
,
spk_num
=
spk_num
)
model
.
set_state_dict
(
paddle
.
load
(
args
.
fastspeech2_checkpoint
)[
"main_params"
])
...
...
@@ -65,7 +76,34 @@ def evaluate(args, fastspeech2_config):
sentences
,
speaker_set
=
get_phn_dur
(
args
.
dur_file
)
merge_silence
(
sentences
)
for
i
,
utt_id
in
enumerate
(
sentences
):
if
args
.
dataset
==
"baker"
:
wav_files
=
sorted
(
list
((
rootdir
/
"Wave"
).
rglob
(
"*.wav"
)))
# split data into 3 sections
num_train
=
9800
num_dev
=
100
train_wav_files
=
wav_files
[:
num_train
]
dev_wav_files
=
wav_files
[
num_train
:
num_train
+
num_dev
]
test_wav_files
=
wav_files
[
num_train
+
num_dev
:]
elif
args
.
dataset
==
"aishell3"
:
sub_num_dev
=
5
wav_dir
=
rootdir
/
"train"
/
"wav"
train_wav_files
=
[]
dev_wav_files
=
[]
test_wav_files
=
[]
for
speaker
in
os
.
listdir
(
wav_dir
):
wav_files
=
sorted
(
list
((
wav_dir
/
speaker
).
rglob
(
"*.wav"
)))
if
len
(
wav_files
)
>
100
:
train_wav_files
+=
wav_files
[:
-
sub_num_dev
*
2
]
dev_wav_files
+=
wav_files
[
-
sub_num_dev
*
2
:
-
sub_num_dev
]
test_wav_files
+=
wav_files
[
-
sub_num_dev
:]
else
:
train_wav_files
+=
wav_files
train_wav_files
=
[
os
.
path
.
basename
(
str
(
str_path
))
for
str_path
in
train_wav_files
]
dev_wav_files
=
[
os
.
path
.
basename
(
str
(
str_path
))
for
str_path
in
dev_wav_files
]
test_wav_files
=
[
os
.
path
.
basename
(
str
(
str_path
))
for
str_path
in
test_wav_files
]
for
i
,
utt_id
in
enumerate
(
tqdm
(
sentences
)):
phones
=
sentences
[
utt_id
][
0
]
durations
=
sentences
[
utt_id
][
1
]
speaker
=
sentences
[
utt_id
][
2
]
...
...
@@ -82,21 +120,30 @@ def evaluate(args, fastspeech2_config):
phone_ids
=
[
phone_dict
[
phn
]
for
phn
in
phones
]
phone_ids
=
paddle
.
to_tensor
(
np
.
array
(
phone_ids
))
if
args
.
speaker_dict
:
speaker_id
=
int
([
item
[
1
]
for
item
in
spk_id_list
if
speaker
==
item
[
0
]][
0
])
speaker_id
=
paddle
.
to_tensor
(
speaker_id
)
else
:
speaker_id
=
None
durations
=
paddle
.
to_tensor
(
np
.
array
(
durations
))
# 生成的和真实的可能有 1, 2 帧的差距,但是 batch_fn 会修复
# split data into 3 sections
if
args
.
dataset
==
"baker"
:
num_train
=
9800
num_dev
=
100
if
i
in
range
(
0
,
num_train
)
:
wav_path
=
utt_id
+
".wav"
if
wav_path
in
train_wav_files
:
sub_output_dir
=
output_dir
/
(
"train/raw"
)
elif
i
in
range
(
num_train
,
num_train
+
num_dev
)
:
elif
wav_path
in
dev_wav_files
:
sub_output_dir
=
output_dir
/
(
"dev/raw"
)
el
se
:
el
if
wav_path
in
test_wav_files
:
sub_output_dir
=
output_dir
/
(
"test/raw"
)
sub_output_dir
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
with
paddle
.
no_grad
():
mel
=
fastspeech2_inference
(
phone_ids
,
durations
=
durations
)
mel
=
fastspeech2_inference
(
phone_ids
,
durations
=
durations
,
spk_id
=
speaker_id
)
np
.
save
(
sub_output_dir
/
(
utt_id
+
"_feats.npy"
),
mel
)
...
...
@@ -109,6 +156,8 @@ def main():
default
=
"baker"
,
type
=
str
,
help
=
"name of dataset, should in {baker, ljspeech, vctk} now"
)
parser
.
add_argument
(
"--rootdir"
,
default
=
None
,
type
=
str
,
help
=
"directory to dataset."
)
parser
.
add_argument
(
"--fastspeech2-config"
,
type
=
str
,
help
=
"fastspeech2 config file."
)
parser
.
add_argument
(
...
...
@@ -126,6 +175,12 @@ def main():
type
=
str
,
default
=
"phone_id_map.txt"
,
help
=
"phone vocabulary file."
)
parser
.
add_argument
(
"--speaker-dict"
,
type
=
str
,
default
=
None
,
help
=
"speaker id map file."
)
parser
.
add_argument
(
"--dur-file"
,
default
=
None
,
type
=
str
,
help
=
"path to durations.txt."
)
...
...
examples/csmsc/voc3/local
/link_wav.py
→
utils
/link_wav.py
浏览文件 @
3568bb62
...
...
@@ -18,7 +18,7 @@ from pathlib import Path
import
jsonlines
import
numpy
as
np
from
tqdm
import
tqdm
def
main
():
# parse config and args
...
...
@@ -52,9 +52,9 @@ def main():
output_dir
=
dump_dir
/
sub
output_dir
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
results
=
[]
f
or
name
in
os
.
listdir
(
output_dir
/
"raw"
):
# 003918_feats.npy
utt_id
=
name
.
split
(
"_"
)[
0
]
f
iles
=
os
.
listdir
(
output_dir
/
"raw"
)
for
name
in
tqdm
(
files
):
utt_id
=
name
.
split
(
"_
feats.npy
"
)[
0
]
mel_path
=
output_dir
/
(
"raw/"
+
name
)
gen_mel
=
np
.
load
(
mel_path
)
wave_name
=
utt_id
+
"_wave.npy"
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录