Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
8f507ba4
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
12 个月 前同步成功
通知
204
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
未验证
提交
8f507ba4
编写于
1月 12, 2022
作者:
小湉湉
提交者:
GitHub
1月 12, 2022
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #1302 from jerryuhoo/develop
[TTS] Add support for finetuning speedyspeech
上级
b3c03d73
111a4523
变更
5
隐藏空白更改
内联
并排
Showing
5 changed file
with
287 addition
and
26 deletion
+287
-26
examples/csmsc/voc3/finetune.sh
examples/csmsc/voc3/finetune.sh
+1
-1
examples/csmsc/voc5/finetune.sh
examples/csmsc/voc5/finetune.sh
+1
-1
paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py
paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py
+246
-0
paddlespeech/t2s/models/speedyspeech/speedyspeech.py
paddlespeech/t2s/models/speedyspeech/speedyspeech.py
+26
-21
utils/link_wav.py
utils/link_wav.py
+13
-3
未找到文件。
examples/csmsc/voc3/finetune.sh
浏览文件 @
8f507ba4
...
@@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
...
@@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
fi
fi
if
[
${
stage
}
-le
1
]
&&
[
${
stop_stage
}
-ge
1
]
;
then
if
[
${
stage
}
-le
1
]
&&
[
${
stop_stage
}
-ge
1
]
;
then
python3 link_wav.py
\
python3
${
MAIN_ROOT
}
/utils/
link_wav.py
\
--old-dump-dir
=
dump
\
--old-dump-dir
=
dump
\
--dump-dir
=
dump_finetune
--dump-dir
=
dump_finetune
fi
fi
...
...
examples/csmsc/voc5/finetune.sh
浏览文件 @
8f507ba4
...
@@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
...
@@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
fi
fi
if
[
${
stage
}
-le
1
]
&&
[
${
stop_stage
}
-ge
1
]
;
then
if
[
${
stage
}
-le
1
]
&&
[
${
stop_stage
}
-ge
1
]
;
then
python3 link_wav.py
\
python3
${
MAIN_ROOT
}
/utils/
link_wav.py
\
--old-dump-dir
=
dump
\
--old-dump-dir
=
dump
\
--dump-dir
=
dump_finetune
--dump-dir
=
dump_finetune
fi
fi
...
...
paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py
0 → 100644
浏览文件 @
8f507ba4
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# generate mels using durations.txt
# for mb melgan finetune
# TODO: what should be done when the generated length differs from the original mel?
import
argparse
import
os
from
pathlib
import
Path
import
numpy
as
np
import
paddle
import
yaml
from
tqdm
import
tqdm
from
yacs.config
import
CfgNode
from
paddlespeech.t2s.datasets.preprocess_utils
import
get_phn_dur
from
paddlespeech.t2s.datasets.preprocess_utils
import
merge_silence
from
paddlespeech.t2s.frontend.zh_frontend
import
Frontend
from
paddlespeech.t2s.models.speedyspeech
import
SpeedySpeech
from
paddlespeech.t2s.models.speedyspeech
import
SpeedySpeechInference
from
paddlespeech.t2s.modules.normalizer
import
ZScore
def _read_id_map(path):
    """Return the whitespace-split fields of every line in an id-map file."""
    with open(path, "r") as f:
        return [line.strip().split() for line in f]


def _build_split_lookup(rootdir, dataset):
    """Map each wav file name to the output sub-directory of its data split.

    Args:
        rootdir: ``Path`` of the dataset root directory.
        dataset: Dataset name, either ``"baker"`` or ``"aishell3"``.

    Returns:
        A dict from wav base name (e.g. ``"000001.wav"``) to one of
        ``"train/raw"``, ``"dev/raw"`` or ``"test/raw"``.

    Raises:
        ValueError: If ``dataset`` is not one of the supported names.
    """
    if dataset == "baker":
        wav_files = sorted((rootdir / "Wave").rglob("*.wav"))
        # split data into 3 sections
        num_train = 9800
        num_dev = 100
        train_wav_files = wav_files[:num_train]
        dev_wav_files = wav_files[num_train:num_train + num_dev]
        test_wav_files = wav_files[num_train + num_dev:]
    elif dataset == "aishell3":
        sub_num_dev = 5
        wav_dir = rootdir / "train" / "wav"
        train_wav_files = []
        dev_wav_files = []
        test_wav_files = []
        for speaker in os.listdir(wav_dir):
            wav_files = sorted((wav_dir / speaker).rglob("*.wav"))
            if len(wav_files) > 100:
                train_wav_files += wav_files[:-sub_num_dev * 2]
                dev_wav_files += wav_files[-sub_num_dev * 2:-sub_num_dev]
                test_wav_files += wav_files[-sub_num_dev:]
            else:
                # speakers with few utterances go entirely to train
                train_wav_files += wav_files
    else:
        raise ValueError(
            "unsupported dataset {!r}, should be one of: baker, aishell3".
            format(dataset))

    # Insert the lowest-priority split first so that, should a base name
    # appear in several splits, train wins over dev and dev over test.
    split_of = {}
    for sub_dir, files in (("test/raw", test_wav_files),
                           ("dev/raw", dev_wav_files),
                           ("train/raw", train_wav_files)):
        for wav_file in files:
            split_of[wav_file.name] = sub_dir
    return split_of


def evaluate(args, speedyspeech_config):
    """Generate mels with SpeedySpeech from the durations in ``args.dur_file``.

    For every utterance in the duration file, the model is run with those
    durations passed in explicitly and the resulting mel is saved as
    ``<output_dir>/{train,dev,test}/raw/<utt_id>_feats.npy``; the split is
    decided by where the utterance's wav file falls in the dataset.

    Utterances whose wav file is not found in any split, or for which the
    frontend yields no phones or tones, are skipped with a message instead
    of being written with stale values from a previous utterance.

    Args:
        args: Parsed command-line namespace (see ``main``).
        speedyspeech_config: Config object whose ``"model"`` entry holds the
            keyword arguments for ``SpeedySpeech``.

    Raises:
        NotADirectoryError: If ``args.rootdir`` is not an existing directory.
        ValueError: If ``args.dataset`` is not supported.
    """
    rootdir = Path(args.rootdir).expanduser()
    if not rootdir.is_dir():
        # explicit check: an assert would be stripped under ``python -O``
        raise NotADirectoryError(
            "dataset root {} is not a directory".format(rootdir))

    # construct dataset for evaluation
    vocab_size = len(_read_id_map(args.phones_dict))
    print("vocab_size:", vocab_size)

    tone_size = len(_read_id_map(args.tones_dict))
    print("tone_size:", tone_size)

    frontend = Frontend(
        phone_vocab_path=args.phones_dict, tone_vocab_path=args.tones_dict)

    if args.speaker_dict:
        spk_id_list = _read_id_map(args.speaker_dict)
        spk_num = len(spk_id_list)
        # name -> id string; the first occurrence of a name wins
        spk_id_map = {}
        for item in spk_id_list:
            spk_id_map.setdefault(item[0], item[1])
    else:
        spk_num = None
        spk_id_map = None

    model = SpeedySpeech(
        vocab_size=vocab_size,
        tone_size=tone_size,
        **speedyspeech_config["model"],
        spk_num=spk_num)

    model.set_state_dict(
        paddle.load(args.speedyspeech_checkpoint)["main_params"])
    model.eval()

    stat = np.load(args.speedyspeech_stat)
    mu, std = stat
    mu = paddle.to_tensor(mu)
    std = paddle.to_tensor(std)
    speedyspeech_normalizer = ZScore(mu, std)

    speedyspeech_inference = SpeedySpeechInference(speedyspeech_normalizer,
                                                   model)
    speedyspeech_inference.eval()

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    sentences, _ = get_phn_dur(args.dur_file)
    merge_silence(sentences)

    split_of = _build_split_lookup(rootdir, args.dataset)

    for utt_id in tqdm(sentences):
        wav_name = utt_id + ".wav"
        sub_dir = split_of.get(wav_name)
        if sub_dir is None:
            print("skip " + utt_id + ": " + wav_name +
                  " is not in the train/dev/test split")
            continue

        utt = sentences[utt_id]
        phones, durations, speaker = utt[0], utt[1], utt[2]

        # trim the sil at the beginning and at the end
        if args.cut_sil:
            if phones and phones[0] == "sil" and len(durations) > 1:
                durations = durations[1:]
                phones = phones[1:]
            if phones and phones[-1] == 'sil' and len(durations) > 1:
                durations = durations[:-1]
                phones = phones[:-1]

        phones, tones = frontend._get_phone_tone(phones, get_tone_ids=True)
        if not phones or not tones:
            print("skip " + utt_id + ": no phones or tones to synthesize")
            continue
        tone_ids = paddle.to_tensor(frontend._t2id(tones))
        phone_ids = paddle.to_tensor(frontend._p2id(phones))

        if spk_id_map is not None:
            speaker_id = paddle.to_tensor(int(spk_id_map[speaker]))
        else:
            speaker_id = None

        durations = paddle.to_tensor(np.array(durations))
        durations = paddle.unsqueeze(durations, axis=0)

        # The generated mel may differ from the real one by 1 or 2 frames,
        # but batch_fn will fix that.
        sub_output_dir = output_dir / sub_dir
        sub_output_dir.mkdir(parents=True, exist_ok=True)

        with paddle.no_grad():
            mel = speedyspeech_inference(
                phone_ids, tone_ids, durations=durations, spk_id=speaker_id)
        np.save(sub_output_dir / (utt_id + "_feats.npy"), mel)
def main():
    """Parse the command line, load the SpeedySpeech config and run ``evaluate``.

    The device is selected from ``--ngpu`` (0 selects cpu, a positive value
    selects gpu). The parsed arguments and the loaded config are printed
    before generation starts.
    """
    parser = argparse.ArgumentParser(
        description="Generate mels with speedyspeech using durations.txt, "
        "for vocoder finetuning.")
    parser.add_argument(
        "--dataset",
        default="baker",
        type=str,
        help="name of dataset, should be in {baker, aishell3} now")
    parser.add_argument(
        "--rootdir", default=None, type=str, help="directory to dataset.")
    parser.add_argument(
        "--speedyspeech-config", type=str, help="speedyspeech config file.")
    parser.add_argument(
        "--speedyspeech-checkpoint",
        type=str,
        help="speedyspeech checkpoint to load.")
    parser.add_argument(
        "--speedyspeech-stat",
        type=str,
        help="mean and standard deviation used to normalize spectrogram when training speedyspeech."
    )
    parser.add_argument(
        "--phones-dict",
        type=str,
        default="phone_id_map.txt",
        help="phone vocabulary file.")
    parser.add_argument(
        "--tones-dict",
        type=str,
        default="tone_id_map.txt",
        help="tone vocabulary file.")
    parser.add_argument(
        "--speaker-dict",
        type=str,
        default=None,
        help="speaker id map file.")
    parser.add_argument(
        "--dur-file",
        default=None,
        type=str,
        help="path to durations.txt.")
    parser.add_argument("--output-dir", type=str, help="output dir.")
    parser.add_argument(
        "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")

    def str2bool(value):
        # only the literal "true" (case-insensitive) is treated as True
        return value.lower() == 'true'

    parser.add_argument(
        "--cut-sil",
        type=str2bool,
        default=True,
        help="whether cut sil in the edge of audio")

    args = parser.parse_args()

    if args.ngpu == 0:
        paddle.set_device("cpu")
    elif args.ngpu > 0:
        paddle.set_device("gpu")
    else:
        # a negative value is reported but leaves the device unchanged
        print("ngpu should >= 0 !")

    with open(args.speedyspeech_config) as f:
        speedyspeech_config = CfgNode(yaml.safe_load(f))

    print("========Args========")
    print(yaml.safe_dump(vars(args)))
    print("========Config========")
    print(speedyspeech_config)

    evaluate(args, speedyspeech_config)


if __name__ == "__main__":
    main()
paddlespeech/t2s/models/speedyspeech/speedyspeech.py
浏览文件 @
8f507ba4
...
@@ -222,7 +222,7 @@ class SpeedySpeech(nn.Layer):
...
@@ -222,7 +222,7 @@ class SpeedySpeech(nn.Layer):
decoded
=
self
.
decoder
(
encodings
)
decoded
=
self
.
decoder
(
encodings
)
return
decoded
,
pred_durations
return
decoded
,
pred_durations
def
inference
(
self
,
text
,
tones
=
None
,
spk_id
=
None
):
def
inference
(
self
,
text
,
tones
=
None
,
durations
=
None
,
spk_id
=
None
):
# text: [T]
# text: [T]
# tones: [T]
# tones: [T]
# input of embedding must be int64
# input of embedding must be int64
...
@@ -234,24 +234,28 @@ class SpeedySpeech(nn.Layer):
...
@@ -234,24 +234,28 @@ class SpeedySpeech(nn.Layer):
encodings
=
self
.
encoder
(
text
,
tones
,
spk_id
)
encodings
=
self
.
encoder
(
text
,
tones
,
spk_id
)
pred_durations
=
self
.
duration_predictor
(
encodings
)
# (1, T)
if
type
(
durations
)
==
type
(
None
):
durations_to_expand
=
paddle
.
round
(
pred_durations
.
exp
())
pred_durations
=
self
.
duration_predictor
(
encodings
)
# (1, T)
durations_to_expand
=
(
durations_to_expand
).
astype
(
paddle
.
int64
)
durations_to_expand
=
paddle
.
round
(
pred_durations
.
exp
())
durations_to_expand
=
(
durations_to_expand
).
astype
(
paddle
.
int64
)
slens
=
paddle
.
sum
(
durations_to_expand
,
-
1
)
# [1]
t_dec
=
slens
[
0
]
# [1]
slens
=
paddle
.
sum
(
durations_to_expand
,
-
1
)
# [1]
t_enc
=
paddle
.
shape
(
pred_durations
)[
-
1
]
t_dec
=
slens
[
0
]
# [1]
M
=
paddle
.
zeros
([
1
,
t_dec
,
t_enc
])
t_enc
=
paddle
.
shape
(
pred_durations
)[
-
1
]
M
=
paddle
.
zeros
([
1
,
t_dec
,
t_enc
])
k
=
paddle
.
full
([
1
],
0
,
dtype
=
paddle
.
int64
)
for
j
in
range
(
t_enc
):
k
=
paddle
.
full
([
1
],
0
,
dtype
=
paddle
.
int64
)
d
=
durations_to_expand
[
0
,
j
]
for
j
in
range
(
t_enc
):
# If the d == 0, slice action is meaningless and not supported
d
=
durations_to_expand
[
0
,
j
]
if
d
>=
1
:
# If the d == 0, slice action is meaningless and not supported
M
[
0
,
k
:
k
+
d
,
j
]
=
1
if
d
>=
1
:
k
+=
d
M
[
0
,
k
:
k
+
d
,
j
]
=
1
k
+=
d
encodings
=
paddle
.
matmul
(
M
,
encodings
)
encodings
=
paddle
.
matmul
(
M
,
encodings
)
else
:
durations_to_expand
=
durations
encodings
=
expand
(
encodings
,
durations_to_expand
)
shape
=
paddle
.
shape
(
encodings
)
shape
=
paddle
.
shape
(
encodings
)
t_dec
,
feature_size
=
shape
[
1
],
shape
[
2
]
t_dec
,
feature_size
=
shape
[
1
],
shape
[
2
]
...
@@ -266,7 +270,8 @@ class SpeedySpeechInference(nn.Layer):
...
@@ -266,7 +270,8 @@ class SpeedySpeechInference(nn.Layer):
self
.
normalizer
=
normalizer
self
.
normalizer
=
normalizer
self
.
acoustic_model
=
speedyspeech_model
self
.
acoustic_model
=
speedyspeech_model
def
forward
(
self
,
phones
,
tones
,
spk_id
=
None
):
def
forward
(
self
,
phones
,
tones
,
durations
=
None
,
spk_id
=
None
):
normalized_mel
=
self
.
acoustic_model
.
inference
(
phones
,
tones
,
spk_id
)
normalized_mel
=
self
.
acoustic_model
.
inference
(
phones
,
tones
,
durations
=
durations
,
spk_id
=
spk_id
)
logmel
=
self
.
normalizer
.
inverse
(
normalized_mel
)
logmel
=
self
.
normalizer
.
inverse
(
normalized_mel
)
return
logmel
return
logmel
utils/link_wav.py
浏览文件 @
8f507ba4
...
@@ -20,6 +20,7 @@ import jsonlines
...
@@ -20,6 +20,7 @@ import jsonlines
import
numpy
as
np
import
numpy
as
np
from
tqdm
import
tqdm
from
tqdm
import
tqdm
def
main
():
def
main
():
# parse config and args
# parse config and args
parser
=
argparse
.
ArgumentParser
(
parser
=
argparse
.
ArgumentParser
(
...
@@ -58,9 +59,18 @@ def main():
...
@@ -58,9 +59,18 @@ def main():
mel_path
=
output_dir
/
(
"raw/"
+
name
)
mel_path
=
output_dir
/
(
"raw/"
+
name
)
gen_mel
=
np
.
load
(
mel_path
)
gen_mel
=
np
.
load
(
mel_path
)
wave_name
=
utt_id
+
"_wave.npy"
wave_name
=
utt_id
+
"_wave.npy"
wav
=
np
.
load
(
old_dump_dir
/
sub
/
(
"raw/"
+
wave_name
))
try
:
os
.
symlink
(
old_dump_dir
/
sub
/
(
"raw/"
+
wave_name
),
wav
=
np
.
load
(
old_dump_dir
/
sub
/
(
"raw/"
+
wave_name
))
output_dir
/
(
"raw/"
+
wave_name
))
os
.
symlink
(
old_dump_dir
/
sub
/
(
"raw/"
+
wave_name
),
output_dir
/
(
"raw/"
+
wave_name
))
except
FileNotFoundError
:
print
(
"delete "
+
name
+
" because it cannot be found in the dump folder"
)
os
.
remove
(
output_dir
/
"raw"
/
name
)
continue
except
FileExistsError
:
print
(
"file "
+
name
+
" exists, skip."
)
continue
num_sample
=
wav
.
shape
[
0
]
num_sample
=
wav
.
shape
[
0
]
num_frames
=
gen_mel
.
shape
[
0
]
num_frames
=
gen_mel
.
shape
[
0
]
wav_path
=
output_dir
/
(
"raw/"
+
wave_name
)
wav_path
=
output_dir
/
(
"raw/"
+
wave_name
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录