PaddlePaddle / DeepSpeech · Commit fe8bf2a3
Commit fe8bf2a3, authored Mar 09, 2022 by 小湉湉

format synthesize, test=tts

Parent: 10ab7aab
Showing 6 changed files with 438 additions and 468 deletions (+438 −468)
paddlespeech/t2s/exps/inference.py  (+140 −92)
paddlespeech/t2s/exps/syn_utils.py  (+243 −0)
paddlespeech/t2s/exps/synthesize.py  (+12 −130)
paddlespeech/t2s/exps/synthesize_e2e.py  (+21 −181)
paddlespeech/t2s/exps/voice_cloning.py  (+12 −65)
paddlespeech/t2s/modules/predictor/length_regulator.py  (+10 −0)
paddlespeech/t2s/exps/inference.py
...
@@ -17,13 +17,92 @@ from pathlib import Path
 import numpy
 import soundfile as sf
 from paddle import inference
+from timer import timer

-from paddlespeech.t2s.frontend import English
-from paddlespeech.t2s.frontend.zh_frontend import Frontend
+from paddlespeech.t2s.exps.syn_utils import get_frontend
+from paddlespeech.t2s.exps.syn_utils import get_sentences
 from paddlespeech.t2s.utils import str2bool


+def get_predictor(args, filed='am'):
+    full_name = ''
+    if filed == 'am':
+        full_name = args.am
+    elif filed == 'voc':
+        full_name = args.voc
+    model_name = full_name[:full_name.rindex('_')]
+    config = inference.Config(
+        str(Path(args.inference_dir) / (full_name + ".pdmodel")),
+        str(Path(args.inference_dir) / (full_name + ".pdiparams")))
+    if args.device == "gpu":
+        config.enable_use_gpu(100, 0)
+    elif args.device == "cpu":
+        config.disable_gpu()
+    # This line must be commented for fastspeech2, if not, it will OOM
+    if model_name != 'fastspeech2':
+        config.enable_memory_optim()
+    predictor = inference.create_predictor(config)
+    return predictor
+
+
+def get_am_output(args, am_predictor, frontend, merge_sentences, input):
+    am_name = args.am[:args.am.rindex('_')]
+    am_dataset = args.am[args.am.rindex('_') + 1:]
+    am_input_names = am_predictor.get_input_names()
+    get_tone_ids = False
+    get_spk_id = False
+    if am_name == 'speedyspeech':
+        get_tone_ids = True
+    if am_dataset in {"aishell3", "vctk"} and args.speaker_dict:
+        get_spk_id = True
+        spk_id = numpy.array([args.spk_id])
+    if args.lang == 'zh':
+        input_ids = frontend.get_input_ids(
+            input, merge_sentences=merge_sentences, get_tone_ids=get_tone_ids)
+        phone_ids = input_ids["phone_ids"]
+    elif args.lang == 'en':
+        input_ids = frontend.get_input_ids(
+            input, merge_sentences=merge_sentences)
+        phone_ids = input_ids["phone_ids"]
+    else:
+        print("lang should in {'zh', 'en'}!")
+    if get_tone_ids:
+        tone_ids = input_ids["tone_ids"]
+        tones = tone_ids[0].numpy()
+        tones_handle = am_predictor.get_input_handle(am_input_names[1])
+        tones_handle.reshape(tones.shape)
+        tones_handle.copy_from_cpu(tones)
+    if get_spk_id:
+        spk_id_handle = am_predictor.get_input_handle(am_input_names[1])
+        spk_id_handle.reshape(spk_id.shape)
+        spk_id_handle.copy_from_cpu(spk_id)
+    phones = phone_ids[0].numpy()
+    phones_handle = am_predictor.get_input_handle(am_input_names[0])
+    phones_handle.reshape(phones.shape)
+    phones_handle.copy_from_cpu(phones)
+    am_predictor.run()
+    am_output_names = am_predictor.get_output_names()
+    am_output_handle = am_predictor.get_output_handle(am_output_names[0])
+    am_output_data = am_output_handle.copy_to_cpu()
+    return am_output_data
+
+
+def get_voc_output(args, voc_predictor, input):
+    voc_input_names = voc_predictor.get_input_names()
+    mel_handle = voc_predictor.get_input_handle(voc_input_names[0])
+    mel_handle.reshape(input.shape)
+    mel_handle.copy_from_cpu(input)
+    voc_predictor.run()
+    voc_output_names = voc_predictor.get_output_names()
+    voc_output_handle = voc_predictor.get_output_handle(voc_output_names[0])
+    wav = voc_output_handle.copy_to_cpu()
+    return wav
+
+
-# only inference for models trained with csmsc now
-def main():
+def parse_args():
     parser = argparse.ArgumentParser(
         description="Paddle Infernce with speedyspeech & parallel wavegan.")
     # acoustic model
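For readers unfamiliar with the Paddle Inference API used in get_predictor above: enable_use_gpu takes an initial GPU memory-pool size in MB and a device id, so enable_use_gpu(100, 0) requests a 100 MB pool on GPU 0, while disable_gpu() forces CPU execution. A minimal standalone sketch with hypothetical model paths:

    from paddle import inference

    # Hypothetical paths, for illustration only.
    config = inference.Config("exp/inference/am.pdmodel",
                              "exp/inference/am.pdiparams")
    config.enable_use_gpu(100, 0)  # 100 MB initial GPU memory pool on device 0
    # config.disable_gpu()         # alternatively, force CPU execution
    config.enable_memory_optim()   # skipped for fastspeech2 above to avoid OOM
    predictor = inference.create_predictor(config)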
...
@@ -70,113 +149,82 @@ def main():
     parser.add_argument(
         "--inference_dir", type=str, help="dir to save inference models")
     parser.add_argument("--output_dir", type=str, help="output dir")
+    # inference
+    parser.add_argument(
+        "--use_trt",
+        type=str2bool,
+        default=False,
+        help="Whether to use inference engin TensorRT.", )
+    parser.add_argument(
+        "--int8",
+        type=str2bool,
+        default=False,
+        help="Whether to use int8 inference.", )
+    parser.add_argument(
+        "--fp16",
+        type=str2bool,
+        default=False,
+        help="Whether to use float16 inference.", )
+    parser.add_argument(
+        "--device",
+        default="gpu",
+        choices=["gpu", "cpu"],
+        help="Device selected for inference.", )
     args, _ = parser.parse_known_args()
+    return args
+
+
+# only inference for models trained with csmsc now
+def main():
+    args = parse_args()

     # frontend
-    if args.lang == 'zh':
-        frontend = Frontend(
-            phone_vocab_path=args.phones_dict,
-            tone_vocab_path=args.tones_dict)
-    elif args.lang == 'en':
-        frontend = English(phone_vocab_path=args.phones_dict)
-    print("frontend done!")
+    frontend = get_frontend(args)

+    # am_predictor
+    am_predictor = get_predictor(args, filed='am')
     # model: {model_name}_{dataset}
     am_name = args.am[:args.am.rindex('_')]
     am_dataset = args.am[args.am.rindex('_') + 1:]
-    am_config = inference.Config(
-        str(Path(args.inference_dir) / (args.am + ".pdmodel")),
-        str(Path(args.inference_dir) / (args.am + ".pdiparams")))
-    am_config.enable_use_gpu(100, 0)
-    # This line must be commented for fastspeech2, if not, it will OOM
-    if am_name != 'fastspeech2':
-        am_config.enable_memory_optim()
-    am_predictor = inference.create_predictor(am_config)
-
-    voc_config = inference.Config(
-        str(Path(args.inference_dir) / (args.voc + ".pdmodel")),
-        str(Path(args.inference_dir) / (args.voc + ".pdiparams")))
-    voc_config.enable_use_gpu(100, 0)
-    voc_config.enable_memory_optim()
-    voc_predictor = inference.create_predictor(voc_config)
+    # voc_predictor
+    voc_predictor = get_predictor(args, filed='voc')

     output_dir = Path(args.output_dir)
     output_dir.mkdir(parents=True, exist_ok=True)

-    print("in new inference")
-    # construct dataset for evaluation
-    sentences = []
-    with open(args.text, 'rt') as f:
-        for line in f:
-            items = line.strip().split()
-            utt_id = items[0]
-            if args.lang == 'zh':
-                sentence = "".join(items[1:])
-            elif args.lang == 'en':
-                sentence = " ".join(items[1:])
-            sentences.append((utt_id, sentence))
-
-    get_tone_ids = False
-    get_spk_id = False
-    if am_name == 'speedyspeech':
-        get_tone_ids = True
-    if am_dataset in {"aishell3", "vctk"} and args.speaker_dict:
-        get_spk_id = True
-        spk_id = numpy.array([args.spk_id])
-
-    am_input_names = am_predictor.get_input_names()
-    print("am_input_names:", am_input_names)
+    sentences = get_sentences(args)

     merge_sentences = True
+    N = 0
+    T = 0
+    fs = 24000 if am_dataset != 'ljspeech' else 22050
+    i = 0
     for utt_id, sentence in sentences:
-        if args.lang == 'zh':
-            input_ids = frontend.get_input_ids(
-                sentence,
-                merge_sentences=merge_sentences,
-                get_tone_ids=get_tone_ids)
-            phone_ids = input_ids["phone_ids"]
-        elif args.lang == 'en':
-            input_ids = frontend.get_input_ids(
-                sentence, merge_sentences=merge_sentences)
-            phone_ids = input_ids["phone_ids"]
-        else:
-            print("lang should in {'zh', 'en'}!")
-
-        if get_tone_ids:
-            tone_ids = input_ids["tone_ids"]
-            tones = tone_ids[0].numpy()
-            tones_handle = am_predictor.get_input_handle(am_input_names[1])
-            tones_handle.reshape(tones.shape)
-            tones_handle.copy_from_cpu(tones)
-        if get_spk_id:
-            spk_id_handle = am_predictor.get_input_handle(am_input_names[1])
-            spk_id_handle.reshape(spk_id.shape)
-            spk_id_handle.copy_from_cpu(spk_id)
-        phones = phone_ids[0].numpy()
-        phones_handle = am_predictor.get_input_handle(am_input_names[0])
-        phones_handle.reshape(phones.shape)
-        phones_handle.copy_from_cpu(phones)
-
-        am_predictor.run()
-        am_output_names = am_predictor.get_output_names()
-        am_output_handle = am_predictor.get_output_handle(am_output_names[0])
-        am_output_data = am_output_handle.copy_to_cpu()
-
-        voc_input_names = voc_predictor.get_input_names()
-        mel_handle = voc_predictor.get_input_handle(voc_input_names[0])
-        mel_handle.reshape(am_output_data.shape)
-        mel_handle.copy_from_cpu(am_output_data)
-
-        voc_predictor.run()
-        voc_output_names = voc_predictor.get_output_names()
-        voc_output_handle = voc_predictor.get_output_handle(voc_output_names[0])
-        wav = voc_output_handle.copy_to_cpu()
+        # warmup
+        i += 1
+        with timer() as t:
+            am_output_data = get_am_output(
+                args,
+                am_predictor=am_predictor,
+                frontend=frontend,
+                merge_sentences=merge_sentences,
+                input=sentence)
+            wav = get_voc_output(
+                args, voc_predictor=voc_predictor, input=am_output_data)
+        if i >= 3:
+            N += wav.size
+            T += t.elapse
+            speed = wav.size / t.elapse
+            rtf = fs / speed

         sf.write(output_dir / (utt_id + ".wav"), wav, samplerate=24000)
+        print(
+            f"{utt_id}, mel: {am_output_data.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}.")
         print(f"{utt_id} done!")
+    print(f"generation speed: {N / T}Hz, RTF: {fs / (N / T) }")


 if __name__ == "__main__":
...
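For reference, the per-utterance log line above reports generation speed in Hz (samples produced per second of wall clock) and RTF as the output sample rate divided by that speed. A quick worked example with made-up numbers:

    # Made-up numbers, for illustration only.
    fs = 24000                 # output sample rate (22050 for ljspeech)
    wav_size = 96000           # samples generated for one utterance (4 s of audio)
    elapse = 0.5               # wall-clock synthesis time from timer(), in seconds

    speed = wav_size / elapse  # 192000 samples generated per second
    rtf = fs / speed           # 0.125 -> synthesis runs 8x faster than real time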
paddlespeech/t2s/exps/syn_utils.py
new file (mode 0 → 100644)
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os

import numpy as np
import paddle
from paddle import jit
from paddle.static import InputSpec

from paddlespeech.s2t.utils.dynamic_import import dynamic_import
from paddlespeech.t2s.datasets.data_table import DataTable
from paddlespeech.t2s.frontend import English
from paddlespeech.t2s.frontend.zh_frontend import Frontend
from paddlespeech.t2s.modules.normalizer import ZScore

model_alias = {
    # acoustic model
    "speedyspeech": "paddlespeech.t2s.models.speedyspeech:SpeedySpeech",
    "speedyspeech_inference":
    "paddlespeech.t2s.models.speedyspeech:SpeedySpeechInference",
    "fastspeech2": "paddlespeech.t2s.models.fastspeech2:FastSpeech2",
    "fastspeech2_inference":
    "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference",
    "tacotron2": "paddlespeech.t2s.models.tacotron2:Tacotron2",
    "tacotron2_inference":
    "paddlespeech.t2s.models.tacotron2:Tacotron2Inference",
    # voc
    "pwgan": "paddlespeech.t2s.models.parallel_wavegan:PWGGenerator",
    "pwgan_inference": "paddlespeech.t2s.models.parallel_wavegan:PWGInference",
    "mb_melgan": "paddlespeech.t2s.models.melgan:MelGANGenerator",
    "mb_melgan_inference": "paddlespeech.t2s.models.melgan:MelGANInference",
    "style_melgan": "paddlespeech.t2s.models.melgan:StyleMelGANGenerator",
    "style_melgan_inference":
    "paddlespeech.t2s.models.melgan:StyleMelGANInference",
    "hifigan": "paddlespeech.t2s.models.hifigan:HiFiGANGenerator",
    "hifigan_inference": "paddlespeech.t2s.models.hifigan:HiFiGANInference",
    "wavernn": "paddlespeech.t2s.models.wavernn:WaveRNN",
    "wavernn_inference": "paddlespeech.t2s.models.wavernn:WaveRNNInference",
}


# input
def get_sentences(args):
    # construct dataset for evaluation
    sentences = []
    with open(args.text, 'rt') as f:
        for line in f:
            items = line.strip().split()
            utt_id = items[0]
            if 'lang' in args and args.lang == 'zh':
                sentence = "".join(items[1:])
            elif 'lang' in args and args.lang == 'en':
                sentence = " ".join(items[1:])
            sentences.append((utt_id, sentence))
    return sentences


def get_test_dataset(args, test_metadata, am_name, am_dataset):
    if am_name == 'fastspeech2':
        fields = ["utt_id", "text"]
        if am_dataset in {"aishell3", "vctk"} and args.speaker_dict:
            print("multiple speaker fastspeech2!")
            fields += ["spk_id"]
        elif 'voice_cloning' in args and args.voice_cloning:
            print("voice cloning!")
            fields += ["spk_emb"]
        else:
            print("single speaker fastspeech2!")
    elif am_name == 'speedyspeech':
        fields = ["utt_id", "phones", "tones"]
    elif am_name == 'tacotron2':
        fields = ["utt_id", "text"]
        if 'voice_cloning' in args and args.voice_cloning:
            print("voice cloning!")
            fields += ["spk_emb"]

    test_dataset = DataTable(data=test_metadata, fields=fields)
    return test_dataset


# frontend
def get_frontend(args):
    if 'lang' in args and args.lang == 'zh':
        frontend = Frontend(
            phone_vocab_path=args.phones_dict, tone_vocab_path=args.tones_dict)
    elif 'lang' in args and args.lang == 'en':
        frontend = English(phone_vocab_path=args.phones_dict)
    else:
        print("wrong lang!")
    print("frontend done!")
    return frontend


# dygraph
def get_am_inference(args, am_config):
    with open(args.phones_dict, "r") as f:
        phn_id = [line.strip().split() for line in f.readlines()]
    vocab_size = len(phn_id)
    print("vocab_size:", vocab_size)

    tone_size = None
    if 'tones_dict' in args and args.tones_dict:
        with open(args.tones_dict, "r") as f:
            tone_id = [line.strip().split() for line in f.readlines()]
        tone_size = len(tone_id)
        print("tone_size:", tone_size)

    spk_num = None
    if 'speaker_dict' in args and args.speaker_dict:
        with open(args.speaker_dict, 'rt') as f:
            spk_id = [line.strip().split() for line in f.readlines()]
        spk_num = len(spk_id)
        print("spk_num:", spk_num)

    odim = am_config.n_mels
    # model: {model_name}_{dataset}
    am_name = args.am[:args.am.rindex('_')]
    am_dataset = args.am[args.am.rindex('_') + 1:]

    am_class = dynamic_import(am_name, model_alias)
    am_inference_class = dynamic_import(am_name + '_inference', model_alias)

    if am_name == 'fastspeech2':
        am = am_class(
            idim=vocab_size, odim=odim, spk_num=spk_num, **am_config["model"])
    elif am_name == 'speedyspeech':
        am = am_class(
            vocab_size=vocab_size,
            tone_size=tone_size,
            spk_num=spk_num,
            **am_config["model"])
    elif am_name == 'tacotron2':
        am = am_class(idim=vocab_size, odim=odim, **am_config["model"])

    am.set_state_dict(paddle.load(args.am_ckpt)["main_params"])
    am.eval()
    am_mu, am_std = np.load(args.am_stat)
    am_mu = paddle.to_tensor(am_mu)
    am_std = paddle.to_tensor(am_std)
    am_normalizer = ZScore(am_mu, am_std)
    am_inference = am_inference_class(am_normalizer, am)
    am_inference.eval()
    print("acoustic model done!")
    return am_inference, am_name, am_dataset


def get_voc_inference(args, voc_config):
    # model: {model_name}_{dataset}
    voc_name = args.voc[:args.voc.rindex('_')]
    voc_class = dynamic_import(voc_name, model_alias)
    voc_inference_class = dynamic_import(voc_name + '_inference', model_alias)
    if voc_name != 'wavernn':
        voc = voc_class(**voc_config["generator_params"])
        voc.set_state_dict(paddle.load(args.voc_ckpt)["generator_params"])
        voc.remove_weight_norm()
        voc.eval()
    else:
        voc = voc_class(**voc_config["model"])
        voc.set_state_dict(paddle.load(args.voc_ckpt)["main_params"])
        voc.eval()
    voc_mu, voc_std = np.load(args.voc_stat)
    voc_mu = paddle.to_tensor(voc_mu)
    voc_std = paddle.to_tensor(voc_std)
    voc_normalizer = ZScore(voc_mu, voc_std)
    voc_inference = voc_inference_class(voc_normalizer, voc)
    voc_inference.eval()
    print("voc done!")
    return voc_inference


# to static
def am_to_static(args, am_inference, am_name, am_dataset):
    if am_name == 'fastspeech2':
        if am_dataset in {"aishell3", "vctk"} and args.speaker_dict:
            am_inference = jit.to_static(
                am_inference,
                input_spec=[
                    InputSpec([-1], dtype=paddle.int64),
                    InputSpec([1], dtype=paddle.int64),
                ], )
        else:
            am_inference = jit.to_static(
                am_inference, input_spec=[InputSpec([-1], dtype=paddle.int64)])
    elif am_name == 'speedyspeech':
        if am_dataset in {"aishell3", "vctk"} and args.speaker_dict:
            am_inference = jit.to_static(
                am_inference,
                input_spec=[
                    InputSpec([-1], dtype=paddle.int64),  # text
                    InputSpec([-1], dtype=paddle.int64),  # tone
                    InputSpec([1], dtype=paddle.int64),  # spk_id
                    None  # duration
                ])
        else:
            am_inference = jit.to_static(
                am_inference,
                input_spec=[
                    InputSpec([-1], dtype=paddle.int64),
                    InputSpec([-1], dtype=paddle.int64)
                ])
    elif am_name == 'tacotron2':
        am_inference = jit.to_static(
            am_inference, input_spec=[InputSpec([-1], dtype=paddle.int64)])

    paddle.jit.save(am_inference, os.path.join(args.inference_dir, args.am))
    am_inference = paddle.jit.load(os.path.join(args.inference_dir, args.am))
    return am_inference


def voc_to_static(args, voc_inference):
    voc_inference = jit.to_static(
        voc_inference,
        input_spec=[
            InputSpec([-1, 80], dtype=paddle.float32),
        ])
    paddle.jit.save(voc_inference, os.path.join(args.inference_dir, args.voc))
    voc_inference = paddle.jit.load(os.path.join(args.inference_dir, args.voc))
    return voc_inference
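For context, the helpers above are driven by an argparse.Namespace, which supports membership tests (so `'lang' in args` checks whether the attribute was defined), and get_sentences expects a text file with one "utt_id sentence" pair per line. A minimal usage sketch; file contents and paths are hypothetical:

    import argparse

    from paddlespeech.t2s.exps.syn_utils import get_frontend
    from paddlespeech.t2s.exps.syn_utils import get_sentences

    # Hypothetical sentences.txt, one "utt_id sentence" per line:
    #   001 你好世界
    #   002 欢迎使用语音合成
    args = argparse.Namespace(
        text="sentences.txt",            # hypothetical path
        lang="zh",
        phones_dict="phone_id_map.txt",  # hypothetical path
        tones_dict="tone_id_map.txt")    # hypothetical path

    sentences = get_sentences(args)      # -> [("001", "你好世界"), ...]
    frontend = get_frontend(args)
    input_ids = frontend.get_input_ids(
        "你好世界", merge_sentences=True, get_tone_ids=False)
    phone_ids = input_ids["phone_ids"]   # list of phone-id tensors, as used above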
paddlespeech/t2s/exps/synthesize.py
...
@@ -23,48 +23,11 @@ import yaml
 from timer import timer
 from yacs.config import CfgNode

-from paddlespeech.s2t.utils.dynamic_import import dynamic_import
-from paddlespeech.t2s.datasets.data_table import DataTable
-from paddlespeech.t2s.modules.normalizer import ZScore
+from paddlespeech.t2s.exps.syn_utils import get_am_inference
+from paddlespeech.t2s.exps.syn_utils import get_test_dataset
+from paddlespeech.t2s.exps.syn_utils import get_voc_inference
 from paddlespeech.t2s.utils import str2bool

-model_alias = {
-    # acoustic model
-    "speedyspeech": "paddlespeech.t2s.models.speedyspeech:SpeedySpeech",
-    "speedyspeech_inference":
-    "paddlespeech.t2s.models.speedyspeech:SpeedySpeechInference",
-    "fastspeech2": "paddlespeech.t2s.models.fastspeech2:FastSpeech2",
-    "fastspeech2_inference":
-    "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference",
-    "tacotron2": "paddlespeech.t2s.models.tacotron2:Tacotron2",
-    "tacotron2_inference":
-    "paddlespeech.t2s.models.tacotron2:Tacotron2Inference",
-    # voc
-    "pwgan": "paddlespeech.t2s.models.parallel_wavegan:PWGGenerator",
-    "pwgan_inference": "paddlespeech.t2s.models.parallel_wavegan:PWGInference",
-    "mb_melgan": "paddlespeech.t2s.models.melgan:MelGANGenerator",
-    "mb_melgan_inference": "paddlespeech.t2s.models.melgan:MelGANInference",
-    "style_melgan": "paddlespeech.t2s.models.melgan:StyleMelGANGenerator",
-    "style_melgan_inference":
-    "paddlespeech.t2s.models.melgan:StyleMelGANInference",
-    "hifigan": "paddlespeech.t2s.models.hifigan:HiFiGANGenerator",
-    "hifigan_inference": "paddlespeech.t2s.models.hifigan:HiFiGANInference",
-    "wavernn": "paddlespeech.t2s.models.wavernn:WaveRNN",
-    "wavernn_inference": "paddlespeech.t2s.models.wavernn:WaveRNNInference",
-}
-

 def evaluate(args):
     # dataloader has been too verbose
...
@@ -86,96 +49,12 @@ def evaluate(args):
     print(am_config)
     print(voc_config)

-    # construct dataset for evaluation
-    # model: {model_name}_{dataset}
-    am_name = args.am[:args.am.rindex('_')]
-    am_dataset = args.am[args.am.rindex('_') + 1:]
-    if am_name == 'fastspeech2':
-        fields = ["utt_id", "text"]
-        spk_num = None
-        if am_dataset in {"aishell3", "vctk"} and args.speaker_dict:
-            print("multiple speaker fastspeech2!")
-            with open(args.speaker_dict, 'rt') as f:
-                spk_id = [line.strip().split() for line in f.readlines()]
-            spk_num = len(spk_id)
-            fields += ["spk_id"]
-        elif args.voice_cloning:
-            print("voice cloning!")
-            fields += ["spk_emb"]
-        else:
-            print("single speaker fastspeech2!")
-        print("spk_num:", spk_num)
-    elif am_name == 'speedyspeech':
-        fields = ["utt_id", "phones", "tones"]
-    elif am_name == 'tacotron2':
-        fields = ["utt_id", "text"]
-        if args.voice_cloning:
-            print("voice cloning!")
-            fields += ["spk_emb"]
-
-    test_dataset = DataTable(data=test_metadata, fields=fields)
-
-    with open(args.phones_dict, "r") as f:
-        phn_id = [line.strip().split() for line in f.readlines()]
-    vocab_size = len(phn_id)
-    print("vocab_size:", vocab_size)
-
-    tone_size = None
-    if args.tones_dict:
-        with open(args.tones_dict, "r") as f:
-            tone_id = [line.strip().split() for line in f.readlines()]
-        tone_size = len(tone_id)
-        print("tone_size:", tone_size)
-
     # acoustic model
-    odim = am_config.n_mels
-    am_class = dynamic_import(am_name, model_alias)
-    am_inference_class = dynamic_import(am_name + '_inference', model_alias)
-
-    if am_name == 'fastspeech2':
-        am = am_class(
-            idim=vocab_size, odim=odim, spk_num=spk_num, **am_config["model"])
-    elif am_name == 'speedyspeech':
-        am = am_class(
-            vocab_size=vocab_size, tone_size=tone_size, **am_config["model"])
-    elif am_name == 'tacotron2':
-        am = am_class(idim=vocab_size, odim=odim, **am_config["model"])
-
-    am.set_state_dict(paddle.load(args.am_ckpt)["main_params"])
-    am.eval()
-    am_mu, am_std = np.load(args.am_stat)
-    am_mu = paddle.to_tensor(am_mu)
-    am_std = paddle.to_tensor(am_std)
-    am_normalizer = ZScore(am_mu, am_std)
-    am_inference = am_inference_class(am_normalizer, am)
-    print("am_inference.training0:", am_inference.training)
-    am_inference.eval()
-    print("acoustic model done!")
+    am_inference, am_name, am_dataset = get_am_inference(args, am_config)
+    test_dataset = get_test_dataset(args, test_metadata, am_name, am_dataset)

     # vocoder
-    # model: {model_name}_{dataset}
-    voc_name = args.voc[:args.voc.rindex('_')]
-    voc_class = dynamic_import(voc_name, model_alias)
-    voc_inference_class = dynamic_import(voc_name + '_inference', model_alias)
-    if voc_name != 'wavernn':
-        voc = voc_class(**voc_config["generator_params"])
-        voc.set_state_dict(paddle.load(args.voc_ckpt)["generator_params"])
-        voc.remove_weight_norm()
-        voc.eval()
-    else:
-        voc = voc_class(**voc_config["model"])
-        voc.set_state_dict(paddle.load(args.voc_ckpt)["main_params"])
-        voc.eval()
-    voc_mu, voc_std = np.load(args.voc_stat)
-    voc_mu = paddle.to_tensor(voc_mu)
-    voc_std = paddle.to_tensor(voc_std)
-    voc_normalizer = ZScore(voc_mu, voc_std)
-    voc_inference = voc_inference_class(voc_normalizer, voc)
-    print("voc_inference.training0:", voc_inference.training)
-    voc_inference.eval()
-    print("voc done!")
+    voc_inference = get_voc_inference(args, voc_config)

     output_dir = Path(args.output_dir)
     output_dir.mkdir(parents=True, exist_ok=True)
...
@@ -227,7 +106,7 @@ def evaluate(args):
     print(f"generation speed: {N / T}Hz, RTF: {am_config.fs / (N / T) }")


-def main():
+def parse_args():
     # parse args and config and redirect to train_sp
     parser = argparse.ArgumentParser(
         description="Synthesize with acoustic model & vocoder")
...
@@ -264,7 +143,6 @@ def main():
         "--tones_dict", type=str, default=None, help="tone vocabulary file.")
     parser.add_argument(
         "--speaker_dict", type=str, default=None, help="speaker id map file.")
-
     parser.add_argument(
         "--voice-cloning",
         type=str2bool,
...
@@ -281,7 +159,6 @@ def main():
             'style_melgan_csmsc'
         ],
         help='Choose vocoder type of tts task.')
-
    parser.add_argument(
        '--voc_config',
        type=str,
...
@@ -302,7 +179,12 @@ def main():
     parser.add_argument("--output_dir", type=str, help="output dir.")

     args = parser.parse_args()
+    return args
+
+
+def main():
+    args = parse_args()

     if args.ngpu == 0:
         paddle.set_device("cpu")
     elif args.ngpu > 0:
...
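Both removed blocks above relied on dynamic_import to turn a "module:Class" alias from model_alias into a class object; that machinery now sits behind the syn_utils helpers. A minimal sketch of what such a resolver does — the real implementation is paddlespeech.s2t.utils.dynamic_import.dynamic_import and may differ in details such as error handling:

    import importlib

    def resolve_alias(name, alias_map):
        # Illustrative stand-in for dynamic_import, not the real implementation.
        # e.g. "fastspeech2" -> "paddlespeech.t2s.models.fastspeech2:FastSpeech2"
        module_path, class_name = alias_map[name].split(":")
        module = importlib.import_module(module_path)
        return getattr(module, class_name)

    # am_class = resolve_alias("fastspeech2", model_alias)
    # am_inference_class = resolve_alias("fastspeech2" + "_inference", model_alias)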
paddlespeech/t2s/exps/synthesize_e2e.py
...
@@ -12,59 +12,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import argparse
-import os
 from pathlib import Path

-import numpy as np
 import paddle
 import soundfile as sf
 import yaml
-from paddle import jit
-from paddle.static import InputSpec
 from timer import timer
 from yacs.config import CfgNode

-from paddlespeech.s2t.utils.dynamic_import import dynamic_import
-from paddlespeech.t2s.frontend import English
-from paddlespeech.t2s.frontend.zh_frontend import Frontend
-from paddlespeech.t2s.modules.normalizer import ZScore
-
-model_alias = {
-    # acoustic model
-    "speedyspeech": "paddlespeech.t2s.models.speedyspeech:SpeedySpeech",
-    "speedyspeech_inference":
-    "paddlespeech.t2s.models.speedyspeech:SpeedySpeechInference",
-    "fastspeech2": "paddlespeech.t2s.models.fastspeech2:FastSpeech2",
-    "fastspeech2_inference":
-    "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference",
-    "tacotron2": "paddlespeech.t2s.models.tacotron2:Tacotron2",
-    "tacotron2_inference":
-    "paddlespeech.t2s.models.tacotron2:Tacotron2Inference",
-    # voc
-    "pwgan": "paddlespeech.t2s.models.parallel_wavegan:PWGGenerator",
-    "pwgan_inference": "paddlespeech.t2s.models.parallel_wavegan:PWGInference",
-    "mb_melgan": "paddlespeech.t2s.models.melgan:MelGANGenerator",
-    "mb_melgan_inference": "paddlespeech.t2s.models.melgan:MelGANInference",
-    "style_melgan": "paddlespeech.t2s.models.melgan:StyleMelGANGenerator",
-    "style_melgan_inference":
-    "paddlespeech.t2s.models.melgan:StyleMelGANInference",
-    "hifigan": "paddlespeech.t2s.models.hifigan:HiFiGANGenerator",
-    "hifigan_inference": "paddlespeech.t2s.models.hifigan:HiFiGANInference",
-    "wavernn": "paddlespeech.t2s.models.wavernn:WaveRNN",
-    "wavernn_inference": "paddlespeech.t2s.models.wavernn:WaveRNNInference",
-}
+from paddlespeech.t2s.exps.syn_utils import am_to_static
+from paddlespeech.t2s.exps.syn_utils import get_am_inference
+from paddlespeech.t2s.exps.syn_utils import get_frontend
+from paddlespeech.t2s.exps.syn_utils import get_sentences
+from paddlespeech.t2s.exps.syn_utils import get_voc_inference
+from paddlespeech.t2s.exps.syn_utils import voc_to_static


 def evaluate(args):
...
@@ -81,155 +42,28 @@ def evaluate(args):
     print(am_config)
     print(voc_config)

-    # construct dataset for evaluation
-    sentences = []
-    with open(args.text, 'rt') as f:
-        for line in f:
-            items = line.strip().split()
-            utt_id = items[0]
-            if args.lang == 'zh':
-                sentence = "".join(items[1:])
-            elif args.lang == 'en':
-                sentence = " ".join(items[1:])
-            sentences.append((utt_id, sentence))
-
-    with open(args.phones_dict, "r") as f:
-        phn_id = [line.strip().split() for line in f.readlines()]
-    vocab_size = len(phn_id)
-    print("vocab_size:", vocab_size)
-
-    tone_size = None
-    if args.tones_dict:
-        with open(args.tones_dict, "r") as f:
-            tone_id = [line.strip().split() for line in f.readlines()]
-        tone_size = len(tone_id)
-        print("tone_size:", tone_size)
-
-    spk_num = None
-    if args.speaker_dict:
-        with open(args.speaker_dict, 'rt') as f:
-            spk_id = [line.strip().split() for line in f.readlines()]
-        spk_num = len(spk_id)
-        print("spk_num:", spk_num)
+    sentences = get_sentences(args)

     # frontend
-    if args.lang == 'zh':
-        frontend = Frontend(
-            phone_vocab_path=args.phones_dict,
-            tone_vocab_path=args.tones_dict)
-    elif args.lang == 'en':
-        frontend = English(phone_vocab_path=args.phones_dict)
-    print("frontend done!")
+    frontend = get_frontend(args)

     # acoustic model
-    odim = am_config.n_mels
-    # model: {model_name}_{dataset}
-    am_name = args.am[:args.am.rindex('_')]
-    am_dataset = args.am[args.am.rindex('_') + 1:]
-
-    am_class = dynamic_import(am_name, model_alias)
-    am_inference_class = dynamic_import(am_name + '_inference', model_alias)
-
-    if am_name == 'fastspeech2':
-        am = am_class(
-            idim=vocab_size, odim=odim, spk_num=spk_num, **am_config["model"])
-    elif am_name == 'speedyspeech':
-        am = am_class(
-            vocab_size=vocab_size,
-            tone_size=tone_size,
-            spk_num=spk_num,
-            **am_config["model"])
-    elif am_name == 'tacotron2':
-        am = am_class(idim=vocab_size, odim=odim, **am_config["model"])
-
-    am.set_state_dict(paddle.load(args.am_ckpt)["main_params"])
-    am.eval()
-    am_mu, am_std = np.load(args.am_stat)
-    am_mu = paddle.to_tensor(am_mu)
-    am_std = paddle.to_tensor(am_std)
-    am_normalizer = ZScore(am_mu, am_std)
-    am_inference = am_inference_class(am_normalizer, am)
-    am_inference.eval()
-    print("acoustic model done!")
+    am_inference, am_name, am_dataset = get_am_inference(args, am_config)

     # vocoder
-    # model: {model_name}_{dataset}
-    voc_name = args.voc[:args.voc.rindex('_')]
-    voc_class = dynamic_import(voc_name, model_alias)
-    voc_inference_class = dynamic_import(voc_name + '_inference', model_alias)
-    if voc_name != 'wavernn':
-        voc = voc_class(**voc_config["generator_params"])
-        voc.set_state_dict(paddle.load(args.voc_ckpt)["generator_params"])
-        voc.remove_weight_norm()
-        voc.eval()
-    else:
-        voc = voc_class(**voc_config["model"])
-        voc.set_state_dict(paddle.load(args.voc_ckpt)["main_params"])
-        voc.eval()
-    voc_mu, voc_std = np.load(args.voc_stat)
-    voc_mu = paddle.to_tensor(voc_mu)
-    voc_std = paddle.to_tensor(voc_std)
-    voc_normalizer = ZScore(voc_mu, voc_std)
-    voc_inference = voc_inference_class(voc_normalizer, voc)
-    voc_inference.eval()
-    print("voc done!")
+    voc_inference = get_voc_inference(args, voc_config)

     # whether dygraph to static
     if args.inference_dir:
         # acoustic model
-        if am_name == 'fastspeech2':
-            if am_dataset in {"aishell3", "vctk"} and args.speaker_dict:
-                am_inference = jit.to_static(
-                    am_inference,
-                    input_spec=[
-                        InputSpec([-1], dtype=paddle.int64),
-                        InputSpec([1], dtype=paddle.int64)
-                    ])
-            else:
-                am_inference = jit.to_static(
-                    am_inference,
-                    input_spec=[InputSpec([-1], dtype=paddle.int64)])
-        elif am_name == 'speedyspeech':
-            if am_dataset in {"aishell3", "vctk"} and args.speaker_dict:
-                am_inference = jit.to_static(
-                    am_inference,
-                    input_spec=[
-                        InputSpec([-1], dtype=paddle.int64),  # text
-                        InputSpec([-1], dtype=paddle.int64),  # tone
-                        InputSpec([1], dtype=paddle.int64),  # spk_id
-                        None  # duration
-                    ])
-            else:
-                am_inference = jit.to_static(
-                    am_inference,
-                    input_spec=[
-                        InputSpec([-1], dtype=paddle.int64),
-                        InputSpec([-1], dtype=paddle.int64)
-                    ])
-        elif am_name == 'tacotron2':
-            am_inference = jit.to_static(
-                am_inference, input_spec=[InputSpec([-1], dtype=paddle.int64)])
-
-        paddle.jit.save(am_inference, os.path.join(args.inference_dir, args.am))
-        am_inference = paddle.jit.load(os.path.join(args.inference_dir, args.am))
+        am_inference = am_to_static(args, am_inference, am_name, am_dataset)

         # vocoder
-        voc_inference = jit.to_static(
-            voc_inference,
-            input_spec=[
-                InputSpec([-1, 80], dtype=paddle.float32),
-            ])
-        paddle.jit.save(voc_inference, os.path.join(args.inference_dir, args.voc))
-        voc_inference = paddle.jit.load(os.path.join(args.inference_dir, args.voc))
+        voc_inference = voc_to_static(args, voc_inference)

     output_dir = Path(args.output_dir)
     output_dir.mkdir(parents=True, exist_ok=True)

-    merge_sentences = False
+    merge_sentences = True
     # Avoid not stopping at the end of a sub sentence when tacotron2_ljspeech dygraph to static graph
     # but still not stopping in the end (NOTE by yuantian01 Feb 9 2022)
     if am_name == 'tacotron2':
...
@@ -266,6 +100,8 @@ def evaluate(args):
                     spk_id = paddle.to_tensor(args.spk_id)
                     mel = am_inference(part_phone_ids, spk_id)
                 else:
+                    # import pdb
+                    # pdb.set_trace()
                     mel = am_inference(part_phone_ids)
             elif am_name == 'speedyspeech':
                 part_tone_ids = tone_ids[i]
...
@@ -298,7 +134,7 @@ def evaluate(args):
     print(f"generation speed: {N / T}Hz, RTF: {am_config.fs / (N / T) }")


-def main():
+def parse_args():
     # parse args and config and redirect to train_sp
     parser = argparse.ArgumentParser(
         description="Synthesize with acoustic model & vocoder")
...
@@ -351,7 +187,6 @@ def main():
             'wavernn_csmsc'
         ],
         help='Choose vocoder type of tts task.')
-
     parser.add_argument(
         '--voc_config',
         type=str,
...
@@ -386,6 +221,11 @@ def main():
     parser.add_argument("--output_dir", type=str, help="output dir.")

     args = parser.parse_args()
+    return args
+
+
+def main():
+    args = parse_args()

     if args.ngpu == 0:
         paddle.set_device("cpu")
...
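The am_to_static / voc_to_static calls above wrap the dygraph-to-static round trip that this file previously inlined. A minimal sketch of that round trip, using a toy layer in place of a real vocoder; the layer, paths, and shapes here are hypothetical:

    import os

    import paddle
    from paddle import jit
    from paddle.static import InputSpec

    class ToyVoc(paddle.nn.Layer):
        def __init__(self):
            super().__init__()
            self.linear = paddle.nn.Linear(80, 1)  # maps 80 mel bins to 1 value

        def forward(self, mel):
            return self.linear(mel)

    voc = ToyVoc()
    voc.eval()
    # Trace with a variable-length [-1, 80] float32 spec, as voc_to_static does.
    static_voc = jit.to_static(
        voc, input_spec=[InputSpec([-1, 80], dtype=paddle.float32)])
    # Saving writes toy_voc.pdmodel / toy_voc.pdiparams; this is the file pair
    # that inference.py's get_predictor() later hands to paddle.inference.Config.
    paddle.jit.save(static_voc, os.path.join("exp/inference", "toy_voc"))
    reloaded = paddle.jit.load(os.path.join("exp/inference", "toy_voc"))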
paddlespeech/t2s/exps/voice_cloning.py
...
@@ -21,29 +21,12 @@ import soundfile as sf
 import yaml
 from yacs.config import CfgNode

-from paddlespeech.s2t.utils.dynamic_import import dynamic_import
+from paddlespeech.t2s.exps.syn_utils import get_am_inference
+from paddlespeech.t2s.exps.syn_utils import get_voc_inference
 from paddlespeech.t2s.frontend.zh_frontend import Frontend
-from paddlespeech.t2s.modules.normalizer import ZScore
 from paddlespeech.vector.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor
 from paddlespeech.vector.models.lstm_speaker_encoder import LSTMSpeakerEncoder

-model_alias = {
-    # acoustic model
-    "fastspeech2": "paddlespeech.t2s.models.fastspeech2:FastSpeech2",
-    "fastspeech2_inference":
-    "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference",
-    "tacotron2": "paddlespeech.t2s.models.tacotron2:Tacotron2",
-    "tacotron2_inference":
-    "paddlespeech.t2s.models.tacotron2:Tacotron2Inference",
-    # voc
-    "pwgan": "paddlespeech.t2s.models.parallel_wavegan:PWGGenerator",
-    "pwgan_inference": "paddlespeech.t2s.models.parallel_wavegan:PWGInference",
-}
-

 def voice_cloning(args):
     # Init body.
...
@@ -79,55 +62,14 @@ def voice_cloning(args):
     speaker_encoder.eval()
     print("GE2E Done!")

-    with open(args.phones_dict, "r") as f:
-        phn_id = [line.strip().split() for line in f.readlines()]
-    vocab_size = len(phn_id)
-    print("vocab_size:", vocab_size)
+    frontend = Frontend(phone_vocab_path=args.phones_dict)
+    print("frontend done!")

     # acoustic model
-    odim = am_config.n_mels
-    # model: {model_name}_{dataset}
-    am_name = args.am[:args.am.rindex('_')]
-    am_dataset = args.am[args.am.rindex('_') + 1:]
-
-    am_class = dynamic_import(am_name, model_alias)
-    am_inference_class = dynamic_import(am_name + '_inference', model_alias)
-
-    if am_name == 'fastspeech2':
-        am = am_class(
-            idim=vocab_size, odim=odim, spk_num=None, **am_config["model"])
-    elif am_name == 'tacotron2':
-        am = am_class(idim=vocab_size, odim=odim, **am_config["model"])
-
-    am.set_state_dict(paddle.load(args.am_ckpt)["main_params"])
-    am.eval()
-    am_mu, am_std = np.load(args.am_stat)
-    am_mu = paddle.to_tensor(am_mu)
-    am_std = paddle.to_tensor(am_std)
-    am_normalizer = ZScore(am_mu, am_std)
-    am_inference = am_inference_class(am_normalizer, am)
-    am_inference.eval()
-    print("acoustic model done!")
+    am_inference, *_ = get_am_inference(args, am_config)

     # vocoder
-    # model: {model_name}_{dataset}
-    voc_name = args.voc[:args.voc.rindex('_')]
-    voc_class = dynamic_import(voc_name, model_alias)
-    voc_inference_class = dynamic_import(voc_name + '_inference', model_alias)
-    voc = voc_class(**voc_config["generator_params"])
-    voc.set_state_dict(paddle.load(args.voc_ckpt)["generator_params"])
-    voc.remove_weight_norm()
-    voc.eval()
-    voc_mu, voc_std = np.load(args.voc_stat)
-    voc_mu = paddle.to_tensor(voc_mu)
-    voc_std = paddle.to_tensor(voc_std)
-    voc_normalizer = ZScore(voc_mu, voc_std)
-    voc_inference = voc_inference_class(voc_normalizer, voc)
-    voc_inference.eval()
-    print("voc done!")
-
-    frontend = Frontend(phone_vocab_path=args.phones_dict)
-    print("frontend done!")
+    voc_inference = get_voc_inference(args, voc_config)

     output_dir = Path(args.output_dir)
     output_dir.mkdir(parents=True, exist_ok=True)
...
@@ -170,7 +112,7 @@ def voice_cloning(args):
     print(f"{utt_id} done!")


-def main():
+def parse_args():
     # parse args and config and redirect to train_sp
     parser = argparse.ArgumentParser(description="")
     parser.add_argument(
...
@@ -240,6 +182,11 @@ def main():
     parser.add_argument("--output-dir", type=str, help="output dir.")

     args = parser.parse_args()
+    return args
+
+
+def main():
+    args = parse_args()

     if args.ngpu == 0:
         paddle.set_device("cpu")
...
paddlespeech/t2s/modules/predictor/length_regulator.py
...
@@ -101,6 +101,16 @@ class LengthRegulator(nn.Layer):
         assert alpha > 0
         ds = paddle.round(ds.cast(dtype=paddle.float32) * alpha)
         ds = ds.cast(dtype=paddle.int64)
+        '''
+        from distutils.version import LooseVersion
+        from paddlespeech.t2s.modules.nets_utils import pad_list
+        # dygraph-to-static conversion does not work here on paddle 2.2.2
+        # if LooseVersion(paddle.__version__) >= "2.3.0" or hasattr(paddle, 'repeat_interleave'):
+        # if LooseVersion(paddle.__version__) >= "2.3.0":
+        if hasattr(paddle, 'repeat_interleave'):
+            repeat = [paddle.repeat_interleave(x, d, axis=0) for x, d in zip(xs, ds)]
+            return pad_list(repeat, self.pad_value)
+        '''
         if is_inference:
             return self.expand(xs, ds)
         else:
...
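The commented-out fast path above would expand each input frame by its rounded duration: paddle.repeat_interleave repeats rows along axis 0, which is exactly what the hasattr guard probes for. A tiny worked example of that expansion:

    import paddle

    x = paddle.to_tensor([[1.0, 1.0], [2.0, 2.0]])  # two frames, feature dim 2
    d = paddle.to_tensor([2, 3])                    # per-frame durations
    out = paddle.repeat_interleave(x, d, axis=0)
    # out is [[1, 1], [1, 1], [2, 2], [2, 2], [2, 2]] with shape [5, 2]:
    # frame 0 repeated twice, frame 1 repeated three times.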