PaddlePaddle / DeepSpeech — commit ad239eb4 (unverified)
[TTS]add VITS inference (#2972)
Authored by 小湉湉 on Mar 01, 2023; committed via GitHub on Mar 01, 2023.
Parent: ff8c56b0
Showing 8 changed files, with 387 additions and 22 deletions (+387 −22):
examples/csmsc/vits/local/export2lite.sh    +1    −0
examples/csmsc/vits/local/inference.sh      +17   −0
examples/csmsc/vits/local/lite_predict.sh   +18   −0
examples/csmsc/vits/run.sh                  +4    −0
paddlespeech/t2s/exps/lite_syn_utils.py     +11   −10
paddlespeech/t2s/exps/syn_utils.py          +15   −12
paddlespeech/t2s/exps/vits/inference.py     +174  −0
paddlespeech/t2s/exps/vits/lite_predict.py  +147  −0
examples/csmsc/vits/local/export2lite.sh (new symlink, mode 120000)

../../tts3/local/export2lite.sh
(no newline at end of file)
examples/csmsc/vits/local/inference.sh (new file, mode 100755)

#!/bin/bash

train_output_path=$1
add_blank=$2

stage=0
stop_stage=0

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 ${BIN_DIR}/inference.py \
        --inference_dir=${train_output_path}/inference \
        --am=vits_csmsc \
        --text=${BIN_DIR}/../sentences.txt \
        --output_dir=${train_output_path}/pd_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --add-blank=${add_blank}
fi
(no newline at end of file)
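The --text file consumed here, ${BIN_DIR}/../sentences.txt, is a plain list with one "utt_id sentence" pair per line (see the --text help string in inference.py below). The two lines here are illustrative only, not necessarily the actual file contents:

001 凯莫瑞安联合体的经济崩溃，迫在眉睫。
002 对于所有想要离开那片废土，去寻找更美好生活的人来说。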
examples/csmsc/vits/local/lite_predict.sh (new file, mode 100755)

#!/bin/bash

train_output_path=$1
add_blank=$2

stage=0
stop_stage=0

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 ${BIN_DIR}/../lite_predict.py \
        --inference_dir=${train_output_path}/pdlite \
        --am=vits_csmsc \
        --text=${BIN_DIR}/../sentences.txt \
        --output_dir=${train_output_path}/lite_infer_out \
        --phones_dict=dump/phone_id_map.txt \
        --add-blank=${add_blank}
fi
examples/csmsc/vits/run.sh

@@ -35,3 +35,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     # synthesize_e2e, vocoder is pwgan
     CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} ${add_blank} || exit -1
 fi
+
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} ${add_blank} || exit -1
+fi
paddlespeech/t2s/exps/lite_syn_utils.py

@@ -19,15 +19,15 @@ def get_lite_predictor(model_dir: Optional[os.PathLike]=None,
     return predictor


-def get_lite_am_output(input: str,
-                       am_predictor,
-                       am: str,
-                       frontend: object,
-                       lang: str='zh',
-                       merge_sentences: bool=True,
-                       speaker_dict: Optional[os.PathLike]=None,
-                       spk_id: int=0, ):
+def get_lite_am_output(input: str,
+                       am_predictor,
+                       am: str,
+                       frontend: object,
+                       lang: str='zh',
+                       merge_sentences: bool=True,
+                       speaker_dict: Optional[os.PathLike]=None,
+                       spk_id: int=0,
+                       add_blank: bool=False):
     am_name = am[:am.rindex('_')]
     am_dataset = am[am.rindex('_') + 1:]
     get_spk_id = False

@@ -43,7 +43,8 @@ def get_lite_am_output(
         text=input,
         merge_sentences=merge_sentences,
         get_tone_ids=get_tone_ids,
-        lang=lang)
+        lang=lang,
+        add_blank=add_blank, )
     if get_tone_ids:
         tone_ids = frontend_dict['tone_ids']
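The add_blank flag threaded through here mirrors the "intersperse" trick from the original VITS recipe: a blank token is inserted between consecutive phone ids before they reach the model. A minimal sketch of the idea; the helper name and blank_id=0 are assumptions for illustration, not the actual PaddleSpeech frontend internals:

# Illustrative only: what "add blank between phones" means in the
# original VITS recipe. Not the PaddleSpeech frontend implementation.
def intersperse(ids, blank_id=0):
    """Return [blank, id0, blank, id1, ..., idN, blank]."""
    out = [blank_id] * (len(ids) * 2 + 1)
    out[1::2] = ids  # place the original ids at the odd positions
    return out

print(intersperse([3, 5, 7]))  # [0, 3, 0, 5, 0, 7, 0]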
paddlespeech/t2s/exps/syn_utils.py

@@ -284,7 +284,8 @@ def run_frontend(frontend: object,
                  merge_sentences: bool=False,
                  get_tone_ids: bool=False,
                  lang: str='zh',
-                 to_tensor: bool=True):
+                 to_tensor: bool=True,
+                 add_blank: bool=False):
     outs = dict()
     if lang == 'zh':
         input_ids = {}

@@ -300,7 +301,8 @@ def run_frontend(frontend: object,
             text,
             merge_sentences=merge_sentences,
             get_tone_ids=get_tone_ids,
-            to_tensor=to_tensor)
+            to_tensor=to_tensor,
+            add_blank=add_blank)
         phone_ids = input_ids["phone_ids"]
         if get_tone_ids:
             tone_ids = input_ids["tone_ids"]

@@ -576,15 +578,15 @@ def get_predictor(
     return predictor


-def get_am_output(input: str,
-                  am_predictor: paddle.nn.Layer,
-                  am: str,
-                  frontend: object,
-                  lang: str='zh',
-                  merge_sentences: bool=True,
-                  speaker_dict: Optional[os.PathLike]=None,
-                  spk_id: int=0, ):
+def get_am_output(input: str,
+                  am_predictor: paddle.nn.Layer,
+                  am: str,
+                  frontend: object,
+                  lang: str='zh',
+                  merge_sentences: bool=True,
+                  speaker_dict: Optional[os.PathLike]=None,
+                  spk_id: int=0,
+                  add_blank: bool=False):
     am_name = am[:am.rindex('_')]
     am_dataset = am[am.rindex('_') + 1:]
     am_input_names = am_predictor.get_input_names()

@@ -601,7 +603,8 @@ def get_am_output(
         text=input,
         merge_sentences=merge_sentences,
         get_tone_ids=get_tone_ids,
-        lang=lang)
+        lang=lang,
+        add_blank=add_blank, )
    if get_tone_ids:
         tone_ids = frontend_dict['tone_ids']
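With the new keyword in place, callers can pass add_blank straight through run_frontend. A minimal usage sketch, assuming run_frontend returns a dict keyed by "phone_ids" as the hunk above suggests; the sample text is made up:

from paddlespeech.t2s.exps.syn_utils import get_frontend
from paddlespeech.t2s.exps.syn_utils import run_frontend

frontend = get_frontend(lang='zh', phones_dict='dump/phone_id_map.txt')
outs = run_frontend(
    frontend=frontend,
    text='你好，欢迎使用语音合成。',
    merge_sentences=True,
    get_tone_ids=False,
    lang='zh',
    add_blank=True)  # new in this commit; defaults to False
phone_ids = outs["phone_ids"]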
paddlespeech/t2s/exps/vits/inference.py (new file, mode 100644)

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from pathlib import Path

import paddle
import soundfile as sf
from timer import timer

from paddlespeech.t2s.exps.syn_utils import get_am_output
from paddlespeech.t2s.exps.syn_utils import get_frontend
from paddlespeech.t2s.exps.syn_utils import get_predictor
from paddlespeech.t2s.exps.syn_utils import get_sentences
from paddlespeech.t2s.utils import str2bool


def parse_args():
    parser = argparse.ArgumentParser(
        description="Paddle Inference with acoustic model & vocoder.")
    # acoustic model
    parser.add_argument(
        '--am',
        type=str,
        default='vits_csmsc',
        choices=['vits_csmsc', 'vits_aishell3'],
        help='Choose acoustic model type of tts task.')
    parser.add_argument(
        "--phones_dict", type=str, default=None, help="phone vocabulary file.")
    parser.add_argument(
        "--speaker_dict", type=str, default=None, help="speaker id map file.")
    parser.add_argument(
        '--spk_id',
        type=int,
        default=0,
        help='spk id for multi speaker acoustic model')
    # other
    parser.add_argument(
        '--lang',
        type=str,
        default='zh',
        help='Choose model language. zh or en or mix')
    parser.add_argument(
        "--text",
        type=str,
        help="text to synthesize, a 'utt_id sentence' pair per line")
    parser.add_argument(
        "--add-blank",
        type=str2bool,
        default=True,
        help="whether to add blank between phones")
    parser.add_argument(
        "--inference_dir", type=str, help="dir to save inference models")
    parser.add_argument("--output_dir", type=str, help="output dir")
    # inference
    parser.add_argument(
        "--use_trt",
        type=str2bool,
        default=False,
        help="whether to use TensorRT or not in GPU", )
    parser.add_argument(
        "--use_mkldnn",
        type=str2bool,
        default=False,
        help="whether to use MKLDNN or not in CPU.", )
    parser.add_argument(
        "--precision",
        type=str,
        default='fp32',
        choices=['fp32', 'fp16', 'bf16', 'int8'],
        help="mode of running")
    parser.add_argument(
        "--device",
        default="gpu",
        choices=["gpu", "cpu"],
        help="Device selected for inference.", )
    parser.add_argument('--cpu_threads', type=int, default=1)

    args, _ = parser.parse_known_args()
    return args


# only inference for models trained with csmsc now
def main():
    args = parse_args()

    paddle.set_device(args.device)

    # frontend
    frontend = get_frontend(lang=args.lang, phones_dict=args.phones_dict)

    # am_predictor
    am_predictor = get_predictor(
        model_dir=args.inference_dir,
        model_file=args.am + ".pdmodel",
        params_file=args.am + ".pdiparams",
        device=args.device,
        use_trt=args.use_trt,
        use_mkldnn=args.use_mkldnn,
        cpu_threads=args.cpu_threads,
        precision=args.precision)
    # model: {model_name}_{dataset}
    am_dataset = args.am[args.am.rindex('_') + 1:]

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    sentences = get_sentences(text_file=args.text, lang=args.lang)

    merge_sentences = True
    add_blank = args.add_blank
    # vits's fs is 22050
    fs = 22050
    # warmup
    for utt_id, sentence in sentences[:3]:
        with timer() as t:
            wav = get_am_output(
                input=sentence,
                am_predictor=am_predictor,
                am=args.am,
                frontend=frontend,
                lang=args.lang,
                merge_sentences=merge_sentences,
                speaker_dict=args.speaker_dict,
                spk_id=args.spk_id,
                add_blank=add_blank)
        speed = wav.size / t.elapse
        rtf = fs / speed
        print(
            f"{utt_id}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
        )

    print("warm up done!")

    N = 0
    T = 0
    for utt_id, sentence in sentences:
        with timer() as t:
            wav = get_am_output(
                input=sentence,
                am_predictor=am_predictor,
                am=args.am,
                frontend=frontend,
                lang=args.lang,
                merge_sentences=merge_sentences,
                speaker_dict=args.speaker_dict,
                spk_id=args.spk_id,
                add_blank=add_blank)
        N += wav.size
        T += t.elapse
        speed = wav.size / t.elapse
        rtf = fs / speed

        sf.write(output_dir / (utt_id + ".wav"), wav, samplerate=fs)
        print(
            f"{utt_id}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
        )
        print(f"{utt_id} done!")
    print(f"generation speed: {N / T}Hz, RTF: {fs / (N / T)} ")


if __name__ == "__main__":
    main()
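The timing printed by this script (and by lite_predict.py below) follows the usual real-time-factor convention: speed is samples generated per wall-clock second, and RTF = fs / speed, which equals synthesis time divided by audio duration, so RTF < 1 means faster than real time. A worked example of that arithmetic; all values are made up:

# Worked example of the RTF arithmetic above; values are illustrative.
fs = 22050                  # VITS output sample rate (Hz)
wav_size = 110250           # 5 s of audio at 22050 Hz
elapse = 0.5                # wall-clock synthesis time (s)
speed = wav_size / elapse   # 220500 samples generated per second
rtf = fs / speed            # 0.1 -> ten times faster than real time
duration = wav_size / fs    # 5.0 s of audio
assert abs(rtf - elapse / duration) < 1e-12  # same quantity, two routes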
paddlespeech/t2s/exps/vits/lite_predict.py (new file, mode 100644)

# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from pathlib import Path

import soundfile as sf
from timer import timer

from paddlespeech.t2s.exps.lite_syn_utils import get_lite_am_output
from paddlespeech.t2s.exps.lite_syn_utils import get_lite_predictor
from paddlespeech.t2s.exps.syn_utils import get_frontend
from paddlespeech.t2s.exps.syn_utils import get_sentences
from paddlespeech.t2s.utils import str2bool


def parse_args():
    parser = argparse.ArgumentParser(
        description="Paddle Inference with acoustic model & vocoder.")
    # acoustic model
    parser.add_argument(
        '--am',
        type=str,
        default='vits_csmsc',
        choices=[
            'vits_csmsc',
            'vits_aishell3',
        ],
        help='Choose acoustic model type of tts task.')
    parser.add_argument(
        "--phones_dict", type=str, default=None, help="phone vocabulary file.")
    parser.add_argument(
        "--speaker_dict", type=str, default=None, help="speaker id map file.")
    parser.add_argument(
        '--spk_id',
        type=int,
        default=0,
        help='spk id for multi speaker acoustic model')
    # other
    parser.add_argument(
        '--lang',
        type=str,
        default='zh',
        help='Choose model language. zh or en or mix')
    parser.add_argument(
        "--text",
        type=str,
        help="text to synthesize, a 'utt_id sentence' pair per line")
    parser.add_argument(
        "--add-blank",
        type=str2bool,
        default=True,
        help="whether to add blank between phones")
    parser.add_argument(
        "--inference_dir", type=str, help="dir to save inference models")
    parser.add_argument("--output_dir", type=str, help="output dir")

    args, _ = parser.parse_known_args()
    return args


# only inference for models trained with csmsc now
def main():
    args = parse_args()

    # frontend
    frontend = get_frontend(lang=args.lang, phones_dict=args.phones_dict)

    # am_predictor
    am_predictor = get_lite_predictor(
        model_dir=args.inference_dir, model_file=args.am + "_x86.nb")
    # model: {model_name}_{dataset}
    am_dataset = args.am[args.am.rindex('_') + 1:]

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    sentences = get_sentences(text_file=args.text, lang=args.lang)

    merge_sentences = True
    add_blank = args.add_blank
    fs = 22050
    # warmup
    for utt_id, sentence in sentences[:3]:
        with timer() as t:
            wav = get_lite_am_output(
                input=sentence,
                am_predictor=am_predictor,
                am=args.am,
                frontend=frontend,
                lang=args.lang,
                merge_sentences=merge_sentences,
                speaker_dict=args.speaker_dict,
                spk_id=args.spk_id,
                add_blank=add_blank)
        speed = wav.size / t.elapse
        rtf = fs / speed
        print(
            f"{utt_id}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
        )

    print("warm up done!")

    N = 0
    T = 0
    for utt_id, sentence in sentences:
        with timer() as t:
            wav = get_lite_am_output(
                input=sentence,
                am_predictor=am_predictor,
                am=args.am,
                frontend=frontend,
                lang=args.lang,
                merge_sentences=merge_sentences,
                speaker_dict=args.speaker_dict,
                spk_id=args.spk_id,
                add_blank=add_blank)
        N += wav.size
        T += t.elapse
        speed = wav.size / t.elapse
        rtf = fs / speed

        sf.write(output_dir / (utt_id + ".wav"), wav, samplerate=fs)
        print(
            f"{utt_id}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
        )
        print(f"{utt_id} done!")
    print(f"generation speed: {N / T}Hz, RTF: {fs / (N / T)} ")


if __name__ == "__main__":
    main()
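Putting the Lite pieces together: export2lite.sh (symlinked above from the tts3 recipe) is expected to produce the optimized vits_csmsc_x86.nb model that lite_predict.py loads. A hedged end-to-end sketch under that assumption; the model_dir path and sample text are illustrative, while the function signatures follow the diffs above:

# Sketch of the Paddle Lite flow added in this commit. model_dir is an
# illustrative path; "_x86.nb" naming follows get_lite_predictor's use
# in lite_predict.py above.
from paddlespeech.t2s.exps.lite_syn_utils import get_lite_am_output
from paddlespeech.t2s.exps.lite_syn_utils import get_lite_predictor
from paddlespeech.t2s.exps.syn_utils import get_frontend

frontend = get_frontend(lang='zh', phones_dict='dump/phone_id_map.txt')
am_predictor = get_lite_predictor(
    model_dir='exp/default/pdlite',       # output of local/export2lite.sh
    model_file='vits_csmsc' + '_x86.nb')  # optimized Paddle Lite model
wav = get_lite_am_output(
    input='你好，欢迎使用语音合成。',
    am_predictor=am_predictor,
    am='vits_csmsc',
    frontend=frontend,
    lang='zh',
    merge_sentences=True,
    add_blank=True)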