Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
1af9bd47
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 1 年 前同步成功
通知
207
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
未验证
提交
1af9bd47
编写于
2月 16, 2023
作者:
H
HuangLiangJie
提交者:
GitHub
2月 16, 2023
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[TTS]Cantonese FastSpeech2 e2e infer, test=tts (#2927)
上级
004a4d60
变更
7
隐藏空白更改
内联
并排
Showing
7 changed file
with
191 addition
and
15 deletion
+191
-15
examples/canton/tts3/local/synthesize_e2e.sh
examples/canton/tts3/local/synthesize_e2e.sh
+53
-0
examples/canton/tts3/run.sh
examples/canton/tts3/run.sh
+2
-1
paddlespeech/t2s/exps/sentences_canton.txt
paddlespeech/t2s/exps/sentences_canton.txt
+7
-0
paddlespeech/t2s/exps/syn_utils.py
paddlespeech/t2s/exps/syn_utils.py
+18
-11
paddlespeech/t2s/exps/synthesize.py
paddlespeech/t2s/exps/synthesize.py
+2
-1
paddlespeech/t2s/exps/synthesize_e2e.py
paddlespeech/t2s/exps/synthesize_e2e.py
+3
-2
paddlespeech/t2s/frontend/canton_frontend.py
paddlespeech/t2s/frontend/canton_frontend.py
+106
-0
未找到文件。
examples/canton/tts3/local/synthesize_e2e.sh
0 → 100755
浏览文件 @
1af9bd47
#!/bin/bash
# End-to-end (text -> wav) synthesis with the Cantonese FastSpeech2 acoustic model.
#
# Usage: synthesize_e2e.sh <config_path> <train_output_path> <ckpt_name>
#   config_path        acoustic-model config passed as --am_config
#   train_output_path  experiment directory; the checkpoint is read from
#                      <train_output_path>/checkpoints/<ckpt_name>, results go to
#                      <train_output_path>/test_e2e and <train_output_path>/inference
#   ckpt_name          checkpoint file name inside the checkpoints directory
#
# BIN_DIR must already be set in the environment; it is used to locate
# synthesize_e2e.py and sentences_canton.txt one directory above it.
#
# Stage 0 uses the pwgan_aishell3 vocoder, stage 1 the hifigan_aishell3 vocoder.
# With the defaults below (stage=0, stop_stage=0) only stage 0 runs.

config_path=$1
train_output_path=$2
ckpt_name=$3

stage=0
stop_stage=0

# All expansions are quoted so that paths containing spaces or glob
# characters are passed to python3 as a single argument.

# pwgan
if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
    FLAGS_allocator_strategy=naive_best_fit \
    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
    python3 "${BIN_DIR}/../synthesize_e2e.py" \
        --am=fastspeech2_canton \
        --am_config="${config_path}" \
        --am_ckpt="${train_output_path}/checkpoints/${ckpt_name}" \
        --am_stat=dump/train/speech_stats.npy \
        --voc=pwgan_aishell3 \
        --voc_config=pwg_aishell3_ckpt_0.5/default.yaml \
        --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
        --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
        --lang=canton \
        --text="${BIN_DIR}/../sentences_canton.txt" \
        --output_dir="${train_output_path}/test_e2e" \
        --phones_dict=dump/phone_id_map.txt \
        --speaker_dict=dump/speaker_id_map.txt \
        --spk_id=0 \
        --inference_dir="${train_output_path}/inference"
fi

# hifigan
if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    echo "in hifigan syn_e2e"
    FLAGS_allocator_strategy=naive_best_fit \
    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
    python3 "${BIN_DIR}/../synthesize_e2e.py" \
        --am=fastspeech2_canton \
        --am_config="${config_path}" \
        --am_ckpt="${train_output_path}/checkpoints/${ckpt_name}" \
        --am_stat=dump/train/speech_stats.npy \
        --voc=hifigan_aishell3 \
        --voc_config=hifigan_aishell3_ckpt_0.2.0/default.yaml \
        --voc_ckpt=hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \
        --voc_stat=hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \
        --lang=canton \
        --text="${BIN_DIR}/../sentences_canton.txt" \
        --output_dir="${train_output_path}/test_e2e" \
        --phones_dict=dump/phone_id_map.txt \
        --speaker_dict=dump/speaker_id_map.txt \
        --spk_id=0 \
        --inference_dir="${train_output_path}/inference"
fi
examples/canton/tts3/run.sh
浏览文件 @
1af9bd47
...
...
@@ -9,7 +9,8 @@ stop_stage=100
conf_path
=
conf/default.yaml
train_output_path
=
exp/default
ckpt_name
=
snapshot_iter_112793.pdz
ckpt_name
=
snapshot_iter_280000.pdz
# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
...
...
paddlespeech/t2s/exps/sentences_canton.txt
0 → 100644
浏览文件 @
1af9bd47
001 白云山爬过一次嘅,好远啊,爬上去都成两个钟
002 睇书咯,番屋企,而家好多人好少睇书噶喎
003 因为如果唔考试嘅话,工资好低噶
004 冇固定噶,你中意休边日就边日噶
005 即系太迟嘅话咧,落班太迟嘅话就喺出边食啲咯
006 是非有公理,慎言莫冒犯别人
007 遇上冷风雨,休太认真
paddlespeech/t2s/exps/syn_utils.py
浏览文件 @
1af9bd47
...
...
@@ -33,6 +33,7 @@ from paddlespeech.t2s.datasets.am_batch_fn import *
from
paddlespeech.t2s.datasets.data_table
import
DataTable
from
paddlespeech.t2s.datasets.vocoder_batch_fn
import
Clip_static
from
paddlespeech.t2s.frontend
import
English
from
paddlespeech.t2s.frontend.canton_frontend
import
CantonFrontend
from
paddlespeech.t2s.frontend.mix_frontend
import
MixFrontend
from
paddlespeech.t2s.frontend.zh_frontend
import
Frontend
from
paddlespeech.t2s.modules.normalizer
import
ZScore
...
...
@@ -111,7 +112,7 @@ def get_sentences(text_file: Optional[os.PathLike], lang: str='zh'):
if
line
.
strip
()
!=
""
:
items
=
re
.
split
(
r
"\s+"
,
line
.
strip
(),
1
)
utt_id
=
items
[
0
]
if
lang
==
'zh'
:
if
lang
in
{
'zh'
,
'canton'
}
:
sentence
=
""
.
join
(
items
[
1
:])
elif
lang
==
'en'
:
sentence
=
" "
.
join
(
items
[
1
:])
...
...
@@ -132,8 +133,8 @@ def get_test_dataset(test_metadata: List[Dict[str, Any]],
converters
=
{}
if
am_name
==
'fastspeech2'
:
fields
=
[
"utt_id"
,
"text"
]
if
am_dataset
in
{
"aishell3"
,
"vctk"
,
"
mix
"
}
and
speaker_dict
is
not
None
:
if
am_dataset
in
{
"aishell3"
,
"vctk"
,
"mix"
,
"
canton
"
}
and
speaker_dict
is
not
None
:
print
(
"multiple speaker fastspeech2!"
)
fields
+=
[
"spk_id"
]
elif
voice_cloning
:
...
...
@@ -177,8 +178,8 @@ def get_dev_dataloader(dev_metadata: List[Dict[str, Any]],
converters
=
{}
if
am_name
==
'fastspeech2'
:
fields
=
[
"utt_id"
,
"text"
]
if
am_dataset
in
{
"aishell3"
,
"vctk"
,
"
mix
"
}
and
speaker_dict
is
not
None
:
if
am_dataset
in
{
"aishell3"
,
"vctk"
,
"mix"
,
"
canton
"
}
and
speaker_dict
is
not
None
:
print
(
"multiple speaker fastspeech2!"
)
collate_fn
=
fastspeech2_multi_spk_batch_fn_static
fields
+=
[
"spk_id"
]
...
...
@@ -266,6 +267,8 @@ def get_frontend(lang: str='zh',
phone_vocab_path
=
phones_dict
,
tone_vocab_path
=
tones_dict
,
use_rhy
=
use_rhy
)
elif
lang
==
'canton'
:
frontend
=
CantonFrontend
(
phone_vocab_path
=
phones_dict
)
elif
lang
==
'en'
:
frontend
=
English
(
phone_vocab_path
=
phones_dict
)
elif
lang
==
'mix'
:
...
...
@@ -302,6 +305,10 @@ def run_frontend(frontend: object,
if
get_tone_ids
:
tone_ids
=
input_ids
[
"tone_ids"
]
outs
.
update
({
'tone_ids'
:
tone_ids
})
elif
lang
==
'canton'
:
input_ids
=
frontend
.
get_input_ids
(
text
,
merge_sentences
=
merge_sentences
,
to_tensor
=
to_tensor
)
phone_ids
=
input_ids
[
"phone_ids"
]
elif
lang
==
'en'
:
input_ids
=
frontend
.
get_input_ids
(
text
,
merge_sentences
=
merge_sentences
,
to_tensor
=
to_tensor
)
...
...
@@ -311,7 +318,7 @@ def run_frontend(frontend: object,
text
,
merge_sentences
=
merge_sentences
,
to_tensor
=
to_tensor
)
phone_ids
=
input_ids
[
"phone_ids"
]
else
:
print
(
"lang should in {'zh', 'en', 'mix'}!"
)
print
(
"lang should in {'zh', 'en', 'mix'
, 'canton'
}!"
)
outs
.
update
({
'phone_ids'
:
phone_ids
})
return
outs
...
...
@@ -411,8 +418,8 @@ def am_to_static(am_inference,
am_name
=
am
[:
am
.
rindex
(
'_'
)]
am_dataset
=
am
[
am
.
rindex
(
'_'
)
+
1
:]
if
am_name
==
'fastspeech2'
:
if
am_dataset
in
{
"aishell3"
,
"vctk"
,
"
mix
"
}
and
speaker_dict
is
not
None
:
if
am_dataset
in
{
"aishell3"
,
"vctk"
,
"mix"
,
"
canton
"
}
and
speaker_dict
is
not
None
:
am_inference
=
jit
.
to_static
(
am_inference
,
input_spec
=
[
...
...
@@ -424,8 +431,8 @@ def am_to_static(am_inference,
am_inference
,
input_spec
=
[
InputSpec
([
-
1
],
dtype
=
paddle
.
int64
)])
elif
am_name
==
'speedyspeech'
:
if
am_dataset
in
{
"aishell3"
,
"vctk"
,
"
mix
"
}
and
speaker_dict
is
not
None
:
if
am_dataset
in
{
"aishell3"
,
"vctk"
,
"mix"
,
"
canton
"
}
and
speaker_dict
is
not
None
:
am_inference
=
jit
.
to_static
(
am_inference
,
input_spec
=
[
...
...
@@ -575,7 +582,7 @@ def get_am_output(
get_tone_ids
=
False
if
am_name
==
'speedyspeech'
:
get_tone_ids
=
True
if
am_dataset
in
{
"aishell3"
,
"vctk"
,
"mix"
}
and
speaker_dict
:
if
am_dataset
in
{
"aishell3"
,
"vctk"
,
"mix"
,
"canton"
}
and
speaker_dict
:
get_spk_id
=
True
spk_id
=
np
.
array
([
spk_id
])
...
...
paddlespeech/t2s/exps/synthesize.py
浏览文件 @
1af9bd47
...
...
@@ -136,7 +136,8 @@ def parse_args():
choices
=
[
'speedyspeech_csmsc'
,
'fastspeech2_csmsc'
,
'fastspeech2_ljspeech'
,
'fastspeech2_aishell3'
,
'fastspeech2_vctk'
,
'tacotron2_csmsc'
,
'tacotron2_ljspeech'
,
'tacotron2_aishell3'
,
'fastspeech2_mix'
'tacotron2_ljspeech'
,
'tacotron2_aishell3'
,
'fastspeech2_mix'
,
'fastspeech2_canton'
],
help
=
'Choose acoustic model type of tts task.'
)
parser
.
add_argument
(
...
...
paddlespeech/t2s/exps/synthesize_e2e.py
浏览文件 @
1af9bd47
...
...
@@ -119,7 +119,7 @@ def evaluate(args):
# acoustic model
if
am_name
==
'fastspeech2'
:
# multi speaker
if
am_dataset
in
{
"aishell3"
,
"vctk"
,
"mix"
}:
if
am_dataset
in
{
"aishell3"
,
"vctk"
,
"mix"
,
"canton"
}:
spk_id
=
paddle
.
to_tensor
(
args
.
spk_id
)
mel
=
am_inference
(
part_phone_ids
,
spk_id
)
else
:
...
...
@@ -167,7 +167,8 @@ def parse_args():
choices
=
[
'speedyspeech_csmsc'
,
'speedyspeech_aishell3'
,
'fastspeech2_csmsc'
,
'fastspeech2_ljspeech'
,
'fastspeech2_aishell3'
,
'fastspeech2_vctk'
,
'tacotron2_csmsc'
,
'tacotron2_ljspeech'
,
'fastspeech2_mix'
'tacotron2_csmsc'
,
'tacotron2_ljspeech'
,
'fastspeech2_mix'
,
'fastspeech2_canton'
],
help
=
'Choose acoustic model type of tts task.'
)
parser
.
add_argument
(
...
...
paddlespeech/t2s/frontend/canton_frontend.py
0 → 100644
浏览文件 @
1af9bd47
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
typing
import
Dict
from
typing
import
List
import
numpy
as
np
import
paddle
import
ToJyutping
from
paddlespeech.t2s.frontend.zh_normalization.text_normlization
import
TextNormalizer
# Prefixes that get_lines() tries, in this order, when splitting a syllable
# into (initial, final). Order matters: the first entry that is a prefix of
# the syllable wins, so e.g. 'g' (listed before 'gw') is chosen for "gwong2".
INITIALS = [
    'p', 'b', 't', 'd', 'ts', 'dz', 'k', 'g', 'kw', 'gw', 'f', 'h', 'l', 'm',
    'ng', 'n', 's', 'y', 'w', 'c', 'z', 'j'
]
# Extra non-syllable tokens (presumably silence / pause markers -- TODO confirm
# against the phone vocabulary used for training).
INITIALS += ['sp', 'spl', 'spn', 'sil']


def get_lines(cantons: List[str]) -> List[str]:
    """Split each syllable into an (initial, final) pair of phones.

    For every string in ``cantons`` the entries of ``INITIALS`` are tried in
    list order; the first one that is a prefix of the syllable becomes the
    initial and the remainder of the string becomes the final. Exactly one
    pair is emitted per matching syllable.

    Syllables that start with none of the ``INITIALS`` entries contribute
    nothing to the result.

    Args:
        cantons: Syllable strings, one per list element.

    Returns:
        A flat list ``[initial_0, final_0, initial_1, final_1, ...]``.
    """
    phones = []
    for canton in cantons:
        for consonant in INITIALS:
            if canton.startswith(consonant):
                split_at = len(consonant)
                phones.extend((canton[:split_at], canton[split_at:]))
                # Stop after the first match: without this a syllable such as
                # "gwong2" would be emitted once for 'g' and again for 'gw'.
                # NOTE(review): this is first-match in INITIALS order, not
                # longest-match -- confirm which one the training lexicon uses.
                break
    return phones
class CantonFrontend():
    """Text frontend that turns Cantonese text into phone-id sequences.

    Pipeline: text normalisation (``TextNormalizer``) -> Jyutping
    romanisation (``ToJyutping``) -> initial/final split (``get_lines``)
    -> vocabulary lookup.
    """

    def __init__(self, phone_vocab_path: str):
        """Create the frontend and optionally load the phone vocabulary.

        Args:
            phone_vocab_path: Path to a UTF-8 text file with one
                ``<phone> <id>`` pair per line. If falsy, the vocabulary
                is left empty.
        """
        self.text_normalizer = TextNormalizer()
        self.punc = ":,;。?!“”‘’':,;.?!"
        self.vocab_phones = {}
        if phone_vocab_path:
            self.vocab_phones = self._load_phone_vocab(phone_vocab_path)

    @staticmethod
    def _load_phone_vocab(phone_vocab_path: str) -> Dict[str, int]:
        """Read a ``<phone> <id>`` file into a ``{phone: int(id)}`` dict.

        Blank lines are skipped. A non-blank line that does not contain
        exactly two whitespace-separated fields raises ``ValueError``.
        """
        vocab = {}
        with open(phone_vocab_path, 'rt', encoding='utf-8') as f:
            for line in f:
                fields = line.split()
                if not fields:
                    # Tolerate empty / trailing blank lines in the vocab file.
                    continue
                phn, phn_id = fields
                vocab[phn] = int(phn_id)
        return vocab

    def _g2p(self, sentences: List[str],
             merge_sentences: bool=True) -> List[List[str]]:
        """Convert each sentence to a list of phones.

        Each sentence is romanised with ``ToJyutping.get_jyutping_text``, the
        result is split on single spaces and passed to ``get_lines``.

        Args:
            sentences: Sentences to convert.
            merge_sentences: Accepted for signature compatibility; this
                implementation does not use it, so sentences are never
                merged and one phone list is returned per input sentence.

        Returns:
            One list of phones per input sentence, in order.
        """
        phones_list = []
        for sentence in sentences:
            phones_str = ToJyutping.get_jyutping_text(sentence)
            phones_split = get_lines(phones_str.split(' '))
            phones_list.append(phones_split)
        return phones_list

    def _p2id(self, phonemes: List[str]) -> np.ndarray:
        """Map phones to vocabulary ids as an ``int64`` array.

        Phones missing from ``self.vocab_phones`` are looked up as ``"sp"``
        instead; a ``KeyError`` is raised if that fallback is needed and
        ``"sp"`` itself is not in the vocabulary.
        """
        vocab = self.vocab_phones
        # replace unk phone with sp
        phone_ids = [vocab[phn if phn in vocab else "sp"] for phn in phonemes]
        return np.array(phone_ids, np.int64)

    def get_phonemes(self,
                     sentence: str,
                     merge_sentences: bool=True,
                     print_info: bool=False) -> List[List[str]]:
        """Normalise ``sentence`` and convert the result to phones.

        Args:
            sentence: Raw input text.
            merge_sentences: Forwarded to ``_g2p`` (which currently does not
                use it).
            print_info: If true, print the normalised sentences and the
                resulting phones to stdout.

        Returns:
            The phone lists produced by ``_g2p`` for the normalised text.
        """
        sentences = self.text_normalizer.normalize(sentence)
        phonemes = self._g2p(sentences, merge_sentences=merge_sentences)
        if print_info:
            print("----------------------------")
            print("text norm results:")
            print(sentences)
            print("----------------------------")
            print("g2p results:")
            print(phonemes)
            print("----------------------------")
        return phonemes

    def get_input_ids(self,
                      sentence: str,
                      merge_sentences: bool=True,
                      print_info: bool=False,
                      to_tensor: bool=True
                      ) -> Dict[str, List["paddle.Tensor"]]:
        """Convert text to the phone-id inputs of the acoustic model.

        Args:
            sentence: Raw input text.
            merge_sentences: Forwarded to ``get_phonemes``.
            print_info: Forwarded to ``get_phonemes``.
            to_tensor: If true, each id array is converted with
                ``paddle.to_tensor``; otherwise the numpy arrays are kept.

        Returns:
            A dict with key ``"phone_ids"`` holding one id sequence per
            non-empty phone list. The key is absent when no phones were
            produced at all.
        """
        phonemes = self.get_phonemes(
            sentence, merge_sentences=merge_sentences, print_info=print_info)
        result = {}
        temp_phone_ids = []
        for phones in phonemes:
            if phones:
                phone_ids = self._p2id(phones)
                # Conversion is optional: per the original note, calling
                # paddle.to_tensor() under onnxruntime makes the first call slow.
                if to_tensor:
                    phone_ids = paddle.to_tensor(phone_ids)
                temp_phone_ids.append(phone_ids)
        if temp_phone_ids:
            result["phone_ids"] = temp_phone_ids
        return result
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录