Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
d53c4994
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 2 年 前同步成功
通知
210
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
d53c4994
编写于
6月 08, 2023
作者:
H
Hui Zhang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix long text oom using ssml; filter comma; update polyphonic
上级
108e73e1
变更
14
隐藏空白更改
内联
并排
Showing
14 changed files
with
169 additions
and
79 deletions
+169
-79
paddlespeech/t2s/assets/__init__.py
paddlespeech/t2s/assets/__init__.py
+13
-0
paddlespeech/t2s/exps/synthesize_e2e.py
paddlespeech/t2s/exps/synthesize_e2e.py
+2
-2
paddlespeech/t2s/frontend/canton_frontend.py
paddlespeech/t2s/frontend/canton_frontend.py
+1
-1
paddlespeech/t2s/frontend/en_frontend.py
paddlespeech/t2s/frontend/en_frontend.py
+13
-0
paddlespeech/t2s/frontend/mix_frontend.py
paddlespeech/t2s/frontend/mix_frontend.py
+60
-41
paddlespeech/t2s/frontend/phonectic.py
paddlespeech/t2s/frontend/phonectic.py
+1
-1
paddlespeech/t2s/frontend/polyphonic.py
paddlespeech/t2s/frontend/polyphonic.py
+38
-0
paddlespeech/t2s/frontend/polyphonic.yaml
paddlespeech/t2s/frontend/polyphonic.yaml
+4
-1
paddlespeech/t2s/frontend/sing_frontend.py
paddlespeech/t2s/frontend/sing_frontend.py
+1
-1
paddlespeech/t2s/frontend/ssml/__init__.py
paddlespeech/t2s/frontend/ssml/__init__.py
+1
-1
paddlespeech/t2s/frontend/ssml/xml_processor.py
paddlespeech/t2s/frontend/ssml/xml_processor.py
+24
-5
paddlespeech/t2s/frontend/tone_sandhi.py
paddlespeech/t2s/frontend/tone_sandhi.py
+2
-2
paddlespeech/t2s/frontend/zh_frontend.py
paddlespeech/t2s/frontend/zh_frontend.py
+8
-23
tests/unit/tts/test_mixfrontend.py
tests/unit/tts/test_mixfrontend.py
+1
-1
未找到文件。
paddlespeech/t2s/assets/__init__.py
浏览文件 @
d53c4994
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
paddlespeech/t2s/exps/synthesize_e2e.py
浏览文件 @
d53c4994
...
...
@@ -117,7 +117,7 @@ def evaluate(args):
sentences
=
get_sentences
(
text_file
=
args
.
text
,
lang
=
args
.
lang
)
for
utt_id
,
sentence
in
sentences
:
print
(
f
"
{
utt_id
}
{
sentence
}
...
"
)
print
(
f
"
{
utt_id
}
{
sentence
}
"
)
with
timer
()
as
t
:
if
am_name
==
"diffsinger"
:
text
=
""
...
...
@@ -135,7 +135,7 @@ def evaluate(args):
lang
=
args
.
lang
,
svs_input
=
svs_input
)
phone_ids
=
frontend_dict
[
'phone_ids'
]
# pprint(f"
process:
{utt_id} {phone_ids}")
# pprint(f"{utt_id} {phone_ids}")
with
paddle
.
no_grad
():
flags
=
0
...
...
paddlespeech/t2s/frontend/canton_frontend.py
浏览文件 @
d53c4994
...
...
@@ -48,7 +48,7 @@ def jyuping_to_phonemes(cantons: List[str]):
class
CantonFrontend
():
def
__init__
(
self
,
phone_vocab_path
:
str
):
self
.
text_normalizer
=
TextNormalizer
()
self
.
punc
=
":,;。?!“”‘’':,;.?!"
self
.
punc
=
"
、
:,;。?!“”‘’':,;.?!"
self
.
vocab_phones
=
{}
if
phone_vocab_path
:
...
...
paddlespeech/t2s/frontend/en_frontend.py
浏览文件 @
d53c4994
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
.phonectic
import
English
paddlespeech/t2s/frontend/mix_frontend.py
浏览文件 @
d53c4994
...
...
@@ -106,76 +106,95 @@ class MixFrontend():
get_tone_ids
:
bool
=
False
,
add_sp
:
bool
=
True
,
to_tensor
:
bool
=
True
)
->
Dict
[
str
,
List
[
paddle
.
Tensor
]]:
''' 1. 添加SSML支持,先列出 文字 和 <say-as>标签内容,
然后添加到tmpSegments数组里
'''
d_inputs
=
MixTextProcessor
.
get_dom_split
(
sentence
)
tmpSegments
=
[]
for
instr
in
d_inputs
:
''' 暂时只支持 say-as '''
if
instr
.
lower
().
startswith
(
"<say-as"
):
tmpSegments
.
append
((
instr
,
"zh"
))
# XML Document Object Model (DOM)
doms
=
MixTextProcessor
.
get_dom_split
(
sentence
)
lang_splits
=
[]
for
dom
in
doms
:
if
dom
.
lower
().
startswith
(
"<say-as pinyin="
):
# `<say-as pinyin=` for zh lang
lang_splits
.
append
((
dom
,
"zh"
))
else
:
tmpSegments
.
extend
(
self
.
split_by_lang
(
instr
))
''' 2. 把zh的merge到一起,避免合成结果中间停顿
'''
# process zh, en and zh/en
lang_splits
.
extend
(
self
.
split_by_lang
(
dom
))
# merge adjacent zh segment
segments
=
[]
currentSeg
=
[
""
,
""
]
for
seg
in
tmpSegmen
ts
:
for
seg
in
lang_spli
ts
:
if
seg
[
1
]
==
"en"
or
seg
[
1
]
==
"other"
:
if
currentSeg
[
0
]
==
''
:
# first see
segments
.
append
(
seg
)
else
:
# zh
currentSeg
[
0
]
=
"<speak>"
+
currentSeg
[
0
]
+
"</speak>"
segments
.
append
(
tuple
(
currentSeg
))
# en
segments
.
append
(
seg
)
# reset
currentSeg
=
[
""
,
""
]
else
:
# zh
if
currentSeg
[
0
]
==
''
:
# first see
currentSeg
[
0
]
=
seg
[
0
]
currentSeg
[
1
]
=
seg
[
1
]
else
:
# merge zh
currentSeg
[
0
]
=
currentSeg
[
0
]
+
seg
[
0
]
if
currentSeg
[
0
]
!=
''
:
# last zh
currentSeg
[
0
]
=
"<speak>"
+
currentSeg
[
0
]
+
"</speak>"
segments
.
append
(
tuple
(
currentSeg
))
phones_list
=
[]
result
=
{}
# 008 我们要去云南 team building, 非常非常 happy.
# seg ('我们要去云南 ', 'zh')
# seg ('team building, ', 'en')
# seg ('非常非常 ', 'zh')
# seg ('happy.', 'en')
# [('<speak>我们要去云南 </speak>', 'zh'), ('team building, ', 'en'), ('<speak>非常非常 </speak>', 'zh'), ('happy.', 'en')]
for
seg
in
segments
:
content
=
seg
[
0
]
lang
=
seg
[
1
]
if
content
!=
''
:
if
lang
==
"en"
:
input_ids
=
self
.
en_frontend
.
get_input_ids
(
content
,
merge_sentences
=
False
,
to_tensor
=
to_tensor
)
if
not
content
:
continue
if
lang
==
"en"
:
input_ids
=
self
.
en_frontend
.
get_input_ids
(
content
,
merge_sentences
=
False
,
to_tensor
=
to_tensor
)
else
:
if
content
.
strip
()
!=
""
and
\
re
.
match
(
r
".*?<speak>.*?</speak>.*"
,
content
,
re
.
DOTALL
):
# process ssml
input_ids
=
self
.
zh_frontend
.
get_input_ids_ssml
(
content
,
merge_sentences
=
False
,
get_tone_ids
=
get_tone_ids
,
to_tensor
=
to_tensor
)
else
:
''' 3. 把带speak tag的中文和普通文字分开处理
'''
if
content
.
strip
()
!=
""
and
\
re
.
match
(
r
".*?<speak>.*?</speak>.*"
,
content
,
re
.
DOTALL
):
input_ids
=
self
.
zh_frontend
.
get_input_ids_ssml
(
content
,
merge_sentences
=
False
,
get_tone_ids
=
get_tone_ids
,
to_tensor
=
to_tensor
)
else
:
input_ids
=
self
.
zh_frontend
.
get_input_ids
(
content
,
merge_sentences
=
False
,
get_tone_ids
=
get_tone_ids
,
to_tensor
=
to_tensor
)
if
add_sp
:
if
to_tensor
:
input_ids
[
"phone_ids"
][
-
1
]
=
paddle
.
concat
(
[
input_ids
[
"phone_ids"
][
-
1
],
self
.
sp_id_tensor
])
else
:
input_ids
[
"phone_ids"
][
-
1
]
=
np
.
concatenate
(
(
input_ids
[
"phone_ids"
][
-
1
],
self
.
sp_id_numpy
))
# process plain text
input_ids
=
self
.
zh_frontend
.
get_input_ids
(
content
,
merge_sentences
=
False
,
get_tone_ids
=
get_tone_ids
,
to_tensor
=
to_tensor
)
if
add_sp
:
# add sp between zh and en
if
to_tensor
:
input_ids
[
"phone_ids"
][
-
1
]
=
paddle
.
concat
(
[
input_ids
[
"phone_ids"
][
-
1
],
self
.
sp_id_tensor
])
else
:
input_ids
[
"phone_ids"
][
-
1
]
=
np
.
concatenate
(
(
input_ids
[
"phone_ids"
][
-
1
],
self
.
sp_id_numpy
))
for
phones
in
input_ids
[
"phone_ids"
]:
phones_list
.
append
(
phones
)
phones_list
.
extend
(
input_ids
[
"phone_ids"
])
if
merge_sentences
:
merge_list
=
paddle
.
concat
(
phones_list
)
...
...
paddlespeech/t2s/frontend/phonectic.py
浏览文件 @
d53c4994
...
...
@@ -55,7 +55,7 @@ class English(Phonetics):
self
.
punctuations
=
get_punctuations
(
"en"
)
self
.
vocab
=
Vocab
(
self
.
phonemes
+
self
.
punctuations
)
self
.
vocab_phones
=
{}
self
.
punc
=
":,;。?!“”‘’':,;.?!"
self
.
punc
=
"
、
:,;。?!“”‘’':,;.?!"
self
.
text_normalizer
=
TextNormalizer
()
if
phone_vocab_path
:
with
open
(
phone_vocab_path
,
'rt'
,
encoding
=
'utf-8'
)
as
f
:
...
...
paddlespeech/t2s/frontend/polyphonic.py
0 → 100644
浏览文件 @
d53c4994
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
import
yaml
class Polyphonic():
    """Lookup table of pronunciation overrides for polyphonic words.

    The table is read from ``polyphonic.yaml`` located in the same directory
    as this module; the value stored under its top-level ``polyphonic`` key
    is kept as ``self.polyphonic_words``.
    """

    def __init__(self):
        """Load ``polyphonic.yaml`` and keep its ``polyphonic`` entry."""
        yaml_path = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), 'polyphonic.yaml')
        with open(yaml_path, 'r', encoding='utf-8') as polyphonic_file:
            # Parse the YAML file.
            polyphonic_dict = yaml.load(
                polyphonic_file, Loader=yaml.FullLoader)
        self.polyphonic_words = polyphonic_dict["polyphonic"]

    def correct_pronunciation(self, word, pinyin):
        """Return the dictionary reading of ``word`` if one is recorded.

        Args:
            word: the word to look up in ``self.polyphonic_words``.
            pinyin: the reading to fall back on when ``word`` is not listed.

        Returns:
            The entry stored for ``word`` when it is a key of
            ``self.polyphonic_words``; otherwise ``pinyin`` unchanged.
        """
        # The leftover debug print() calls were dropped: this method should
        # not write to stdout on every lookup. A single .get() replaces the
        # "in ....keys()" membership test followed by a second indexing.
        return self.polyphonic_words.get(word, pinyin)
paddlespeech/t2s/frontend/polyphonic.yaml
浏览文件 @
d53c4994
...
...
@@ -48,4 +48,7 @@ polyphonic:
唉
:
[
'
ai4'
]
扎实
:
[
'
zha1'
,
'
shi2'
]
干将
:
[
'
gan4'
,
'
jiang4'
]
陈威行
:
[
'
chen2'
,
'
wei1'
,
'
hang2'
]
\ No newline at end of file
陈威行
:
[
'
chen2'
,
'
wei1'
,
'
hang2'
]
郭晟
:
[
'
guo1'
,
'
sheng4'
]
中标
:
[
'
zhong4'
,
'
biao1'
]
抗住
:
[
'
kang2'
,
'
zhu4'
]
\ No newline at end of file
paddlespeech/t2s/frontend/sing_frontend.py
浏览文件 @
d53c4994
...
...
@@ -29,7 +29,7 @@ class SingFrontend():
pinyin_phone_path (str): pinyin to phone file path, a 'pinyin|phones' (like: ba|b a ) pair per line.
phone_vocab_path (str): phone to phone id file path, a 'phone phone id' (like: a 4 ) pair per line.
"""
self
.
punc
=
'[:,;。?!“”‘’
\'
:,;.?!]'
self
.
punc
=
'[
、
:,;。?!“”‘’
\'
:,;.?!]'
self
.
pinyin_phones
=
{
'AP'
:
'AP'
,
'SP'
:
'SP'
}
if
pinyin_phone_path
:
...
...
paddlespeech/t2s/frontend/ssml/__init__.py
浏览文件 @
d53c4994
# Copyright (c) 202
0
PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 202
3
PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
...
...
paddlespeech/t2s/frontend/ssml/xml_processor.py
浏览文件 @
d53c4994
# -*- coding: utf-8 -*-
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
re
import
xml.dom.minidom
import
xml.parsers.expat
...
...
@@ -68,7 +81,8 @@ class MixTextProcessor():
after_xml
=
mat
.
group
(
3
)
# pre with none syllable
ctlist
.
append
([
pre_xml
,
[]])
if
pre_xml
:
ctlist
.
append
([
pre_xml
,
[]])
# between with syllable
# [(sub sentence, [syllables]), ...]
...
...
@@ -77,9 +91,11 @@ class MixTextProcessor():
ctlist
=
ctlist
+
pinyinlist
# post with none syllable
ctlist
.
append
([
after_xml
,
[]])
if
after_xml
:
ctlist
.
append
([
after_xml
,
[]])
else
:
ctlist
.
append
([
mixstr
,
[]])
return
ctlist
@
classmethod
...
...
@@ -94,15 +110,18 @@ class MixTextProcessor():
in_xml
=
mat
.
group
(
2
)
after_xml
=
mat
.
group
(
3
)
ctlist
.
append
(
pre_xml
)
if
pre_xml
:
ctlist
.
append
(
pre_xml
)
dom
=
DomXml
(
in_xml
)
tags
=
dom
.
get_text_and_sayas_tags
()
ctlist
.
extend
(
tags
)
ctlist
.
append
(
after_xml
)
return
ctlist
if
after_xml
:
ctlist
.
append
(
after_xml
)
else
:
ctlist
.
append
(
mixstr
)
return
ctlist
...
...
paddlespeech/t2s/frontend/tone_sandhi.py
浏览文件 @
d53c4994
...
...
@@ -68,9 +68,9 @@ class ToneSandhi():
'男子'
,
'女子'
,
'分子'
,
'原子'
,
'量子'
,
'莲子'
,
'石子'
,
'瓜子'
,
'电子'
,
'人人'
,
'虎虎'
,
'幺幺'
,
'干嘛'
,
'学子'
,
'哈哈'
,
'数数'
,
'袅袅'
,
'局地'
,
'以下'
,
'娃哈哈'
,
'花花草草'
,
'留得'
,
'耕地'
,
'想想'
,
'熙熙'
,
'攘攘'
,
'卵子'
,
'死死'
,
'冉冉'
,
'恳恳'
,
'佼佼'
,
'吵吵'
,
'打打'
,
'考考'
,
'整整'
,
'莘莘'
,
'落地'
,
'算子'
,
'家家户户'
'考考'
,
'整整'
,
'莘莘'
,
'落地'
,
'算子'
,
'家家户户'
,
'青青'
}
self
.
punc
=
":,;。?!“”‘’':,;.?!"
self
.
punc
=
"
、
:,;。?!“”‘’':,;.?!"
def
_split_word
(
self
,
word
:
str
)
->
List
[
str
]:
word_list
=
jieba
.
cut_for_search
(
word
)
...
...
paddlespeech/t2s/frontend/zh_frontend.py
浏览文件 @
d53c4994
...
...
@@ -31,6 +31,7 @@ from pypinyin_dict.phrase_pinyin_data import large_pinyin
from
paddlespeech.t2s.frontend.g2pw
import
G2PWOnnxConverter
from
paddlespeech.t2s.frontend.generate_lexicon
import
generate_lexicon
from
paddlespeech.t2s.frontend.polyphonic
import
Polyphonic
from
paddlespeech.t2s.frontend.rhy_prediction.rhy_predictor
import
RhyPredictor
from
paddlespeech.t2s.frontend.ssml.xml_processor
import
MixTextProcessor
from
paddlespeech.t2s.frontend.tone_sandhi
import
ToneSandhi
...
...
@@ -68,26 +69,6 @@ def insert_after_character(lst, item):
return
result
class Polyphonic():
    """Pronunciation overrides for polyphonic words.

    Reads ``polyphonic.yaml`` from the directory containing this module and
    stores the value under its ``polyphonic`` key as
    ``self.polyphonic_words``.
    """

    def __init__(self):
        """Load the override table from ``polyphonic.yaml``."""
        module_dir = os.path.dirname(os.path.abspath(__file__))
        table_path = os.path.join(module_dir, 'polyphonic.yaml')
        with open(table_path, 'r', encoding='utf-8') as polyphonic_file:
            # Parse the YAML file.
            polyphonic_dict = yaml.load(
                polyphonic_file, Loader=yaml.FullLoader)
        self.polyphonic_words = polyphonic_dict["polyphonic"]

    def correct_pronunciation(self, word, pinyin):
        """Return the corrected reading for ``word``, or ``pinyin`` as given.

        Args:
            word: the word to look up in ``self.polyphonic_words``.
            pinyin: the original reading, returned when ``word`` is absent.

        Returns:
            The stored entry when ``word`` is listed, else ``pinyin``.
        """
        # Membership is tested on the mapping itself rather than on .keys().
        if word in self.polyphonic_words:
            # The word is in the dictionary: return the corrected reading.
            return self.polyphonic_words[word]
        # Otherwise return the original reading.
        return pinyin
class
Frontend
():
def
__init__
(
self
,
g2p_model
=
"g2pW"
,
...
...
@@ -95,7 +76,7 @@ class Frontend():
tone_vocab_path
=
None
,
use_rhy
=
False
):
self
.
punc
=
":,;。?!“”‘’':,;.?!"
self
.
punc
=
"
、
:,;。?!“”‘’':,;.?!"
self
.
rhy_phns
=
[
'sp1'
,
'sp2'
,
'sp3'
,
'sp4'
]
self
.
phrases_dict
=
{
'开户行'
:
[[
'ka1i'
],
[
'hu4'
],
[
'hang2'
]],
...
...
@@ -567,6 +548,7 @@ class Frontend():
phones
=
[]
for
c
,
v
in
zip
(
initials
,
finals
):
# c for consonant, v for vowel
# NOTE: post process for pypinyin outputs
# we discriminate i, ii and iii
if
c
and
c
not
in
self
.
punc
:
...
...
@@ -633,16 +615,19 @@ class Frontend():
new_phonemes
.
append
(
new_sentence
)
all_phonemes
=
new_phonemes
if
merge_sentences
:
all_phonemes
=
[
sum
(
all_phonemes
,
[])]
if
print_info
:
print
(
"----------------------------"
)
print
(
"text norm results:"
)
print
(
sentences
)
print
(
"----------------------------"
)
print
(
"g2p results:"
)
print
(
all_phonemes
[
0
]
)
print
(
all_phonemes
)
print
(
"----------------------------"
)
return
[
sum
(
all_phonemes
,
[])]
return
all_phonemes
def
add_sp_if_no
(
self
,
phonemes
):
"""
...
...
tests/unit/tts/test_mixfrontend.py
浏览文件 @
d53c4994
...
...
@@ -423,7 +423,7 @@ if __name__ == '__main__':
segs
=
frontend
.
split_by_lang
(
text
)
print
(
segs
)
# 对于SSML的xml标记处理不好。
# 对于SSML的xml标记处理不好。
需要先解析SSML,后处理中英的划分。
text
=
"<speak>我们的声学模型使用了 Fast Speech Two。前浪<say-as pinyin='dao3'>倒</say-as>在沙滩上,沙滩上倒了一堆<say-as pinyin='tu3'>土</say-as>。 想象<say-as pinyin='gan1 gan1'>干干</say-as>的树干<say-as pinyin='dao3'>倒</say-as>了, 里面有个干尸,不知是被谁<say-as pinyin='gan4'>干</say-as>死的。</speak>"
print
(
text
)
# [('<speak>', 'en'), ('我们的声学模型使用了 ', 'zh'), ('Fast Speech Two。', 'en'), ('前浪<', 'zh'), ("say-as pinyin='dao3'>", 'en'), ('倒</', 'zh'), ('say-as>', 'en'), ('在沙滩上,沙滩上倒了一堆<', 'zh'), ("say-as pinyin='tu3'>", 'en'), ('土</', 'zh'), ('say-as>。 ', 'en'), ('想象<', 'zh'), ("say-as pinyin='gan1 gan1'>", 'en'), ('干干</', 'zh'), ('say-as>', 'en'), ('的树干<', 'zh'), ("say-as pinyin='dao3'>", 'en'), ('倒</', 'zh'), ('say-as>', 'en'), ('了, 里面有个干尸,不知是被谁<', 'zh'), ("say-as pinyin='gan4'>", 'en'), ('干</', 'zh'), ('say-as>', 'en'), ('死的。</', 'zh'), ('speak>', 'en')]
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录