Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
13a7fa98
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 2 年 前同步成功
通知
210
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
13a7fa98
编写于
10月 14, 2022
作者:
D
david.95
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
enable chinese words' pinyin specified in text of ssml formats, test=tts
上级
b76968e6
变更
3
隐藏空白更改
内联
并排
Showing
3 changed file
with
323 addition
and
2 deletion
+323
-2
paddlespeech/t2s/exps/syn_utils.py
paddlespeech/t2s/exps/syn_utils.py
+4
-2
paddlespeech/t2s/frontend/zh_frontend.py
paddlespeech/t2s/frontend/zh_frontend.py
+156
-0
paddlespeech/t2s/ssml/xml_processor.py
paddlespeech/t2s/ssml/xml_processor.py
+163
-0
未找到文件。
paddlespeech/t2s/exps/syn_utils.py
浏览文件 @
13a7fa98
...
@@ -13,6 +13,7 @@
...
@@ -13,6 +13,7 @@
# limitations under the License.
# limitations under the License.
import
math
import
math
import
os
import
os
import
re
from
pathlib
import
Path
from
pathlib
import
Path
from
typing
import
Any
from
typing
import
Any
from
typing
import
Dict
from
typing
import
Dict
...
@@ -33,6 +34,7 @@ from paddlespeech.t2s.frontend.mix_frontend import MixFrontend
...
@@ -33,6 +34,7 @@ from paddlespeech.t2s.frontend.mix_frontend import MixFrontend
from
paddlespeech.t2s.frontend.zh_frontend
import
Frontend
from
paddlespeech.t2s.frontend.zh_frontend
import
Frontend
from
paddlespeech.t2s.modules.normalizer
import
ZScore
from
paddlespeech.t2s.modules.normalizer
import
ZScore
from
paddlespeech.utils.dynamic_import
import
dynamic_import
from
paddlespeech.utils.dynamic_import
import
dynamic_import
# remove [W:onnxruntime: xxx] from ort
# remove [W:onnxruntime: xxx] from ort
ort
.
set_default_logger_severity
(
3
)
ort
.
set_default_logger_severity
(
3
)
...
@@ -103,7 +105,7 @@ def get_sentences(text_file: Optional[os.PathLike], lang: str='zh'):
...
@@ -103,7 +105,7 @@ def get_sentences(text_file: Optional[os.PathLike], lang: str='zh'):
sentences
=
[]
sentences
=
[]
with
open
(
text_file
,
'rt'
)
as
f
:
with
open
(
text_file
,
'rt'
)
as
f
:
for
line
in
f
:
for
line
in
f
:
items
=
line
.
strip
().
split
(
)
items
=
re
.
split
(
r
"\s+"
,
line
.
strip
(),
1
)
utt_id
=
items
[
0
]
utt_id
=
items
[
0
]
if
lang
==
'zh'
:
if
lang
==
'zh'
:
sentence
=
""
.
join
(
items
[
1
:])
sentence
=
""
.
join
(
items
[
1
:])
...
@@ -180,7 +182,7 @@ def run_frontend(frontend: object,
...
@@ -180,7 +182,7 @@ def run_frontend(frontend: object,
to_tensor
:
bool
=
True
):
to_tensor
:
bool
=
True
):
outs
=
dict
()
outs
=
dict
()
if
lang
==
'zh'
:
if
lang
==
'zh'
:
input_ids
=
frontend
.
get_input_ids
(
input_ids
=
frontend
.
get_input_ids
_ssml
(
text
,
text
,
merge_sentences
=
merge_sentences
,
merge_sentences
=
merge_sentences
,
get_tone_ids
=
get_tone_ids
,
get_tone_ids
=
get_tone_ids
,
...
...
paddlespeech/t2s/frontend/zh_frontend.py
浏览文件 @
13a7fa98
...
@@ -13,6 +13,7 @@
...
@@ -13,6 +13,7 @@
# limitations under the License.
# limitations under the License.
import
os
import
os
import
re
import
re
from
operator
import
itemgetter
from
typing
import
Dict
from
typing
import
Dict
from
typing
import
List
from
typing
import
List
...
@@ -31,6 +32,7 @@ from paddlespeech.t2s.frontend.g2pw import G2PWOnnxConverter
...
@@ -31,6 +32,7 @@ from paddlespeech.t2s.frontend.g2pw import G2PWOnnxConverter
from
paddlespeech.t2s.frontend.generate_lexicon
import
generate_lexicon
from
paddlespeech.t2s.frontend.generate_lexicon
import
generate_lexicon
from
paddlespeech.t2s.frontend.tone_sandhi
import
ToneSandhi
from
paddlespeech.t2s.frontend.tone_sandhi
import
ToneSandhi
from
paddlespeech.t2s.frontend.zh_normalization.text_normlization
import
TextNormalizer
from
paddlespeech.t2s.frontend.zh_normalization.text_normlization
import
TextNormalizer
from
paddlespeech.t2s.ssml.xml_processor
import
MixTextProcessor
INITIALS
=
[
INITIALS
=
[
'b'
,
'p'
,
'm'
,
'f'
,
'd'
,
't'
,
'n'
,
'l'
,
'g'
,
'k'
,
'h'
,
'zh'
,
'ch'
,
'sh'
,
'b'
,
'p'
,
'm'
,
'f'
,
'd'
,
't'
,
'n'
,
'l'
,
'g'
,
'k'
,
'h'
,
'zh'
,
'ch'
,
'sh'
,
...
@@ -81,6 +83,7 @@ class Frontend():
...
@@ -81,6 +83,7 @@ class Frontend():
g2p_model
=
"g2pW"
,
g2p_model
=
"g2pW"
,
phone_vocab_path
=
None
,
phone_vocab_path
=
None
,
tone_vocab_path
=
None
):
tone_vocab_path
=
None
):
self
.
mix_ssml_processor
=
MixTextProcessor
()
self
.
tone_modifier
=
ToneSandhi
()
self
.
tone_modifier
=
ToneSandhi
()
self
.
text_normalizer
=
TextNormalizer
()
self
.
text_normalizer
=
TextNormalizer
()
self
.
punc
=
":,;。?!“”‘’':,;.?!"
self
.
punc
=
":,;。?!“”‘’':,;.?!"
...
@@ -143,6 +146,7 @@ class Frontend():
...
@@ -143,6 +146,7 @@ class Frontend():
tone_id
=
[
line
.
strip
().
split
()
for
line
in
f
.
readlines
()]
tone_id
=
[
line
.
strip
().
split
()
for
line
in
f
.
readlines
()]
for
tone
,
id
in
tone_id
:
for
tone
,
id
in
tone_id
:
self
.
vocab_tones
[
tone
]
=
int
(
id
)
self
.
vocab_tones
[
tone
]
=
int
(
id
)
self
.
mix_ssml_processor
.
__repr__
()
def
_init_pypinyin
(
self
):
def
_init_pypinyin
(
self
):
large_pinyin
.
load
()
large_pinyin
.
load
()
...
@@ -281,6 +285,65 @@ class Frontend():
...
@@ -281,6 +285,65 @@ class Frontend():
phones_list
.
append
(
merge_list
)
phones_list
.
append
(
merge_list
)
return
phones_list
return
phones_list
def
_split_word_to_char
(
self
,
words
):
res
=
[]
for
x
in
words
:
res
.
append
(
x
)
return
res
# When SSML input specifies pinyin for the text, assign that pinyin to the characters directly
def
_g2p_assign
(
self
,
words
:
List
[
str
],
pinyin_spec
:
List
[
str
],
merge_sentences
:
bool
=
True
)
->
List
[
List
[
str
]]:
phones_list
=
[]
initials
=
[]
finals
=
[]
words
=
self
.
_split_word_to_char
(
words
[
0
])
for
pinyin
,
char
in
zip
(
pinyin_spec
,
words
):
sub_initials
=
[]
sub_finals
=
[]
pinyin
=
pinyin
.
replace
(
"u:"
,
"v"
)
#self.pinyin2phone: is a dict with all pinyin mapped with sheng_mu yun_mu
if
pinyin
in
self
.
pinyin2phone
:
initial_final_list
=
self
.
pinyin2phone
[
pinyin
].
split
(
" "
)
if
len
(
initial_final_list
)
==
2
:
sub_initials
.
append
(
initial_final_list
[
0
])
sub_finals
.
append
(
initial_final_list
[
1
])
elif
len
(
initial_final_list
)
==
1
:
sub_initials
.
append
(
''
)
sub_finals
.
append
(
initial_final_list
[
1
])
else
:
# If it's not pinyin (possibly punctuation) or no conversion is required
sub_initials
.
append
(
pinyin
)
sub_finals
.
append
(
pinyin
)
initials
.
append
(
sub_initials
)
finals
.
append
(
sub_finals
)
initials
=
sum
(
initials
,
[])
finals
=
sum
(
finals
,
[])
phones
=
[]
for
c
,
v
in
zip
(
initials
,
finals
):
# NOTE: post process for pypinyin outputs
# we discriminate i, ii and iii
if
c
and
c
not
in
self
.
punc
:
phones
.
append
(
c
)
if
c
and
c
in
self
.
punc
:
phones
.
append
(
'sp'
)
if
v
and
v
not
in
self
.
punc
:
phones
.
append
(
v
)
phones_list
.
append
(
phones
)
if
merge_sentences
:
merge_list
=
sum
(
phones_list
,
[])
# rm the last 'sp' to avoid the noise at the end
# cause in the training data, no 'sp' in the end
if
merge_list
[
-
1
]
==
'sp'
:
merge_list
=
merge_list
[:
-
1
]
phones_list
=
[]
phones_list
.
append
(
merge_list
)
return
phones_list
def
_merge_erhua
(
self
,
def
_merge_erhua
(
self
,
initials
:
List
[
str
],
initials
:
List
[
str
],
finals
:
List
[
str
],
finals
:
List
[
str
],
...
@@ -396,6 +459,52 @@ class Frontend():
...
@@ -396,6 +459,52 @@ class Frontend():
print
(
"----------------------------"
)
print
(
"----------------------------"
)
return
phonemes
return
phonemes
#@an added for ssml pinyin
def get_phonemes_ssml(self,
                      ssml_inputs: list,
                      merge_sentences: bool=True,
                      with_erhua: bool=True,
                      robot: bool=False,
                      print_info: bool=False) -> List[List[str]]:
    """Convert SSML-derived ``[text, pinyin_list]`` segments to phonemes.

    Each segment's text is normalized first. A segment with an empty
    pinyin list goes through ``self._g2p``; a segment with specified
    pinyin goes through ``self._g2p_assign``.

    Args:
        ssml_inputs: Sequence of items whose element 0 is the text and
            element 1 is the list of specified pinyin (possibly empty).
        merge_sentences: Forwarded to the g2p helpers.
        with_erhua: Forwarded to ``self._g2p``.
        robot: When true, every phone ending in a tone digit 1-5 (except
            ``"er2"``) has that digit replaced by ``"1"``.
        print_info: When true, print debug information to stdout.

    Returns:
        A list holding one flat phoneme list covering all segments.
    """
    all_phonemes = []
    # Pre-bound so the debug print below works for empty ``ssml_inputs``.
    sentences = []
    for word_pinyin_item in ssml_inputs:
        sentence, pinyin_spec = word_pinyin_item[0], word_pinyin_item[1]
        sentences = self.text_normalizer.normalize(sentence)
        if len(pinyin_spec) == 0:
            phonemes = self._g2p(
                sentences,
                merge_sentences=merge_sentences,
                with_erhua=with_erhua)
        else:
            # phonemes come from the pinyin given in the SSML
            phonemes = self._g2p_assign(
                sentences, pinyin_spec, merge_sentences=merge_sentences)
        all_phonemes.extend(phonemes)

    if robot:

        def _flatten_tone(item):
            # `er` only has tone `2`, so "er2" is left untouched.
            if item and item[-1] in "12345" and item != "er2":
                return item[:-1] + "1"
            return item

        all_phonemes = [[_flatten_tone(item) for item in sentence]
                        for sentence in all_phonemes]

    if print_info:
        print("----------------------------")
        print("text norm results:")
        # Only the normalization of the last segment is shown.
        print(sentences)
        print("----------------------------")
        print("g2p results:")
        print(all_phonemes[0] if all_phonemes else [])
        print("----------------------------")

    return [[phone for sentence in all_phonemes for phone in sentence]]
def
get_input_ids
(
self
,
def
get_input_ids
(
self
,
sentence
:
str
,
sentence
:
str
,
merge_sentences
:
bool
=
True
,
merge_sentences
:
bool
=
True
,
...
@@ -405,6 +514,7 @@ class Frontend():
...
@@ -405,6 +514,7 @@ class Frontend():
add_blank
:
bool
=
False
,
add_blank
:
bool
=
False
,
blank_token
:
str
=
"<pad>"
,
blank_token
:
str
=
"<pad>"
,
to_tensor
:
bool
=
True
)
->
Dict
[
str
,
List
[
paddle
.
Tensor
]]:
to_tensor
:
bool
=
True
)
->
Dict
[
str
,
List
[
paddle
.
Tensor
]]:
phonemes
=
self
.
get_phonemes
(
phonemes
=
self
.
get_phonemes
(
sentence
,
sentence
,
merge_sentences
=
merge_sentences
,
merge_sentences
=
merge_sentences
,
...
@@ -437,3 +547,49 @@ class Frontend():
...
@@ -437,3 +547,49 @@ class Frontend():
if
temp_phone_ids
:
if
temp_phone_ids
:
result
[
"phone_ids"
]
=
temp_phone_ids
result
[
"phone_ids"
]
=
temp_phone_ids
return
result
return
result
# @an added for ssml
def get_input_ids_ssml(
        self,
        sentence: str,
        merge_sentences: bool=True,
        get_tone_ids: bool=False,
        robot: bool=False,
        print_info: bool=False,
        add_blank: bool=False,
        blank_token: str="<pad>",
        to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]:
    """Turn text that may contain an SSML ``<speak>`` fragment into ids.

    The input is split with ``MixTextProcessor.get_pinyin_split``,
    converted to phonemes by ``self.get_phonemes_ssml``, and each
    resulting phoneme list is mapped to phone ids (and tone ids, when
    any tones are returned).

    Args:
        sentence: Input text, optionally containing SSML markup.
        merge_sentences: Forwarded to ``get_phonemes_ssml``.
        get_tone_ids: Forwarded to ``self._get_phone_tone``.
        robot: Forwarded to ``get_phonemes_ssml``.
        print_info: Forwarded to ``get_phonemes_ssml``.
        add_blank: When true, phones are passed through
            ``insert_after_character`` with ``blank_token``.
        blank_token: Token handed to ``insert_after_character``.
        to_tensor: When true, id lists are wrapped by ``paddle.to_tensor``.

    Returns:
        A dict with ``"tone_ids"`` and/or ``"phone_ids"``; a key is
        present only when at least one entry was produced for it.
    """

    def _finalize(ids):
        # if use paddle.to_tensor() in onnxruntime, the first time will be too low
        return paddle.to_tensor(ids) if to_tensor else ids

    segments = MixTextProcessor.get_pinyin_split(sentence)
    phonemes = self.get_phonemes_ssml(
        segments,
        merge_sentences=merge_sentences,
        print_info=print_info,
        robot=robot)

    tone_id_batches = []
    phone_id_batches = []
    for part_phonemes in phonemes:
        phones, tones = self._get_phone_tone(
            part_phonemes, get_tone_ids=get_tone_ids)
        if add_blank:
            phones = insert_after_character(phones, blank_token)
        if tones:
            tone_id_batches.append(_finalize(self._t2id(tones)))
        if phones:
            phone_id_batches.append(_finalize(self._p2id(phones)))

    result = {}
    if tone_id_batches:
        result["tone_ids"] = tone_id_batches
    if phone_id_batches:
        result["phone_ids"] = phone_id_batches
    return result
paddlespeech/t2s/ssml/xml_processor.py
0 → 100644
浏览文件 @
13a7fa98
# -*- coding: utf-8 -*-
import
re
import
xml.dom.minidom
import
xml.parsers.expat
from
xml.dom.minidom
import
Node
from
xml.dom.minidom
import
parseString
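'''
Note: XML has five special characters: & < > " '
Option 1: wrap a string that contains special characters in a <![CDATA[ ]]> section.
For example:
<TitleName><![CDATA["姓名"]]></TitleName>
Option 2: write the special characters as XML escape sequences. The escape
sequences for the five characters are:
& &amp;
< &lt;
> &gt;
" &quot;
' &apos;
For example:
<TitleName>&quot;姓名&quot;</TitleName>
'''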
class MixTextProcessor():
    """Split text that mixes plain sentences with a ``<speak>`` SSML fragment."""

    # Matches one <speak>...</speak> fragment anywhere in the text.
    _XML_PATTERN = re.compile(r"<speak>.*?</speak>", re.M | re.S)
    # Captures: text before the fragment, the fragment, text after it.
    _SPLIT_PATTERN = re.compile(r'(.*\s*?)(<speak>.*?</speak>)(.*\s*)$',
                                re.M | re.S)

    def __repr__(self):
        # Must return a string; printing and returning None makes
        # ``repr(obj)`` raise TypeError.
        return "@an MixTextProcessor class"

    def get_xml_content(self, mixstr):
        """Return the ``<speak>...</speak>`` part of ``mixstr``, or None."""
        found = self._XML_PATTERN.search(mixstr)
        return found.group(0) if found else None

    def get_content_split(self, mixstr):
        """Split ``mixstr`` into its non-XML and XML parts, in order.

        Whitespace and punctuation are kept as they are, because XML tag
        attributes contain spaces.

        Returns:
            ``[before, xml, after]`` when a ``<speak>`` fragment is found,
            otherwise ``[mixstr]``.
        """
        mat = self._SPLIT_PATTERN.match(mixstr)
        if mat is None:
            return [mixstr]
        return [mat.group(1), mat.group(2), mat.group(3)]

    @classmethod
    def get_pinyin_split(cls, mixstr):
        """Split ``mixstr`` into ``[text, pinyin_list]`` pairs.

        Text outside the ``<speak>`` fragment is paired with an empty
        pinyin list; the fragment itself is expanded by
        ``DomXml.get_pinyins_for_xml``.

        Returns:
            A list of ``[text, pinyin_list]`` items. Without a ``<speak>``
            fragment the result is ``[[mixstr, []]]``.
        """
        mat = cls._SPLIT_PATTERN.match(mixstr)
        if mat is None:
            return [[mixstr, []]]

        pre_xml, in_xml, after_xml = mat.group(1), mat.group(2), mat.group(3)
        ctlist = [[pre_xml, []]]
        ctlist.extend(DomXml(in_xml).get_pinyins_for_xml())
        ctlist.append([after_xml, []])
        return ctlist
class DomXml():
    """Wrapper around a minidom document parsed from an XML string."""

    def __init__(self, xmlstr):
        """Parse ``xmlstr`` and keep the document, its root and top-level nodes."""
        print("Parse xml str:", xmlstr)
        self.tdom = parseString(xmlstr)  # Document
        self.root = self.tdom.documentElement  # Element
        self.rnode = self.tdom.childNodes  # NodeList

    def _collect_text(self):
        """Gather text found up to two element levels below the top-level nodes."""
        res = []
        for x1 in self.rnode:
            if x1.nodeType == Node.TEXT_NODE:
                # minidom Text nodes expose their content as ``data``.
                res.append(x1.data)
                continue
            for x2 in x1.childNodes:
                if isinstance(x2, xml.dom.minidom.Text):
                    res.append(x2.data)
                    continue
                for x3 in x2.childNodes:
                    if isinstance(x3, xml.dom.minidom.Text):
                        res.append(x3.data)
                    else:
                        print("len(nodes of x3):", len(x3.childNodes))
        return res

    def get_text(self):
        """Return a list with all text content of the XML."""
        return self._collect_text()

    def get_xmlchild_list(self):
        """Return a list with all text content (without tags) and print it."""
        res = self._collect_text()
        print(res)
        return res

    def get_pinyins_for_xml(self):
        """Return the XML content as ``[text, pinyin_list]`` pairs.

        Whitespace is removed from every text. Text directly inside the
        top-level element gets an empty pinyin list. Text inside a child
        element gets that element's ``pinyin`` attribute split on spaces,
        or an empty list when the element has no such attribute.
        """
        res = []
        for x1 in self.rnode:
            if x1.nodeType == Node.TEXT_NODE:
                res.append([re.sub(r"\s+", "", x1.data), []])
                continue
            for x2 in x1.childNodes:
                if isinstance(x2, xml.dom.minidom.Text):
                    res.append([re.sub(r"\s+", "", x2.data), []])
                    continue
                if x2.nodeType != Node.ELEMENT_NODE:
                    # Comments and similar nodes have no attributes and
                    # no children; calling hasAttribute on them fails.
                    continue
                # Decide per element, so a missing attribute never leaves
                # the name unbound or reuses an earlier element's pinyin.
                if x2.hasAttribute('pinyin'):
                    pinyins = x2.getAttribute("pinyin").split(" ")
                else:
                    pinyins = []
                for x3 in x2.childNodes:
                    if isinstance(x3, xml.dom.minidom.Text):
                        res.append([re.sub(r"\s+", "", x3.data), pinyins])
                    else:
                        print("len(nodes of x3):", len(x3.childNodes))
        return res

    def get_all_tags(self, tag_name):
        """Print every ``tag_name`` element that has a ``pinyin`` attribute.

        For each such element the tag name, the attribute value and the
        data of its first child are printed; an empty string is printed
        when the first child is missing or is not a text node.
        """
        for x in self.root.getElementsByTagName(tag_name):
            if not x.hasAttribute('pinyin'):
                continue
            first = x.firstChild
            text = first.data if isinstance(first,
                                            xml.dom.minidom.Text) else ''
            print(x.tagName, 'pinyin', x.getAttribute('pinyin'), text)
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录