Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
0359c3f6
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 1 年 前同步成功
通知
206
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
未验证
提交
0359c3f6
编写于
10月 08, 2022
作者:
L
liangym
提交者:
GitHub
10月 08, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Fix mix front (#2493)
* update mix frontend, test=tts
上级
467cfd4e
变更
1
显示空白变更内容
内联
并排
Showing
1 changed file
with
39 addition
and
177 deletion
+39
-177
paddlespeech/t2s/frontend/mix_frontend.py
paddlespeech/t2s/frontend/mix_frontend.py
+39
-177
未找到文件。
paddlespeech/t2s/frontend/mix_frontend.py
浏览文件 @
0359c3f6
...
...
@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
re
from
typing
import
Dict
from
typing
import
List
...
...
@@ -30,7 +29,6 @@ class MixFrontend():
self
.
zh_frontend
=
Frontend
(
phone_vocab_path
=
phone_vocab_path
,
tone_vocab_path
=
tone_vocab_path
)
self
.
en_frontend
=
English
(
phone_vocab_path
=
phone_vocab_path
)
self
.
SENTENCE_SPLITOR
=
re
.
compile
(
r
'([:、,;。?!,;?!][”’]?)'
)
self
.
sp_id
=
self
.
zh_frontend
.
vocab_phones
[
"sp"
]
self
.
sp_id_tensor
=
paddle
.
to_tensor
([
self
.
sp_id
])
...
...
@@ -47,188 +45,56 @@ class MixFrontend():
else
:
return
False
def
is_number
(
self
,
char
):
if
char
>=
'
\u0030
'
and
char
<=
'
\u0039
'
:
return
True
else
:
return
False
def
is_other
(
self
,
char
):
if
not
(
self
.
is_chinese
(
char
)
or
self
.
is_number
(
char
)
or
self
.
is_alphabet
(
char
)):
return
True
else
:
return
False
def
is_end
(
self
,
before_char
,
after_char
)
->
bool
:
flag
=
0
for
char
in
(
before_char
,
after_char
):
if
self
.
is_alphabet
(
char
)
or
char
==
" "
:
flag
+=
1
if
flag
==
2
:
if
not
(
self
.
is_chinese
(
char
)
or
self
.
is_alphabet
(
char
)):
return
True
else
:
return
False
def
_replace
(
self
,
text
:
str
)
->
str
:
new_text
=
""
# get "." indexs
point
=
"."
point_indexs
=
[]
index
=
-
1
for
i
in
range
(
text
.
count
(
point
)):
index
=
text
.
find
(
"."
,
index
+
1
,
len
(
text
))
point_indexs
.
append
(
index
)
# replace "." -> "。" when English sentence ending
if
len
(
point_indexs
)
==
0
:
new_text
=
text
elif
len
(
point_indexs
)
==
1
:
point_index
=
point_indexs
[
0
]
if
point_index
==
0
or
point_index
==
len
(
text
)
-
1
:
new_text
=
text
else
:
if
not
self
.
is_end
(
text
[
point_index
-
1
],
text
[
point_index
+
1
]):
new_text
=
text
else
:
new_text
=
text
[:
point_index
]
+
"。"
+
text
[
point_index
+
1
:]
elif
len
(
point_indexs
)
==
2
:
first_index
=
point_indexs
[
0
]
end_index
=
point_indexs
[
1
]
# first
if
first_index
!=
0
:
if
not
self
.
is_end
(
text
[
first_index
-
1
],
text
[
first_index
+
1
]):
new_text
+=
(
text
[:
first_index
]
+
"."
)
else
:
new_text
+=
(
text
[:
first_index
]
+
"。"
)
else
:
new_text
+=
"."
# last
if
end_index
!=
len
(
text
)
-
1
:
if
not
self
.
is_end
(
text
[
end_index
-
1
],
text
[
end_index
+
1
]):
new_text
+=
text
[
point_indexs
[
-
2
]
+
1
:]
else
:
new_text
+=
(
text
[
point_indexs
[
-
2
]
+
1
:
end_index
]
+
"。"
+
text
[
end_index
+
1
:])
else
:
new_text
+=
"."
else
:
first_index
=
point_indexs
[
0
]
end_index
=
point_indexs
[
-
1
]
# first
if
first_index
!=
0
:
if
not
self
.
is_end
(
text
[
first_index
-
1
],
text
[
first_index
+
1
]):
new_text
+=
(
text
[:
first_index
]
+
"."
)
else
:
new_text
+=
(
text
[:
first_index
]
+
"。"
)
else
:
new_text
+=
"."
# middle
for
j
in
range
(
1
,
len
(
point_indexs
)
-
1
):
point_index
=
point_indexs
[
j
]
if
not
self
.
is_end
(
text
[
point_index
-
1
],
text
[
point_index
+
1
]):
new_text
+=
(
text
[
point_indexs
[
j
-
1
]
+
1
:
point_index
]
+
"."
)
else
:
new_text
+=
(
text
[
point_indexs
[
j
-
1
]
+
1
:
point_index
]
+
"。"
)
# last
if
end_index
!=
len
(
text
)
-
1
:
if
not
self
.
is_end
(
text
[
end_index
-
1
],
text
[
end_index
+
1
]):
new_text
+=
text
[
point_indexs
[
-
2
]
+
1
:]
else
:
new_text
+=
(
text
[
point_indexs
[
-
2
]
+
1
:
end_index
]
+
"。"
+
text
[
end_index
+
1
:])
else
:
new_text
+=
"."
return
new_text
def
_split
(
self
,
text
:
str
)
->
List
[
str
]:
text
=
re
.
sub
(
r
'[《》【】<=>{}()()#&@“”^_|…\\]'
,
''
,
text
)
# 替换英文句子的句号 "." --> "。" 用于后续分句
text
=
self
.
_replace
(
text
)
text
=
self
.
SENTENCE_SPLITOR
.
sub
(
r
'\1\n'
,
text
)
text
=
text
.
strip
()
sentences
=
[
sentence
.
strip
()
for
sentence
in
re
.
split
(
r
'\n+'
,
text
)]
return
sentences
def
_distinguish
(
self
,
text
:
str
)
->
List
[
str
]:
def
get_segment
(
self
,
text
:
str
)
->
List
[
str
]:
# sentence --> [ch_part, en_part, ch_part, ...]
segments
=
[]
types
=
[]
flag
=
0
temp_seg
=
""
temp_lang
=
""
# Determine the type of each character. type: blank, chinese, alphabet, number, unk and point.
for
ch
in
text
:
if
ch
==
"."
:
types
.
append
(
"point"
)
elif
self
.
is_chinese
(
ch
):
if
self
.
is_chinese
(
ch
):
types
.
append
(
"zh"
)
elif
self
.
is_alphabet
(
ch
):
types
.
append
(
"en"
)
elif
ch
==
" "
:
types
.
append
(
"blank"
)
elif
self
.
is_number
(
ch
):
types
.
append
(
"num"
)
else
:
types
.
append
(
"
unk
"
)
types
.
append
(
"
other
"
)
assert
len
(
types
)
==
len
(
text
)
for
i
in
range
(
len
(
types
)):
# find the first char of the seg
if
flag
==
0
:
# 首个字符是中文,英文或者数字
if
types
[
i
]
==
"zh"
or
types
[
i
]
==
"en"
or
types
[
i
]
==
"num"
:
temp_seg
+=
text
[
i
]
temp_lang
=
types
[
i
]
flag
=
1
else
:
# 数字和小数点均与前面的字符合并,类型属于前面一个字符的类型
if
types
[
i
]
==
temp_lang
or
types
[
i
]
==
"num"
or
types
[
i
]
==
"point"
:
if
temp_lang
==
"other"
:
if
types
[
i
]
==
temp_lang
:
temp_seg
+=
text
[
i
]
# 数字与后面的任意字符都拼接
elif
temp_lang
==
"num"
:
else
:
temp_seg
+=
text
[
i
]
if
types
[
i
]
==
"zh"
or
types
[
i
]
==
"en"
:
temp_lang
=
types
[
i
]
# 如果是空格则与前面字符拼接
elif
types
[
i
]
==
"blank"
:
else
:
if
types
[
i
]
==
temp_lang
:
temp_seg
+=
text
[
i
]
elif
types
[
i
]
==
"other"
:
temp_seg
+=
text
[
i
]
elif
types
[
i
]
==
"unk"
:
pass
else
:
segments
.
append
((
temp_seg
,
temp_lang
))
if
types
[
i
]
==
"zh"
or
types
[
i
]
==
"en"
:
temp_seg
=
text
[
i
]
temp_lang
=
types
[
i
]
flag
=
1
else
:
flag
=
0
temp_seg
=
""
temp_lang
=
""
segments
.
append
((
temp_seg
,
temp_lang
))
...
...
@@ -241,33 +107,29 @@ class MixFrontend():
add_sp
:
bool
=
True
,
to_tensor
:
bool
=
True
)
->
Dict
[
str
,
List
[
paddle
.
Tensor
]]:
sentences
=
self
.
_split
(
sentence
)
segments
=
self
.
get_segment
(
sentence
)
phones_list
=
[]
result
=
{}
for
text
in
sentences
:
phones_seg
=
[]
segments
=
self
.
_distinguish
(
text
)
for
seg
in
segments
:
content
=
seg
[
0
]
lang
=
seg
[
1
]
if
content
!=
''
:
if
lang
==
"en"
:
input_ids
=
self
.
en_frontend
.
get_input_ids
(
content
,
merge_sentences
=
Tru
e
,
to_tensor
=
to_tensor
)
content
,
merge_sentences
=
Fals
e
,
to_tensor
=
to_tensor
)
else
:
input_ids
=
self
.
zh_frontend
.
get_input_ids
(
content
,
merge_sentences
=
Tru
e
,
merge_sentences
=
Fals
e
,
get_tone_ids
=
get_tone_ids
,
to_tensor
=
to_tensor
)
phones_seg
.
append
(
input_ids
[
"phone_ids"
][
0
])
if
add_sp
:
phones_seg
.
append
(
self
.
sp_id_tensor
)
input_ids
[
"phone_ids"
][
-
1
]
=
paddle
.
concat
(
[
input_ids
[
"phone_ids"
][
-
1
],
self
.
sp_id_tensor
])
if
phones_seg
==
[]:
phones_seg
.
append
(
self
.
sp_id_tensor
)
phones
=
paddle
.
concat
(
phones_seg
)
for
phones
in
input_ids
[
"phone_ids"
]:
phones_list
.
append
(
phones
)
if
merge_sentences
:
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录