未验证 提交 1c3d2cb8 编写于 作者: D David An (An Hongliang) 提交者: GitHub

Add double-byte character support for zh normalization (#2661)

上级 94a487bd
...@@ -19,7 +19,7 @@ from pypinyin.constants import SUPPORT_UCS4 ...@@ -19,7 +19,7 @@ from pypinyin.constants import SUPPORT_UCS4
# Full-width <-> half-width conversion tables.
# Full-width -> half-width mapping for English letters (52 entries).
# Keys and values are integer code points, so the dict can be passed
# directly to str.translate(). A full-width ASCII character sits exactly
# 65248 (0xFEE0) code points above its half-width counterpart.
F2H_ASCII_LETTERS = {
    ord(char) + 65248: ord(char)
    for char in string.ascii_letters
}
...@@ -27,12 +27,12 @@ F2H_ASCII_LETTERS = { ...@@ -27,12 +27,12 @@ F2H_ASCII_LETTERS = {
# Inverse of F2H_ASCII_LETTERS: half-width -> full-width English letters.
H2F_ASCII_LETTERS = {half: full for full, half in F2H_ASCII_LETTERS.items()}
# Full-width -> half-width mapping for digits (10 entries).
# Integer code-point keys/values, usable as a str.translate() table.
F2H_DIGITS = {ord(char) + 65248: ord(char) for char in string.digits}
# Half-width -> full-width mapping for digits (inverse of F2H_DIGITS).
H2F_DIGITS = {half: full for full, half in F2H_DIGITS.items()}
# Full-width -> half-width mapping for punctuation (32 entries).
# Integer code-point keys/values, usable as a str.translate() table.
F2H_PUNCTUATIONS = {ord(char) + 65248: ord(char) for char in string.punctuation}
# Half-width -> full-width mapping for punctuation (inverse of F2H_PUNCTUATIONS).
H2F_PUNCTUATIONS = {half: full for full, half in F2H_PUNCTUATIONS.items()}
......
...@@ -74,6 +74,17 @@ class TextNormalizer(): ...@@ -74,6 +74,17 @@ class TextNormalizer():
def _post_replace(self, sentence: str) -> str: def _post_replace(self, sentence: str) -> str:
sentence = sentence.replace('/', '每') sentence = sentence.replace('/', '每')
sentence = sentence.replace('~', '至') sentence = sentence.replace('~', '至')
sentence = sentence.replace('~', '至')
sentence = sentence.replace('①', '一')
sentence = sentence.replace('②', '二')
sentence = sentence.replace('③', '三')
sentence = sentence.replace('④', '四')
sentence = sentence.replace('⑤', '五')
sentence = sentence.replace('⑥', '六')
sentence = sentence.replace('⑦', '七')
sentence = sentence.replace('⑧', '八')
sentence = sentence.replace('⑨', '九')
sentence = sentence.replace('⑩', '十')
return sentence return sentence
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册