diff --git a/examples/text_normalization/local/test_normalization.py b/examples/text_normalization/local/test_normalization.py index 38a38460ec61bfad3407def3646c09848eabe19d..bcf7ee0dae0aa36f39a567003e8e6b03e5f621ee 100644 --- a/examples/text_normalization/local/test_normalization.py +++ b/examples/text_normalization/local/test_normalization.py @@ -1,7 +1,22 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import argparse + from text_processing import normalization -parser = argparse.ArgumentParser(description="Normalize text in Chinese with some rules.") +parser = argparse.ArgumentParser( + description="Normalize text in Chinese with some rules.") parser.add_argument("input", type=str, help="the input sentences") parser.add_argument("output", type=str, help="path to save the output file.") args = parser.parse_args() diff --git a/examples/text_normalization/path.sh b/examples/text_normalization/path.sh index c8b1f1c2f606e890eb0e9fe448b50917eed1e993..7cec3a24d3d80db44e7db6339e80cdd698fe2aec 100644 --- a/examples/text_normalization/path.sh +++ b/examples/text_normalization/path.sh @@ -1,5 +1,4 @@ export MAIN_ROOT=${PWD}/../../ - export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} export LC_ALL=C diff --git a/examples/text_normalization/run.sh b/examples/text_normalization/run.sh index b39de2a208c2265a17d32dc25163ea4fef6b5e8e..c4043a319846bac9daacf41d1c1aeb5b69e7afc2 100755 --- a/examples/text_normalization/run.sh +++ b/examples/text_normalization/run.sh @@ -1,5 +1,4 @@ #!/usr/bin/env bash - source path.sh stage=-1 diff --git a/third_party/text_processing/__ini__.py b/third_party/text_processing/__ini__.py new file mode 100644 index 0000000000000000000000000000000000000000..8d1c8b69c3fce7bea45c73efd06983e3c419a92f --- /dev/null +++ b/third_party/text_processing/__ini__.py @@ -0,0 +1 @@ + diff --git a/third_party/text_processing/normalization/char_convert.py b/third_party/text_processing/normalization/char_convert.py index 1c035a80ea9d3a23bc0db32ba3bc99f94c6f3e1a..bd328f695880ce1af61d1a770012f8445a8662a8 100644 --- a/third_party/text_processing/normalization/char_convert.py +++ b/third_party/text_processing/normalization/char_convert.py @@ -2,6 +2,7 @@ `opencc `_. """ + import opencc _t2s_converter = opencc.OpenCC("t2s.json") @@ -11,4 +12,4 @@ def tranditional_to_simplified(text: str) -> str: return _t2s_converter.convert(text) def simplified_to_traditional(text: str) -> str: - return _s2t_converter.convert(text) \ No newline at end of file + return _s2t_converter.convert(text) diff --git a/third_party/text_processing/normalization/chronology.py b/third_party/text_processing/normalization/chronology.py index 727bbd65076d5f8d509869cd2588306c84471cfe..7143eb58c12ec275f336bd1c81d72ddda64f14f7 100644 --- a/third_party/text_processing/normalization/chronology.py +++ b/third_party/text_processing/normalization/chronology.py @@ -1,6 +1,7 @@ import re from .num import verbalize_cardinal, verbalize_digit, num2str, DIGITS + def _time_num2str(num_string: str) -> str: """A special case for verbalizing number in time.""" result = num2str(num_string.lstrip('0')) @@ -60,4 +61,4 @@ def replace_date2(match: re.Match) -> str: result += f"{verbalize_cardinal(month)}月" if day: result += f"{verbalize_cardinal(day)}日" - return result \ No newline at end of file + return result diff --git a/third_party/text_processing/normalization/constants.py b/third_party/text_processing/normalization/constants.py index bbfccb67b1996830755a01ef6ea4324b83cd2ac5..d5c04a761b7ee1007b241e7db7f327e706bec01b 100644 --- a/third_party/text_processing/normalization/constants.py +++ b/third_party/text_processing/normalization/constants.py @@ -2,6 +2,7 @@ import string import re from pypinyin.constants import SUPPORT_UCS4 + # 全角半角转换 # 英文字符全角 -> 半角映射表 (num: 52) F2H_ASCII_LETTERS = { diff --git a/third_party/text_processing/normalization/num.py b/third_party/text_processing/normalization/num.py index 9b8b0ab3aa77db9825a30cba83ae61033f91f115..60fc1686d08ae40933eb62a22e3a32d650851d0a 100644 --- a/third_party/text_processing/normalization/num.py +++ b/third_party/text_processing/normalization/num.py @@ -2,6 +2,7 @@ Rules to verbalize numbers into Chinese characters. https://zh.wikipedia.org/wiki/中文数字#現代中文 """ + import re from typing import List from collections import OrderedDict diff --git a/third_party/text_processing/normalization/phone.py b/third_party/text_processing/normalization/phone.py index e8bdecd755d31aa341e2f2f7caf7e33f21ac450c..1acc183658b26308b8e7b656df419557307e3f94 100644 --- a/third_party/text_processing/normalization/phone.py +++ b/third_party/text_processing/normalization/phone.py @@ -1,6 +1,7 @@ import re from .num import verbalize_digit + # 规范化固话/手机号码 # 手机 # http://www.jihaoba.com/news/show/13680 @@ -27,4 +28,4 @@ def phone2str(phone_string: str, mobile=True) -> str: def replace_phone(match: re.Match) -> str: - return phone2str(match.group(0)) \ No newline at end of file + return phone2str(match.group(0)) diff --git a/third_party/text_processing/normalization/quantifier.py b/third_party/text_processing/normalization/quantifier.py index 836fc88c2b48eb14de2c589cc321e9728d1203e1..024eb6e017419a8d13aacadd0ab583dc6f9c5960 100644 --- a/third_party/text_processing/normalization/quantifier.py +++ b/third_party/text_processing/normalization/quantifier.py @@ -1,6 +1,7 @@ import re from .num import num2str + # 温度表达式,温度会影响负号的读法 # -3°C 零下三度 RE_TEMPERATURE = re.compile( @@ -14,4 +15,4 @@ def replace_temperature(match: re.Match) -> str: temperature: str = num2str(temperature) unit: str = "摄氏度" if unit == "摄氏度" else "度" result = f"{sign}{temperature}{unit}" - return result \ No newline at end of file + return result diff --git a/third_party/text_processing/normalization/sentence_split.py b/third_party/text_processing/normalization/sentence_split.py index 451371da85e300c4f17f765497e5adfecf4ddb12..5867342ba01998f0ce662891cc857c0c60d95574 100644 --- a/third_party/text_processing/normalization/sentence_split.py +++ b/third_party/text_processing/normalization/sentence_split.py @@ -1,6 +1,7 @@ import re from typing import List + SENTENCE_SPLITOR = re.compile(r'([。!?][”’]?)') def split(text: str) -> List[str]: