PaddlePaddle / DeepSpeech · Commit f7780658
Authored on Aug 16, 2022 by 小湉湉

fix tone sandhi bugs for Chinese frontend

Parent: 83e10fad
Showing 6 changed files with 135 additions and 77 deletions (+135 −77)
examples/other/g2p/README.md                  +3   −3
paddlespeech/resource/pretrained_models.py    +2   −2
paddlespeech/t2s/frontend/g2pw/onnx_api.py    +42  −21
paddlespeech/t2s/frontend/polyphonic.yaml     +19  −1
paddlespeech/t2s/frontend/tone_sandhi.py      +38  −34
paddlespeech/t2s/frontend/zh_frontend.py      +31  −16
examples/other/g2p/README.md

@@ -7,18 +7,18 @@ We use `WER` as an evaluation criterion.
 # Start
 Run the command below to get the results of the test.
 ```bash
 ./run.sh
 ```
-The `avg WER` of g2p is: 0.028952373312476395
+The `avg WER` of g2p is: 0.024219452438490413
 ```text
 ,--------------------------------------------------------------------.
 |                        ./exp/g2p/text.g2p                          |
 |--------------------------------------------------------------------|
 | SPKR    | # Snt  # Wrd  | Corr    Sub    Del    Ins    Err   S.Err |
 |---------+---------------+-----------------------------------------|
-| Sum/Avg |  9996  299181 | 97.2    2.8    0.0    0.1    2.9    53.3 |
+| Sum/Avg |  9996  299181 | 97.6    2.4    0.0    0.0    2.4    49.2 |
 `--------------------------------------------------------------------'
 ```
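The `avg WER` above is produced by the scoring inside `run.sh`. As a rough illustration of what word-level WER measures, here is a minimal sketch (not the project's scoring code); the `wer` function and the example pinyin sequences are made up for illustration:

```python
# Minimal WER sketch (illustration only, not the scoring used by ./run.sh):
# WER = (substitutions + deletions + insertions) / number of reference tokens.
def wer(ref, hyp):
    d = [[0] * (len(hyp) + 1) for _ in range(len(ref) + 1)]
    for i in range(len(ref) + 1):
        d[i][0] = i                      # deletions
    for j in range(len(hyp) + 1):
        d[0][j] = j                      # insertions
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            cost = 0 if ref[i - 1] == hyp[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1,         # deletion
                          d[i][j - 1] + 1,         # insertion
                          d[i - 1][j - 1] + cost)  # substitution / match
    return d[len(ref)][len(hyp)] / len(ref)

# e.g. comparing predicted pinyin tokens against a reference transcription
print(wer("yi2 duan4 shi2 jian1".split(), "yi4 duan4 shi2 jian1".split()))  # 0.25
```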
paddlespeech/resource/pretrained_models.py

@@ -1359,9 +1359,9 @@ g2pw_onnx_models = {
     'G2PWModel': {
         '1.0': {
             'url':
-            'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/g2p/G2PWModel.tar',
+            'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.0.zip',
             'md5':
-            '63bc0894af15a5a591e58b2130a2bcac',
+            '7e049a55547da840502cf99e8a64f20e',
         },
     },
 }
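The entry is keyed by version ('1.0') and carries the new archive URL plus its md5. For context, a minimal sketch of how such a versioned entry could be looked up and its checksum verified before extraction; the `resolve_model` and `md5_matches` helpers are assumptions for illustration, not the repository's `download_and_decompress` implementation:

```python
import hashlib

# Hypothetical lookup for an entry shaped like g2pw_onnx_models['G2PWModel']['1.0'].
def resolve_model(models: dict, name: str, version: str) -> dict:
    return models[name][version]          # e.g. {'url': ..., 'md5': ...}

def md5_matches(archive_path: str, expected_md5: str) -> bool:
    h = hashlib.md5()
    with open(archive_path, 'rb') as f:
        for chunk in iter(lambda: f.read(1 << 20), b''):
            h.update(chunk)
    return h.hexdigest() == expected_md5
```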
paddlespeech/t2s/frontend/g2pw/onnx_api.py

@@ -31,8 +31,11 @@ from paddlespeech.t2s.frontend.g2pw.dataset import get_char_phoneme_labels
 from paddlespeech.t2s.frontend.g2pw.dataset import get_phoneme_labels
 from paddlespeech.t2s.frontend.g2pw.dataset import prepare_onnx_input
 from paddlespeech.t2s.frontend.g2pw.utils import load_config
+from paddlespeech.t2s.frontend.zh_normalization.char_convert import tranditional_to_simplified
 from paddlespeech.utils.env import MODEL_HOME
 
+model_version = '1.0'
+
 
 def predict(session, onnx_input, labels):
     all_preds = []
@@ -62,34 +65,38 @@ class G2PWOnnxConverter:
                  style='bopomofo',
                  model_source=None,
                  enable_non_tradional_chinese=False):
-        if not os.path.exists(os.path.join(model_dir, 'G2PWModel/g2pW.onnx')):
-            uncompress_path = download_and_decompress(
-                g2pw_onnx_models['G2PWModel']['1.0'], model_dir)
+        uncompress_path = download_and_decompress(
+            g2pw_onnx_models['G2PWModel'][model_version], model_dir)
 
         sess_options = onnxruntime.SessionOptions()
         sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
         sess_options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL
         sess_options.intra_op_num_threads = 2
         self.session_g2pW = onnxruntime.InferenceSession(
-            os.path.join(model_dir, 'G2PWModel/g2pW.onnx'),
+            os.path.join(uncompress_path, 'g2pW.onnx'),
             sess_options=sess_options)
         self.config = load_config(
-            os.path.join(model_dir, 'G2PWModel/config.py'), use_default=True)
+            os.path.join(uncompress_path, 'config.py'), use_default=True)
 
         self.model_source = model_source if model_source else self.config.model_source
         self.enable_opencc = enable_non_tradional_chinese
 
         self.tokenizer = BertTokenizer.from_pretrained(self.config.model_source)
 
-        polyphonic_chars_path = os.path.join(model_dir, 'G2PWModel/POLYPHONIC_CHARS.txt')
-        monophonic_chars_path = os.path.join(model_dir, 'G2PWModel/MONOPHONIC_CHARS.txt')
+        polyphonic_chars_path = os.path.join(uncompress_path, 'POLYPHONIC_CHARS.txt')
+        monophonic_chars_path = os.path.join(uncompress_path, 'MONOPHONIC_CHARS.txt')
         self.polyphonic_chars = [
            line.split('\t')
            for line in open(polyphonic_chars_path, encoding='utf-8').read()
            .strip().split('\n')
        ]
+        self.non_polyphonic = {
+            '一', '不', '和', '咋', '嗲', '剖', '差', '攢', '倒', '難', '奔', '勁',
+            '拗', '肖', '瘙', '誒', '泊'
+        }
+        self.non_monophonic = {'似', '攢'}
         self.monophonic_chars = [
            line.split('\t')
            for line in open(monophonic_chars_path, encoding='utf-8').read()
@@ -101,13 +108,27 @@ class G2PWOnnxConverter:
             self.polyphonic_chars)
         self.chars = sorted(list(self.char2phonemes.keys()))
+
+        self.polyphonic_chars_new = set(self.chars)
+        for char in self.non_polyphonic:
+            if char in self.polyphonic_chars_new:
+                self.polyphonic_chars_new.remove(char)
+
+        self.monophonic_chars_dict = {
+            char: phoneme
+            for char, phoneme in self.monophonic_chars
+        }
+        for char in self.non_monophonic:
+            if char in self.monophonic_chars_dict:
+                self.monophonic_chars_dict.pop(char)
+
         self.pos_tags = [
             'UNK', 'A', 'C', 'D', 'I', 'N', 'P', 'T', 'V', 'DE', 'SHI'
         ]
 
         with open(
-                os.path.join(model_dir, 'G2PWModel/bopomofo_to_pinyin_wo_tune_dict.json'),
+                os.path.join(uncompress_path, 'bopomofo_to_pinyin_wo_tune_dict.json'),
                 'r',
                 encoding='utf-8') as fr:
             self.bopomofo_convert_dict = json.load(fr)
@@ -117,7 +138,7 @@ class G2PWOnnxConverter:
         }[style]
 
         with open(
-                os.path.join(model_dir, 'G2PWModel/char_bopomofo_dict.json'),
+                os.path.join(uncompress_path, 'char_bopomofo_dict.json'),
                 'r',
                 encoding='utf-8') as fr:
             self.char_bopomofo_dict = json.load(fr)
@@ -175,25 +196,25 @@ class G2PWOnnxConverter:
         return results
 
     def _prepare_data(self, sentences):
-        polyphonic_chars = set(self.chars)
-        monophonic_chars_dict = {
-            char: phoneme
-            for char, phoneme in self.monophonic_chars
-        }
         texts, query_ids, sent_ids, partial_results = [], [], [], []
         for sent_id, sent in enumerate(sentences):
-            pypinyin_result = pinyin(sent, style=Style.TONE3)
+            # pypinyin works well for Simplified Chinese than Traditional Chinese
+            sent_s = tranditional_to_simplified(sent)
+            pypinyin_result = pinyin(sent_s, style=Style.TONE3)
             partial_result = [None] * len(sent)
             for i, char in enumerate(sent):
-                if char in polyphonic_chars:
+                if char in self.polyphonic_chars_new:
                     texts.append(sent)
                     query_ids.append(i)
                     sent_ids.append(sent_id)
-                elif char in monophonic_chars_dict:
+                elif char in self.monophonic_chars_dict:
                     partial_result[i] = self.style_convert_func(
-                        monophonic_chars_dict[char])
+                        self.monophonic_chars_dict[char])
                 elif char in self.char_bopomofo_dict:
                     partial_result[i] = pypinyin_result[i][0]
                     # partial_result[i] = self.style_convert_func(self.char_bopomofo_dict[char][0])
                 else:
                     partial_result[i] = pypinyin_result[i][0]
 
             partial_results.append(partial_result)
         return texts, query_ids, sent_ids, partial_results
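To make the routing in `_prepare_data` above easier to follow, here is a small, self-contained sketch of the same idea: characters in the filtered polyphonic set are deferred to the model, characters in the monophonic dict get their fixed reading, and everything else falls back to pypinyin. The tiny character sets and the `classify_chars` helper are invented for illustration and are not the repository's data or class:

```python
from pypinyin import Style, pinyin

# Toy stand-ins for the sets built in G2PWOnnxConverter.__init__ (illustrative only).
# In the real class, self.polyphonic_chars_new is set(self.chars) with self.non_polyphonic removed.
polyphonic_chars_new = {'行', '乐'}      # ambiguous characters: ask the g2pW model
monophonic_chars_dict = {'们': 'men5'}   # single-reading characters: fixed answer

def classify_chars(sent):
    """Route characters the way _prepare_data does: model query vs. fixed reading vs. pypinyin."""
    query_ids, partial = [], [None] * len(sent)
    pypinyin_result = pinyin(sent, style=Style.TONE3)
    for i, char in enumerate(sent):
        if char in polyphonic_chars_new:
            query_ids.append(i)                   # resolved later by the ONNX session
        elif char in monophonic_chars_dict:
            partial[i] = monophonic_chars_dict[char]
        else:
            partial[i] = pypinyin_result[i][0]    # pypinyin backup for everything else
    return query_ids, partial

print(classify_chars('我们一行人'))  # 行 goes to the model; the rest comes from the dict / pypinyin
```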
paddlespeech/t2s/frontend/polyphonic.yaml

@@ -23,4 +23,22 @@ polyphonic:
   鸭绿江: ['ya1', 'lu4', 'jiang1']
   撒切尔: ['sa4', 'qie4', 'er3']
   比比皆是: ['bi3', 'bi3', 'jie1', 'shi4']
-  身无长物: ['shen1', 'wu2', 'chang2', 'wu4']
\ No newline at end of file
+  身无长物: ['shen1', 'wu2', 'chang2', 'wu4']
+  手里: ['shou2', 'li3']
+  关卡: ['guan1', 'qia3']
+  怀揣: ['huai2', 'chuai1']
+  挑剔: ['tiao1', 'ti4']
+  供称: ['gong4', 'cheng1']
+  作坊: ['zuo1', 'fang5']
+  中医: ['zhong1', 'yi1']
+  嚷嚷: ['rang1', 'rang5']
+  商厦: ['shang1', 'sha4']
+  大厦: ['da4', 'sha4']
+  刹车: ['sha1', 'che1']
+  嘚瑟: ['de4', 'se5']
+  朝鲜: ['chao2', 'xian3']
+  阿房宫: ['e1', 'pang2', 'gong1']
+  阿胶: ['e1', 'jiao1']
+  咖喱: ['ga1', 'li5']
+  时分: ['shi2', 'fen1']
+  蚌埠: ['beng4', 'bu4']
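These entries act as whole-word overrides: when a listed word appears in the input, its pinyin comes from the YAML instead of the per-character prediction. A minimal sketch of how such a correction table might be applied; the loading path and the `apply_polyphonic` helper are assumptions for illustration, not the repository's `Polyphonic` class:

```python
import yaml

# Hypothetical loader/applier for a table shaped like polyphonic.yaml (illustration only).
with open('polyphonic.yaml', encoding='utf-8') as f:
    corrections = yaml.safe_load(f)['polyphonic']   # {word: [pinyin, ...]}

def apply_polyphonic(word, predicted_pinyins):
    """Replace the predicted pinyin of a whole word if it has a fixed entry."""
    return corrections.get(word, predicted_pinyins)

print(apply_polyphonic('朝鲜', ['zhao1', 'xian1']))   # -> ['chao2', 'xian3']
```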
paddlespeech/t2s/frontend/tone_sandhi.py

@@ -41,30 +41,32 @@ class ToneSandhi():
             '棺材', '棒槌', '棉花', '核桃', '栅栏', '柴火', '架势', '枕头', '枇杷', '机灵', '本事',
             '木头', '木匠', '朋友', '月饼', '月亮', '暖和', '明白', '时候', '新鲜', '故事', '收拾',
             '收成', '提防', '挖苦', '挑剔', '指甲', '指头', '拾掇', '拳头', '拨弄', '招牌', '招呼',
-            '抬举', '护士', '折腾', '扫帚', '打量', '打算', '打点', '打扮', '打听', '打发', '扎实',
-            '扁担', '戒指', '懒得', '意识', '意思', '情形', '悟性', '怪物', '思量', '怎么', '念头',
-            '念叨', '快活', '忙活', '志气', '心思', '得罪', '张罗', '弟兄', '开通', '应酬', '庄稼',
-            '干事', '帮手', '帐篷', '希罕', '师父', '师傅', '巴结', '巴掌', '差事', '工夫', '岁数',
-            '屁股', '尾巴', '少爷', '小气', '小伙', '将就', '对头', '对付', '寡妇', '家伙', '客气',
-            '实在', '官司', '学问', '学生', '字号', '嫁妆', '媳妇', '媒人', '婆家', '娘家', '委屈',
-            '姑娘', '姐夫', '妯娌', '妥当', '妖精', '奴才', '女婿', '头发', '太阳', '大爷', '大方',
-            '大意', '大夫', '多少', '多么', '外甥', '壮实', '地道', '地方', '在乎', '困难', '嘴巴',
-            '嘱咐', '嘟囔', '嘀咕', '喜欢', '喇嘛', '喇叭', '商量', '唾沫', '哑巴', '哈欠', '哆嗦',
-            '咳嗽', '和尚', '告诉', '告示', '含糊', '吓唬', '后头', '名字', '名堂', '合同', '吆喝',
-            '叫唤', '口袋', '厚道', '厉害', '千斤', '包袱', '包涵', '匀称', '勤快', '动静', '动弹',
-            '功夫', '力气', '前头', '刺猬', '刺激', '别扭', '利落', '利索', '利害', '分析', '出息',
-            '凑合', '凉快', '冷战', '冤枉', '冒失', '养活', '关系', '先生', '兄弟', '便宜', '使唤',
-            '佩服', '作坊', '体面', '位置', '似的', '伙计', '休息', '什么', '人家', '亲戚', '亲家',
-            '交情', '云彩', '事情', '买卖', '主意', '丫头', '丧气', '两口', '东西', '东家', '世故',
-            '不由', '不在', '下水', '下巴', '上头', '上司', '丈夫', '丈人', '一辈', '那个', '菩萨',
-            '父亲', '母亲', '咕噜', '邋遢', '费用', '冤家', '甜头', '介绍', '荒唐', '大人', '泥鳅',
-            '幸福', '熟悉', '计划', '扑腾', '蜡烛', '姥爷', '照顾', '喉咙', '吉他', '弄堂', '蚂蚱',
-            '凤凰', '拖沓', '寒碜', '糟蹋', '倒腾', '报复', '逻辑', '盘缠', '喽啰', '牢骚', '咖喱',
-            '扫把', '惦记'
+            '抬举', '护士', '折腾', '扫帚', '打量', '打算', '打扮', '打听', '打发', '扎实', '扁担',
+            '戒指', '懒得', '意识', '意思', '情形', '悟性', '怪物', '思量', '怎么', '念头', '念叨',
+            '快活', '忙活', '志气', '心思', '得罪', '张罗', '弟兄', '开通', '应酬', '庄稼', '干事',
+            '帮手', '帐篷', '希罕', '师父', '师傅', '巴结', '巴掌', '差事', '工夫', '岁数', '屁股',
+            '尾巴', '少爷', '小气', '小伙', '将就', '对头', '对付', '寡妇', '家伙', '客气', '实在',
+            '官司', '学问', '字号', '嫁妆', '媳妇', '媒人', '婆家', '娘家', '委屈', '姑娘', '姐夫',
+            '妯娌', '妥当', '妖精', '奴才', '女婿', '头发', '太阳', '大爷', '大方', '大意', '大夫',
+            '多少', '多么', '外甥', '壮实', '地道', '地方', '在乎', '困难', '嘴巴', '嘱咐', '嘟囔',
+            '嘀咕', '喜欢', '喇嘛', '喇叭', '商量', '唾沫', '哑巴', '哈欠', '哆嗦', '咳嗽', '和尚',
+            '告诉', '告示', '含糊', '吓唬', '后头', '名字', '名堂', '合同', '吆喝', '叫唤', '口袋',
+            '厚道', '厉害', '千斤', '包袱', '包涵', '匀称', '勤快', '动静', '动弹', '功夫', '力气',
+            '前头', '刺猬', '刺激', '别扭', '利落', '利索', '利害', '分析', '出息', '凑合', '凉快',
+            '冷战', '冤枉', '冒失', '养活', '关系', '先生', '兄弟', '便宜', '使唤', '佩服', '作坊',
+            '体面', '位置', '似的', '伙计', '休息', '什么', '人家', '亲戚', '亲家', '交情', '云彩',
+            '事情', '买卖', '主意', '丫头', '丧气', '两口', '东西', '东家', '世故', '不由', '下水',
+            '下巴', '上头', '上司', '丈夫', '丈人', '一辈', '那个', '菩萨', '父亲', '母亲', '咕噜',
+            '邋遢', '费用', '冤家', '甜头', '介绍', '荒唐', '大人', '泥鳅', '幸福', '熟悉', '计划',
+            '扑腾', '蜡烛', '姥爷', '照顾', '喉咙', '吉他', '弄堂', '蚂蚱', '凤凰', '拖沓', '寒碜',
+            '糟蹋', '倒腾', '报复', '逻辑', '盘缠', '喽啰', '牢骚', '咖喱', '扫把', '惦记', '戏弄',
+            '将军', '别人'
         }
         self.must_not_neural_tone_words = {
-            "男子", "女子", "分子", "原子", "量子", "莲子", "石子", "瓜子", "电子", "人人", "虎虎", "幺幺"
+            '男子', '女子', '分子', '原子', '量子', '莲子', '石子', '瓜子', '电子', '人人', '虎虎',
+            '幺幺', '干嘛', '学子', '哈哈', '数数', '袅袅', '局地', '以下', '娃哈哈', '花花草草', '留得',
+            '耕地', '想想', '熙熙', '攘攘', '卵子', '死死', '冉冉', '恳恳', '佼佼', '吵吵', '打打',
+            '考考', '整整', '莘莘'
         }
         self.punc = ":,;。?!“”‘’':,;.?!"
@@ -75,27 +77,24 @@ class ToneSandhi():
     # finals: ['ia1', 'i3']
     def _neural_sandhi(self, word: str, pos: str,
                        finals: List[str]) -> List[str]:
+        if word in self.must_not_neural_tone_words:
+            return finals
         # reduplication words for n. and v. e.g. 奶奶, 试试, 旺旺
         for j, item in enumerate(word):
-            if j - 1 >= 0 and item == word[j - 1] and pos[0] in {"n", "v", "a"} and word not in self.must_not_neural_tone_words:
+            if j - 1 >= 0 and item == word[j - 1] and pos[0] in {"n", "v", "a"}:
                 finals[j] = finals[j][:-1] + "5"
         ge_idx = word.find("个")
-        if len(word) >= 1 and word[-1] in "吧呢哈啊呐噻嘛吖嗨呐哦哒额滴哩哟喽啰耶喔诶":
+        if len(word) >= 1 and word[-1] in "吧呢啊呐噻嘛吖嗨呐哦哒额滴哩哟喽啰耶喔诶":
             finals[-1] = finals[-1][:-1] + "5"
         elif len(word) >= 1 and word[-1] in "的地得":
             finals[-1] = finals[-1][:-1] + "5"
         # e.g. 走了, 看着, 去过
         elif len(word) == 1 and word in "了着过" and pos in {"ul", "uz", "ug"}:
             finals[-1] = finals[-1][:-1] + "5"
-        elif len(word) > 1 and word[-1] in "们子" and pos in {"r", "n"} and word not in self.must_not_neural_tone_words:
+        elif len(word) > 1 and word[-1] in "们子" and pos in {"r", "n"}:
             finals[-1] = finals[-1][:-1] + "5"
-        # e.g. 桌上, 地下, 家里
-        elif len(word) > 1 and word[-1] in "上下里" and pos in {"s", "l", "f"}:
+        # e.g. 桌上, 地下
+        elif len(word) > 1 and word[-1] in "上下" and pos in {"s", "l", "f"}:
             finals[-1] = finals[-1][:-1] + "5"
         # e.g. 上来, 下去
         elif len(word) > 1 and word[-1] in "来去" and word[-2] in "上下进出回过起开":
@@ -147,7 +146,7 @@ class ToneSandhi():
         for i, char in enumerate(word):
             if char == "一" and i + 1 < len(word):
                 # "一" before tone4 should be yi2, e.g. 一段
-                if finals[i + 1][-1] == "4":
+                if finals[i + 1][-1] in {'4', '5'}:
                     finals[i] = finals[i][:-1] + "2"
                 # "一" before non-tone4 should be yi4, e.g. 一天
                 else:
@@ -239,7 +238,12 @@ class ToneSandhi():
         for i, (word, pos) in enumerate(seg):
             if i - 1 >= 0 and word == "一" and i + 1 < len(seg) and seg[i - 1][0] == seg[i + 1][0] and seg[i - 1][1] == "v":
-                new_seg[i - 1][0] = new_seg[i - 1][0] + "一" + new_seg[i - 1][0]
+                if i - 1 < len(new_seg):
+                    new_seg[i - 1][0] = new_seg[i - 1][0] + "一" + new_seg[i - 1][0]
+                else:
+                    new_seg.append([word, pos])
+                    new_seg.append([seg[i + 1][0], pos])
             else:
                 if i - 2 >= 0 and seg[i - 1][0] == "一" and seg[i - 2][0] == word and pos == "v":
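The change in the "一" rule above widens it: "一" now becomes yi2 not only before a tone-4 syllable but also before a neutral-tone (tone-5) syllable. A standalone sketch of that single rule on plain TONE3-style finals; the `yi_sandhi` helper below is illustrative and omits the other branches of the real `ToneSandhi` class:

```python
# Illustrative re-statement of the "一" rule after this commit (not the ToneSandhi class):
# "一" reads yi2 before tone 4 or the neutral tone 5, and yi4 before tones 1/2/3.
def yi_sandhi(word: str, finals: list) -> list:
    for i, char in enumerate(word):
        if char == "一" and i + 1 < len(word):
            if finals[i + 1][-1] in {"4", "5"}:
                finals[i] = finals[i][:-1] + "2"   # e.g. 一段 -> yi2 duan4, 一个 -> yi2 ge5
            else:
                finals[i] = finals[i][:-1] + "4"   # e.g. 一天 -> yi4 tian1
    return finals

print(yi_sandhi("一个", ["i1", "e5"]))   # ['i2', 'e5']: the neutral-tone case is now covered
```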
paddlespeech/t2s/frontend/zh_frontend.py

@@ -84,6 +84,24 @@ class Frontend():
         self.tone_modifier = ToneSandhi()
         self.text_normalizer = TextNormalizer()
         self.punc = ":,;。?!“”‘’':,;.?!"
+        self.phrases_dict = {
+            '开户行': [['ka1i'], ['hu4'], ['hang2']],
+            '发卡行': [['fa4'], ['ka3'], ['hang2']],
+            '放款行': [['fa4ng'], ['kua3n'], ['hang2']],
+            '茧行': [['jia3n'], ['hang2']],
+            '行号': [['hang2'], ['ha4o']],
+            '各地': [['ge4'], ['di4']],
+            '借还款': [['jie4'], ['hua2n'], ['kua3n']],
+            '时间为': [['shi2'], ['jia1n'], ['we2i']],
+            '为准': [['we2i'], ['zhu3n']],
+            '色差': [['se4'], ['cha1']],
+            '嗲': [['dia3']],
+            '呗': [['bei5']],
+            '不': [['bu4']],
+            '咗': [['zuo5']],
+            '嘞': [['lei5']],
+            '掺和': [['chan1'], ['huo5']]
+        }
         # g2p_model can be pypinyin and g2pM and g2pW
         self.g2p_model = g2p_model
         if self.g2p_model == "g2pM":
@@ -91,6 +109,8 @@ class Frontend():
             self.pinyin2phone = generate_lexicon(
                 with_tone=True, with_erhua=False)
         elif self.g2p_model == "g2pW":
+            # use pypinyin as backup for non polyphonic characters in g2pW
+            self._init_pypinyin()
             self.corrector = Polyphonic()
             self.g2pM_model = G2pM()
             self.g2pW_model = G2PWOnnxConverter(
@@ -99,8 +119,10 @@ class Frontend():
                 with_tone=True, with_erhua=False)
         else:
-            self.__init__pypinyin()
-            self.must_erhua = {"小院儿", "胡同儿", "范儿", "老汉儿", "撒欢儿", "寻老礼儿", "妥妥儿"}
+            self._init_pypinyin()
+        self.must_erhua = {
+            "小院儿", "胡同儿", "范儿", "老汉儿", "撒欢儿", "寻老礼儿", "妥妥儿", "媳妇儿"
+        }
         self.not_erhua = {
             "虐儿", "为儿", "护儿", "瞒儿", "救儿", "替儿", "有儿", "一儿", "我儿", "俺儿", "妻儿",
             "拐儿", "聋儿", "乞儿", "患儿", "幼儿", "孤儿", "婴儿", "婴幼儿", "连体儿", "脑瘫儿",
@@ -108,6 +130,7 @@ class Frontend():
             "孙儿", "侄孙儿", "女儿", "男儿", "红孩儿", "花儿", "虫儿", "马儿", "鸟儿", "猪儿", "猫儿",
             "狗儿"
         }
+
         self.vocab_phones = {}
         self.vocab_tones = {}
         if phone_vocab_path:
@@ -121,20 +144,9 @@ class Frontend():
             for tone, id in tone_id:
                 self.vocab_tones[tone] = int(id)
 
-    def __init__pypinyin(self):
+    def _init_pypinyin(self):
         large_pinyin.load()
-        load_phrases_dict({u'开户行': [[u'ka1i'], [u'hu4'], [u'hang2']]})
-        load_phrases_dict({u'发卡行': [[u'fa4'], [u'ka3'], [u'hang2']]})
-        load_phrases_dict({u'放款行': [[u'fa4ng'], [u'kua3n'], [u'hang2']]})
-        load_phrases_dict({u'茧行': [[u'jia3n'], [u'hang2']]})
-        load_phrases_dict({u'行号': [[u'hang2'], [u'ha4o']]})
-        load_phrases_dict({u'各地': [[u'ge4'], [u'di4']]})
-        load_phrases_dict({u'借还款': [[u'jie4'], [u'hua2n'], [u'kua3n']]})
-        load_phrases_dict({u'时间为': [[u'shi2'], [u'jia1n'], [u'we2i']]})
-        load_phrases_dict({u'为准': [[u'we2i'], [u'zhu3n']]})
-        load_phrases_dict({u'色差': [[u'se4'], [u'cha1']]})
+        load_phrases_dict(self.phrases_dict)
         # 调整字的拼音顺序
         load_single_dict({ord(u'地'): u'de,di4'})
@@ -258,7 +270,6 @@ class Frontend():
                     phones.append('sp')
                 if v and v not in self.punc:
                     phones.append(v)
-
             phones_list.append(phones)
         if merge_sentences:
             merge_list = sum(phones_list, [])
@@ -275,6 +286,10 @@ class Frontend():
                    finals: List[str],
                    word: str,
                    pos: str) -> List[List[str]]:
+        # fix er1
+        for i, phn in enumerate(finals):
+            if i == len(finals) - 1 and word[i] == "儿" and phn == 'er1':
+                finals[i] = 'er2'
         if word not in self.must_erhua and (word in self.not_erhua or
                                             pos in {"a", "j", "nr"}):
             return initials, finals
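Taken together, the commit versions the G2PWModel download, filters the polyphonic/monophonic character sets, makes the g2pW path initialize the pypinyin phrase table, tightens the neutral-tone and "一" sandhi rules, and extends the erhua lists. A hedged end-to-end sketch of exercising the fixed frontend; it assumes paddlespeech is installed, the G2PWModel archive can be fetched, and that `Frontend.get_phonemes` is the entry point used elsewhere in the repository:

```python
# Minimal usage sketch (assumptions noted above; not part of this commit).
from paddlespeech.t2s.frontend.zh_frontend import Frontend

# g2p_model="g2pW" now also calls _init_pypinyin(), so pypinyin backs up non-polyphonic characters.
frontend = Frontend(g2p_model="g2pW")

# Sentences touching the fixed cases: "一" before a neutral-tone syllable, the polyphonic word
# 朝鲜 added to polyphonic.yaml, and 媳妇儿 which was added to must_erhua.
for text in ["买一个苹果", "朝鲜冷面", "他的媳妇儿很能干"]:
    print(text, frontend.get_phonemes(text))
```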