fix long text oom using ssml; filter comma; update polyphonic

d53c4994 · Hui Zhang · 108e73e1 · d53c4994 · d53c4994 · d53c4994
14 changed file
--- a/paddlespeech/t2s/assets/__init__.py
+++ b/paddlespeech/t2s/assets/__init__.py
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/paddlespeech/t2s/exps/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/synthesize_e2e.py
@@ -117,7 +117,7 @@ def evaluate(args):
        sentences = get_sentences(text_file=args.text, lang=args.lang)

    for utt_id, sentence in sentences:
-        print(f"{utt_id} {sentence} ...")
+        print(f"{utt_id} {sentence}")
        with timer() as t:
            if am_name == "diffsinger":
                text = ""
@@ -135,7 +135,7 @@ def evaluate(args):
                lang=args.lang,
                svs_input=svs_input)
            phone_ids = frontend_dict['phone_ids']
-            # pprint(f"process: {utt_id} {phone_ids}")
+            # pprint(f"{utt_id} {phone_ids}")

            with paddle.no_grad():
                flags = 0

--- a/paddlespeech/t2s/frontend/canton_frontend.py
+++ b/paddlespeech/t2s/frontend/canton_frontend.py
@@ -48,7 +48,7 @@ def jyuping_to_phonemes(cantons: List[str]):
 class CantonFrontend():
    def __init__(self, phone_vocab_path: str):
        self.text_normalizer = TextNormalizer()
-        self.punc = "：，；。？！“”‘’':,;.?!"
+        self.punc = "、：，；。？！“”‘’':,;.?!"

        self.vocab_phones = {}
        if phone_vocab_path:

--- a/paddlespeech/t2s/frontend/en_frontend.py
+++ b/paddlespeech/t2s/frontend/en_frontend.py
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 from .phonectic import English
--- a/paddlespeech/t2s/frontend/mix_frontend.py
+++ b/paddlespeech/t2s/frontend/mix_frontend.py
@@ -106,76 +106,95 @@ class MixFrontend():
                      get_tone_ids: bool=False,
                      add_sp: bool=True,
                      to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]:
-        ''' 1. 添加SSML支持，先列出 文字 和 <say-as>标签内容，
-                然后添加到tmpSegments数组里
-        '''
-        d_inputs = MixTextProcessor.get_dom_split(sentence)
-        tmpSegments = []
-        for instr in d_inputs:
-            ''' 暂时只支持 say-as '''
-            if instr.lower().startswith("<say-as"):
-                tmpSegments.append((instr, "zh"))
+        # XML Document Object Model (DOM)
+        doms = MixTextProcessor.get_dom_split(sentence)
+
+        lang_splits = []
+        for dom in doms:
+            if dom.lower().startswith("<say-as pinyin="):
+                # `<say-as pinyin=` for zh lang
+                lang_splits.append((dom, "zh"))
            else:
-                tmpSegments.extend(self.split_by_lang(instr))
-        ''' 2. 把zh的merge到一起，避免合成结果中间停顿
-        '''
+                # process zh, en and zh/en
+                lang_splits.extend(self.split_by_lang(dom))
+
+        # merge adjacent zh segments so they are synthesized as one unit
        segments = []
        currentSeg = ["", ""]
-        for seg in tmpSegments:
+        for seg in lang_splits:
            if seg[1] == "en" or seg[1] == "other":
                if currentSeg[0] == '':
+                    # no zh text is buffered: emit this segment directly
                    segments.append(seg)
                else:
+                    # zh
                    currentSeg[0] = "<speak>" + currentSeg[0] + "</speak>"
                    segments.append(tuple(currentSeg))
+                    # en
                    segments.append(seg)
+                    # reset
                    currentSeg = ["", ""]
            else:
+                # zh
                if currentSeg[0] == '':
+                    # nothing buffered yet: start a new zh buffer
                    currentSeg[0] = seg[0]
                    currentSeg[1] = seg[1]
                else:
+                    # append to the buffered zh text
                    currentSeg[0] = currentSeg[0] + seg[0]
+
        if currentSeg[0] != '':
+            # last zh
            currentSeg[0] = "<speak>" + currentSeg[0] + "</speak>"
            segments.append(tuple(currentSeg))

        phones_list = []
        result = {}

+        # 008 我们要去云南 team building, 非常非常 happy.
+        # seg ('我们要去云南 ', 'zh')
+        # seg ('team building, ', 'en')
+        # seg ('非常非常 ', 'zh')
+        # seg ('happy.', 'en')
+        # [('<speak>我们要去云南 </speak>', 'zh'), ('team building, ', 'en'), ('<speak>非常非常 </speak>', 'zh'), ('happy.', 'en')]
        for seg in segments:
            content = seg[0]
            lang = seg[1]
-            if content != '':
-                if lang == "en":
-                    input_ids = self.en_frontend.get_input_ids(
-                        content, merge_sentences=False, to_tensor=to_tensor)
+
+            if not content:
+                continue
+
+            if lang == "en":
+                input_ids = self.en_frontend.get_input_ids(
+                    content, merge_sentences=False, to_tensor=to_tensor)
+            else:
+                if content.strip() != "" and \
+                    re.match(r".*?<speak>.*?</speak>.*", content, re.DOTALL):
+                    # process ssml
+                    input_ids = self.zh_frontend.get_input_ids_ssml(
+                        content,
+                        merge_sentences=False,
+                        get_tone_ids=get_tone_ids,
+                        to_tensor=to_tensor)
                else:
-                    ''' 3. 把带speak tag的中文和普通文字分开处理
-                    '''
-                    if content.strip() != "" and \
-                        re.match(r".*?<speak>.*?</speak>.*", content, re.DOTALL):
-                        input_ids = self.zh_frontend.get_input_ids_ssml(
-                            content,
-                            merge_sentences=False,
-                            get_tone_ids=get_tone_ids,
-                            to_tensor=to_tensor)
-                    else:
-                        input_ids = self.zh_frontend.get_input_ids(
-                            content,
-                            merge_sentences=False,
-                            get_tone_ids=get_tone_ids,
-                            to_tensor=to_tensor)
-                if add_sp:
-                    if to_tensor:
-                        input_ids["phone_ids"][-1] = paddle.concat(
-                            [input_ids["phone_ids"][-1], self.sp_id_tensor])
-                    else:
-                        input_ids["phone_ids"][-1] = np.concatenate(
-                            (input_ids["phone_ids"][-1], self.sp_id_numpy))
+                    # process plain text
+                    input_ids = self.zh_frontend.get_input_ids(
+                        content,
+                        merge_sentences=False,
+                        get_tone_ids=get_tone_ids,
+                        to_tensor=to_tensor)
+
+            if add_sp:
+                # add sp between zh and en
+                if to_tensor:
+                    input_ids["phone_ids"][-1] = paddle.concat(
+                        [input_ids["phone_ids"][-1], self.sp_id_tensor])
+                else:
+                    input_ids["phone_ids"][-1] = np.concatenate(
+                        (input_ids["phone_ids"][-1], self.sp_id_numpy))

-                for phones in input_ids["phone_ids"]:
-                    phones_list.append(phones)
+            phones_list.extend(input_ids["phone_ids"])

        if merge_sentences:
            merge_list = paddle.concat(phones_list)

--- a/paddlespeech/t2s/frontend/phonectic.py
+++ b/paddlespeech/t2s/frontend/phonectic.py
@@ -55,7 +55,7 @@ class English(Phonetics):
        self.punctuations = get_punctuations("en")
        self.vocab = Vocab(self.phonemes + self.punctuations)
        self.vocab_phones = {}
-        self.punc = "：，；。？！“”‘’':,;.?!"
+        self.punc = "、：，；。？！“”‘’':,;.?!"
        self.text_normalizer = TextNormalizer()
        if phone_vocab_path:
            with open(phone_vocab_path, 'rt', encoding='utf-8') as f:

--- a/paddlespeech/t2s/frontend/polyphonic.py
+++ b/paddlespeech/t2s/frontend/polyphonic.py
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+
+import yaml
+
+
+class Polyphonic():
+    def __init__(self):
+        with open(
+                os.path.join(
+                    os.path.dirname(os.path.abspath(__file__)),
+                    'polyphonic.yaml'),
+                'r',
+                encoding='utf-8') as polyphonic_file:
+            # 解析yaml
+            polyphonic_dict = yaml.load(polyphonic_file, Loader=yaml.FullLoader)
+        self.polyphonic_words = polyphonic_dict["polyphonic"]
+
+    def correct_pronunciation(self, word, pinyin):
+        # 词汇被词典收录则返回纠正后的读音
+        print(word, pinyin)
+        if word in self.polyphonic_words.keys():
+            pinyin = self.polyphonic_words[word]
+        print('new', pinyin)
+        # 否则返回原读音
+        return pinyin
--- a/paddlespeech/t2s/frontend/polyphonic.yaml
+++ b/paddlespeech/t2s/frontend/polyphonic.yaml
@@ -48,4 +48,7 @@ polyphonic:
    唉: ['ai4']
    扎实: ['zha1','shi2']
    干将: ['gan4','jiang4']
-    陈威行: ['chen2', 'wei1', 'hang2']
\ No newline at end of file
+    陈威行: ['chen2', 'wei1', 'hang2']
+    郭晟: ['guo1', 'sheng4']
+    中标: ['zhong4', 'biao1']
+    抗住: ['kang2', 'zhu4']
\ No newline at end of file
--- a/paddlespeech/t2s/frontend/sing_frontend.py
+++ b/paddlespeech/t2s/frontend/sing_frontend.py
@@ -29,7 +29,7 @@ class SingFrontend():
            pinyin_phone_path (str): pinyin to phone file path, a 'pinyin|phones' (like: ba|b a ) pair per line.
            phone_vocab_path (str): phone to phone id file path, a 'phone phone id' (like: a 4 ) pair per line.
        """
-        self.punc = '[：，；。？！“”‘’\':,;.?!]'
+        self.punc = '[、：，；。？！“”‘’\':,;.?!]'

        self.pinyin_phones = {'AP': 'AP', 'SP': 'SP'}
        if pinyin_phone_path:

--- a/paddlespeech/t2s/frontend/ssml/__init__.py
+++ b/paddlespeech/t2s/frontend/ssml/__init__.py
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

--- a/paddlespeech/t2s/frontend/ssml/xml_processor.py
+++ b/paddlespeech/t2s/frontend/ssml/xml_processor.py
 # -*- coding: utf-8 -*-
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import re
 import xml.dom.minidom
 import xml.parsers.expat
@@ -68,7 +81,8 @@ class MixTextProcessor():
            after_xml = mat.group(3)

            # pre with none syllable
-            ctlist.append([pre_xml, []])
+            if pre_xml:
+                ctlist.append([pre_xml, []])

            # between with syllable
            # [(sub sentence, [syllables]), ...]
@@ -77,9 +91,11 @@ class MixTextProcessor():
            ctlist = ctlist + pinyinlist

            # post with none syllable
-            ctlist.append([after_xml, []])
+            if after_xml:
+                ctlist.append([after_xml, []])
        else:
            ctlist.append([mixstr, []])
+
        return ctlist

    @classmethod
@@ -94,15 +110,18 @@ class MixTextProcessor():
            in_xml = mat.group(2)
            after_xml = mat.group(3)

-            ctlist.append(pre_xml)
+            if pre_xml:
+                ctlist.append(pre_xml)
+
            dom = DomXml(in_xml)
            tags = dom.get_text_and_sayas_tags()
            ctlist.extend(tags)

-            ctlist.append(after_xml)
-            return ctlist
+            if after_xml:
+                ctlist.append(after_xml)
        else:
            ctlist.append(mixstr)
+
        return ctlist



--- a/paddlespeech/t2s/frontend/tone_sandhi.py
+++ b/paddlespeech/t2s/frontend/tone_sandhi.py
@@ -68,9 +68,9 @@ class ToneSandhi():
            '男子', '女子', '分子', '原子', '量子', '莲子', '石子', '瓜子', '电子', '人人', '虎虎',
            '幺幺', '干嘛', '学子', '哈哈', '数数', '袅袅', '局地', '以下', '娃哈哈', '花花草草', '留得',
            '耕地', '想想', '熙熙', '攘攘', '卵子', '死死', '冉冉', '恳恳', '佼佼', '吵吵', '打打',
-            '考考', '整整', '莘莘', '落地', '算子', '家家户户'
+            '考考', '整整', '莘莘', '落地', '算子', '家家户户', '青青'
        }
-        self.punc = "：，；。？！“”‘’':,;.?!"
+        self.punc = "、：，；。？！“”‘’':,;.?!"

    def _split_word(self, word: str) -> List[str]:
        word_list = jieba.cut_for_search(word)

--- a/paddlespeech/t2s/frontend/zh_frontend.py
+++ b/paddlespeech/t2s/frontend/zh_frontend.py
@@ -31,6 +31,7 @@ from pypinyin_dict.phrase_pinyin_data import large_pinyin

 from paddlespeech.t2s.frontend.g2pw import G2PWOnnxConverter
 from paddlespeech.t2s.frontend.generate_lexicon import generate_lexicon
+from paddlespeech.t2s.frontend.polyphonic import Polyphonic
 from paddlespeech.t2s.frontend.rhy_prediction.rhy_predictor import RhyPredictor
 from paddlespeech.t2s.frontend.ssml.xml_processor import MixTextProcessor
 from paddlespeech.t2s.frontend.tone_sandhi import ToneSandhi
@@ -68,26 +69,6 @@ def insert_after_character(lst, item):
    return result


-class Polyphonic():
-    def __init__(self):
-        with open(
-                os.path.join(
-                    os.path.dirname(os.path.abspath(__file__)),
-                    'polyphonic.yaml'),
-                'r',
-                encoding='utf-8') as polyphonic_file:
-            # 解析yaml
-            polyphonic_dict = yaml.load(polyphonic_file, Loader=yaml.FullLoader)
-        self.polyphonic_words = polyphonic_dict["polyphonic"]
-
-    def correct_pronunciation(self, word, pinyin):
-        # 词汇被词典收录则返回纠正后的读音
-        if word in self.polyphonic_words.keys():
-            pinyin = self.polyphonic_words[word]
-        # 否则返回原读音
-        return pinyin
-
-
 class Frontend():
    def __init__(self,
                 g2p_model="g2pW",
@@ -95,7 +76,7 @@ class Frontend():
                 tone_vocab_path=None,
                 use_rhy=False):

-        self.punc = "：，；。？！“”‘’':,;.?!"
+        self.punc = "、：，；。？！“”‘’':,;.?!"
        self.rhy_phns = ['sp1', 'sp2', 'sp3', 'sp4']
        self.phrases_dict = {
            '开户行': [['ka1i'], ['hu4'], ['hang2']],
@@ -567,6 +548,7 @@ class Frontend():

        phones = []
        for c, v in zip(initials, finals):
+            # c for consonant, v for vowel
            # NOTE: post process for pypinyin outputs
            # we discriminate i, ii and iii
            if c and c not in self.punc:
@@ -633,16 +615,19 @@ class Frontend():
                new_phonemes.append(new_sentence)
            all_phonemes = new_phonemes

+        if merge_sentences:
+            all_phonemes = [sum(all_phonemes, [])]
+
        if print_info:
            print("----------------------------")
            print("text norm results:")
            print(sentences)
            print("----------------------------")
            print("g2p results:")
-            print(all_phonemes[0])
+            print(all_phonemes)
            print("----------------------------")

-        return [sum(all_phonemes, [])]
+        return all_phonemes

    def add_sp_if_no(self, phonemes):
        """

--- a/tests/unit/tts/test_mixfrontend.py
+++ b/tests/unit/tts/test_mixfrontend.py
@@ -423,7 +423,7 @@ if __name__ == '__main__':
        segs = frontend.split_by_lang(text)
        print(segs)

-        # 对于SSML的xml标记处理不好。
+        # 对于SSML的xml标记处理不好。需要先解析SSML，后处理中英的划分。
        text = "<speak>我们的声学模型使用了 Fast Speech Two。前浪<say-as pinyin='dao3'>倒</say-as>在沙滩上,沙滩上倒了一堆<say-as pinyin='tu3'>土</say-as>。 想象<say-as pinyin='gan1 gan1'>干干</say-as>的树干<say-as pinyin='dao3'>倒</say-as>了, 里面有个干尸，不知是被谁<say-as pinyin='gan4'>干</say-as>死的。</speak>"
        print(text)
        # [('<speak>', 'en'), ('我们的声学模型使用了 ', 'zh'), ('Fast Speech Two。', 'en'), ('前浪<', 'zh'), ("say-as pinyin='dao3'>", 'en'), ('倒</', 'zh'), ('say-as>', 'en'), ('在沙滩上,沙滩上倒了一堆<', 'zh'), ("say-as pinyin='tu3'>", 'en'), ('土</', 'zh'), ('say-as>。 ', 'en'), ('想象<', 'zh'), ("say-as pinyin='gan1 gan1'>", 'en'), ('干干</', 'zh'), ('say-as>', 'en'), ('的树干<', 'zh'), ("say-as pinyin='dao3'>", 'en'), ('倒</', 'zh'), ('say-as>', 'en'), ('了, 里面有个干尸，不知是被谁<', 'zh'), ("say-as pinyin='gan4'>", 'en'), ('干</', 'zh'), ('say-as>', 'en'), ('死的。</', 'zh'), ('speak>', 'en')]