more comment on tts frontend

42f2186d · Hui Zhang · 8aa9790c · 42f2186d · 42f2186d · 42f2186d
9 changed file
--- a/paddlespeech/t2s/assets/__init__.py
+++ b/paddlespeech/t2s/assets/__init__.py
--- a/paddlespeech/t2s/exps/syn_utils.py
+++ b/paddlespeech/t2s/exps/syn_utils.py
@@ -99,14 +99,23 @@ def norm(data, mean, std):
    return (data - mean) / std
-def get_chunks(data, block_size: int, pad_size: int):
+def get_chunks(mel, chunk_size: int, pad_size: int):
-    data_len = data.shape[1]
+    """
+    Split mel by chunk size with left and right context.
+
+    Args:
+        mel (paddle.Tensor): mel spectrogram, shape (B, T, D)
+        chunk_size (int): number of frames per chunk along axis 1; must be
+            non-zero, because T is divided by it.
+        pad_size (int): number of extra context frames taken on the left and
+            on the right of every chunk; the window is clipped to [0, T].
+    Returns:
+        list: ceil(T / chunk_size) slices ``mel[:, start:end, :]`` in order.
+            When pad_size > 0, neighbouring chunks overlap by the context
+            frames.
+    """
+    T = mel.shape[1]
+    n = math.ceil(T / chunk_size)
    chunks = []
-    n = math.ceil(data_len / block_size)
    for i in range(n):
-        start = max(0, i * block_size - pad_size)
+        start = max(0, i * chunk_size - pad_size)
-        end = min((i + 1) * block_size + pad_size, data_len)
+        end = min((i + 1) * chunk_size + pad_size, T)
-        chunks.append(data[:, start:end, :])
+        chunks.append(mel[:, start:end, :])
    return chunks
@@ -117,14 +126,10 @@ def get_sentences(text_file: Optional[os.PathLike], lang: str='zh'):
    with open(text_file, 'rt', encoding='utf-8') as f:
        for line in f:
            if line.strip() != "":
-                items = re.split(r"\s+", line.strip(), 1)
+                items = re.split(r"\s+", line.strip(), maxsplit=1)
+                assert len(items) == 2
                utt_id = items[0]
-                if lang in {'zh', 'canton'}:
+                sentence = items[1]
-                    sentence = "".join(items[1:])
-                elif lang == 'en':
-                    sentence = " ".join(items[1:])
-                elif lang == 'mix':
-                    sentence = " ".join(items[1:])
            sentences.append((utt_id, sentence))
    return sentences
@@ -319,6 +324,7 @@ def run_frontend(
        input_ids = {}
        if text.strip() != "" and re.match(r".*?<speak>.*?</speak>.*", text,
                                           re.DOTALL):
+            # using ssml
            input_ids = frontend.get_input_ids_ssml(
                text,
                merge_sentences=merge_sentences,
@@ -359,6 +365,7 @@ def run_frontend(
        outs.update({'is_slurs': is_slurs})
    else:
        print("lang should in {'zh', 'en', 'mix', 'canton', 'sing'}!")
    outs.update({'phone_ids': phone_ids})
    return outs

--- a/paddlespeech/t2s/exps/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/synthesize_e2e.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 import argparse
 from pathlib import Path
+from pprint import pprint
 import paddle
 import soundfile as sf
@@ -78,6 +79,7 @@ def evaluate(args):
    # whether dygraph to static
    if args.inference_dir:
+        print("convert am and voc to static model.")
        # acoustic model
        am_inference = am_to_static(
            am_inference=am_inference,
@@ -92,6 +94,7 @@ def evaluate(args):
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    merge_sentences = False
    # Avoid not stopping at the end of a sub sentence when tacotron2_ljspeech dygraph to static graph
    # but still not stopping in the end (NOTE by yuantian01 Feb 9 2022)
@@ -102,12 +105,18 @@ def evaluate(args):
    if am_name == 'speedyspeech':
        get_tone_ids = True
+    # wav samples
    N = 0
+    # inference time cost
    T = 0
+    # [(uid, text), ]
    if am_name == 'diffsinger':
        sentences = get_sentences_svs(text_file=args.text)
    else:
        sentences = get_sentences(text_file=args.text, lang=args.lang)
+    pprint(f"inputs: {sentences}")
    for utt_id, sentence in sentences:
        with timer() as t:
            if am_name == "diffsinger":
@@ -116,6 +125,8 @@ def evaluate(args):
            else:
                text = sentence
                svs_input = None
+            # frontend
            frontend_dict = run_frontend(
                frontend=frontend,
                text=text,
@@ -124,25 +135,33 @@ def evaluate(args):
                lang=args.lang,
                svs_input=svs_input)
            phone_ids = frontend_dict['phone_ids']
+            # pprint(f"process: {utt_id} {phone_ids}")
            with paddle.no_grad():
                flags = 0
                for i in range(len(phone_ids)):
+                    # sub phone, split by `sp` or punctuation.
                    part_phone_ids = phone_ids[i]
                    # acoustic model
                    if am_name == 'fastspeech2':
                        # multi speaker
                        if am_dataset in {"aishell3", "vctk", "mix", "canton"}:
+                            # multi-speaker
                            spk_id = paddle.to_tensor(args.spk_id)
                            mel = am_inference(part_phone_ids, spk_id)
                        else:
+                            # single-speaker
                            mel = am_inference(part_phone_ids)
                    elif am_name == 'speedyspeech':
                        part_tone_ids = frontend_dict['tone_ids'][i]
                        if am_dataset in {"aishell3", "vctk", "mix"}:
+                            # multi-speaker
                            spk_id = paddle.to_tensor(args.spk_id)
                            mel = am_inference(part_phone_ids, part_tone_ids,
                                               spk_id)
                        else:
+                            # single-speaker
                            mel = am_inference(part_phone_ids, part_tone_ids)
                    elif am_name == 'tacotron2':
                        mel = am_inference(part_phone_ids)
@@ -155,6 +174,7 @@ def evaluate(args):
                            note=part_note_ids,
                            note_dur=part_note_durs,
                            is_slur=part_is_slurs, )
                    # vocoder
                    wav = voc_inference(mel)
                    if flags == 0:
@@ -162,17 +182,23 @@ def evaluate(args):
                        flags = 1
                    else:
                        wav_all = paddle.concat([wav_all, wav])
        wav = wav_all.numpy()
        N += wav.size
        T += t.elapse
+        # samples per second
        speed = wav.size / t.elapse
+        # generate one second wav need `RTF` seconds
        rtf = am_config.fs / speed
        print(
            f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
        )
        sf.write(
            str(output_dir / (utt_id + ".wav")), wav, samplerate=am_config.fs)
        print(f"{utt_id} done!")
    print(f"generation speed: {N / T}Hz, RTF: {am_config.fs / (N / T) }")

--- a/paddlespeech/t2s/frontend/arpabet.py
+++ b/paddlespeech/t2s/frontend/arpabet.py
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from paddlespeech.t2s.frontend.phonectic import Phonetics
 """
 A phonology system with ARPABET symbols and limited punctuations. The G2P 
 conversion is done by g2p_en.
@@ -19,55 +18,68 @@ conversion is done by g2p_en.
 Note that g2p_en does not handle words with hypen well. So make sure the input
 sentence is first normalized.
 """
-from paddlespeech.t2s.frontend.vocab import Vocab
 from g2p_en import G2p
+from paddlespeech.t2s.frontend.phonectic import Phonetics
+from paddlespeech.t2s.frontend.vocab import Vocab
 class ARPABET(Phonetics):
-    """A phonology for English that uses ARPABET as the phoneme vocabulary.
+    """A phonology for English that uses ARPABET without stress as the phoneme vocabulary.
+    47 symbols = 39 phones + 4 punctuations + 4 special tokens(<pad> <unk> <s> </s>)
+    The phoneme set contains 39 phonemes. In the g2p output, vowels carry a
+    lexical stress marker, which this class removes (see `_stress_to_no_stress_`):
+        0    — No stress
+        1    — Primary stress
+        2    — Secondary stress
+    Phoneme Set:
+        Phoneme Example Translation
+            ------- ------- -----------
+            AA	odd     AA D
+            AE	at	AE T
+            AH	hut	HH AH T
+            AO	ought	AO T
+            AW	cow	K AW
+            AY	hide	HH AY D
+            B 	be	B IY
+            CH	cheese	CH IY Z
+            D 	dee	D IY
+            DH	thee	DH IY
+            EH	Ed	EH D
+            ER	hurt	HH ER T
+            EY	ate	EY T
+            F 	fee	F IY
+            G 	green	G R IY N
+            HH	he	HH IY
+            IH	it	IH T
+            IY	eat	IY T
+            JH	gee	JH IY
+            K 	key	K IY
+            L 	lee	L IY
+            M 	me	M IY
+            N 	knee	N IY
+            NG	ping	P IH NG
+            OW	oat	OW T
+            OY	toy	T OY
+            P 	pee	P IY
+            R 	read	R IY D
+            S 	sea	S IY
+            SH	she	SH IY
+            T 	tea	T IY
+            TH	theta	TH EY T AH
+            UH	hood	HH UH D
+            UW	two	T UW
+            V 	vee	V IY
+            W 	we	W IY
+            Y 	yield	Y IY L D
+            Z 	zee	Z IY
+            ZH	seizure	S IY ZH ER
    See http://www.speech.cs.cmu.edu/cgi-bin/cmudict for more details.
-    Phoneme Example Translation
-        ------- ------- -----------
-        AA	odd     AA D
-        AE	at	AE T
-        AH	hut	HH AH T
-        AO	ought	AO T
-        AW	cow	K AW
-        AY	hide	HH AY D
-        B 	be	B IY
-        CH	cheese	CH IY Z
-        D 	dee	D IY
-        DH	thee	DH IY
-        EH	Ed	EH D
-        ER	hurt	HH ER T
-        EY	ate	EY T
-        F 	fee	F IY
-        G 	green	G R IY N
-        HH	he	HH IY
-        IH	it	IH T
-        IY	eat	IY T
-        JH	gee	JH IY
-        K 	key	K IY
-        L 	lee	L IY
-        M 	me	M IY
-        N 	knee	N IY
-        NG	ping	P IH NG
-        OW	oat	OW T
-        OY	toy	T OY
-        P 	pee	P IY
-        R 	read	R IY D
-        S 	sea	S IY
-        SH	she	SH IY
-        T 	tea	T IY
-        TH	theta	TH EY T AH
-        UH	hood	HH UH D
-        UW	two	T UW
-        V 	vee	V IY
-        W 	we	W IY
-        Y 	yield	Y IY L D
-        Z 	zee	Z IY
-        ZH	seizure	S IY ZH ER
    """
+    # 39 phonemes
    phonemes = [
        'AA', 'AE', 'AH', 'AO', 'AW', 'AY', 'B', 'CH', 'D', 'DH', 'EH', 'ER',
        'EY', 'F', 'G', 'HH', 'IH', 'IY', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW',
@@ -76,6 +88,8 @@ class ARPABET(Phonetics):
    ]
    punctuations = [',', '.', '?', '!']
    symbols = phonemes + punctuations
+    # Vowels carry a lexical stress marker:
+    # 0 unstressed, 1 primary stress, and 2 secondary stress.
    _stress_to_no_stress_ = {
        'AA0': 'AA',
        'AA1': 'AA',
@@ -124,7 +138,12 @@ class ARPABET(Phonetics):
        'UW2': 'UW'
    }
+    def __repr__(self):
+        """Return a short summary: the phoneme count and the punctuation list."""
+        fmt = "ARPABETWithoutStress(phonemes: {}, punctuations: {})"
+        # Class attributes are not visible as bare names inside a method body,
+        # so they have to be reached through `self` (a bare `phonemes` would
+        # raise NameError when repr() is called).
+        return fmt.format(len(self.phonemes), self.punctuations)
    def __init__(self):
+        # https://github.com/Kyubyong/g2p/blob/master/g2p_en/g2p.py
        self.backend = G2p()
        self.vocab = Vocab(self.phonemes + self.punctuations)
@@ -139,6 +158,7 @@ class ARPABET(Phonetics):
        Returns:
            List[str]: The list of pronunciation sequence.
        """
+        # g2p and remove vowel stress
        phonemes = [
            self._remove_vowels(item) for item in self.backend(sentence)
        ]
@@ -158,6 +178,7 @@ class ARPABET(Phonetics):
        Returns:
            List[int]: The list of pronunciation id sequence.
        """
+        # phonemes to ids
        ids = [self.vocab.lookup(item) for item in phonemes]
        return ids
@@ -189,11 +210,16 @@ class ARPABET(Phonetics):
    def vocab_size(self):
        """ Vocab size.
        """
-        # 47 = 39 phones + 4 punctuations + 4 special tokens
+        # 47 = 39 phones + 4 punctuations + 4 special tokens(<pad> <unk> <s> </s>)
        return len(self.vocab)
 class ARPABETWithStress(Phonetics):
+    """
+    A phonology for English that uses ARPABET with stress as the phoneme vocabulary.
+    77 symbols = 69 phones + 4 punctuations + 4 special tokens
+    """
    phonemes = [
        'AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0',
        'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH', 'D',
@@ -206,6 +232,10 @@ class ARPABETWithStress(Phonetics):
    punctuations = [',', '.', '?', '!']
    symbols = phonemes + punctuations
+    def __repr__(self):
+        """Return a short summary: the phoneme count and the punctuation list."""
+        fmt = "ARPABETWithStress(phonemes: {}, punctuations: {})"
+        # Class attributes are not visible as bare names inside a method body,
+        # so they have to be reached through `self` (a bare `phonemes` would
+        # raise NameError when repr() is called).
+        return fmt.format(len(self.phonemes), self.punctuations)
    def __init__(self):
        self.backend = G2p()
        self.vocab = Vocab(self.phonemes + self.punctuations)

--- a/paddlespeech/t2s/frontend/polyphonic.yaml
+++ b/paddlespeech/t2s/frontend/polyphonic.yaml
@@ -47,4 +47,5 @@ polyphonic:
    恶行: ['e4','xing2']
    唉: ['ai4']
    扎实: ['zha1','shi2']
    干将: ['gan4','jiang4']
\ No newline at end of file
+    陈威行: ['chen2', 'wei1', 'hang2']
\ No newline at end of file
--- a/paddlespeech/t2s/ssml/__init__.py
+++ b/paddlespeech/t2s/ssml/__init__.py
--- a/paddlespeech/t2s/ssml/xml_processor.py
+++ b/paddlespeech/t2s/ssml/xml_processor.py
@@ -90,13 +90,14 @@ class MixTextProcessor():
            dom = DomXml(in_xml)
            tags = dom.get_text_and_sayas_tags()
            ctlist.extend(tags)
            ctlist.append(after_xml)
            return ctlist
        else:
            ctlist.append(mixstr)
        return ctlist
 class DomXml():
    def __init__(self, xmlstr):
        self.tdom = parseString(xmlstr)  #Document

--- a/paddlespeech/t2s/frontend/tone_sandhi.py
+++ b/paddlespeech/t2s/frontend/tone_sandhi.py
@@ -20,6 +20,9 @@ from pypinyin import Style
 class ToneSandhi():
+    def __repr__(self):
+        """Return the constant label ``MandarinToneSandhi``."""
+        return "MandarinToneSandhi"
    def __init__(self):
        self.must_neural_tone_words = {
            '麻烦', '麻利', '鸳鸯', '高粱', '骨头', '骆驼', '马虎', '首饰', '馒头', '馄饨', '风筝',
@@ -69,6 +72,19 @@ class ToneSandhi():
        }
        self.punc = "：，；。？！“”‘’':,;.?!"
+    def _split_word(self, word: str) -> List[str]:
+        """Split ``word`` into two parts around its shortest jieba sub-word.
+
+        The sub-words come from ``jieba.cut_for_search`` and are sorted by
+        ascending length; the first (shortest) one is used as the pivot.
+
+        Args:
+            word: the word to split.
+        Returns:
+            ``[pivot, rest]`` when ``word`` starts with the pivot, otherwise
+            ``[rest, pivot]`` where ``rest`` is ``word`` without its last
+            ``len(pivot)`` characters.
+        """
+        word_list = jieba.cut_for_search(word)
+        word_list = sorted(word_list, key=lambda i: len(i), reverse=False)
+        # NOTE(review): indexing [0] raises IndexError if jieba yields no
+        # sub-words (presumably for an empty ``word``) — confirm callers
+        # never pass one.
+        first_subword = word_list[0]
+        first_begin_idx = word.find(first_subword)
+        if first_begin_idx == 0:
+            second_subword = word[len(first_subword):]
+            new_word_list = [first_subword, second_subword]
+        else:
+            # NOTE(review): this branch assumes the pivot is a suffix of
+            # ``word``; a pivot in the middle would be cut incorrectly —
+            # TODO confirm.
+            second_subword = word[:-len(first_subword)]
+            new_word_list = [second_subword, first_subword]
+        return new_word_list
    # the meaning of jieba pos tag: https://blog.csdn.net/weixin_44174352/article/details/113731041
    # e.g.
    # word: "家里"
@@ -154,18 +170,8 @@ class ToneSandhi():
                            finals[i] = finals[i][:-1] + "4"
        return finals
-    def _split_word(self, word: str) -> List[str]:
+    def _all_tone_three(self, finals: List[str]) -> bool:
+        """Return True when every final ends with "3" (also True for an empty list)."""
-        word_list = jieba.cut_for_search(word)
+        return all(x[-1] == "3" for x in finals)
-        word_list = sorted(word_list, key=lambda i: len(i), reverse=False)
-        first_subword = word_list[0]
-        first_begin_idx = word.find(first_subword)
-        if first_begin_idx == 0:
-            second_subword = word[len(first_subword):]
-            new_word_list = [first_subword, second_subword]
-        else:
-            second_subword = word[:-len(first_subword)]
-            new_word_list = [second_subword, first_subword]
-        return new_word_list
    def _three_sandhi(self, word: str, finals: List[str]) -> List[str]:
@@ -207,9 +213,6 @@ class ToneSandhi():
        return finals
-    def _all_tone_three(self, finals: List[str]) -> bool:
-        return all(x[-1] == "3" for x in finals)
    # merge "不" and the word behind it
    # if don't merge, "不" sometimes appears alone according to jieba, which may occur sandhi error
    def _merge_bu(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
@@ -336,6 +339,9 @@ class ToneSandhi():
    def pre_merge_for_modify(
            self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
+        """
+            seg: [(word, pos), ...]
+        """
        seg = self._merge_bu(seg)
        seg = self._merge_yi(seg)
        seg = self._merge_reduplication(seg)
@@ -346,7 +352,11 @@ class ToneSandhi():
    def modified_tone(self, word: str, pos: str,
                      finals: List[str]) -> List[str]:
+        """
+            word: the segmented word
+            pos: part-of-speech tag of the word
+            finals: finals with tone marks, [final1, ..., finaln]
+        """
        finals = self._bu_sandhi(word, finals)
        finals = self._yi_sandhi(word, finals)
        finals = self._neural_sandhi(word, pos, finals)

--- a/paddlespeech/t2s/frontend/zh_frontend.py
+++ b/paddlespeech/t2s/frontend/zh_frontend.py