From 42f2186d71c047938d0c94c2b7dfc797ee92f5c6 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Wed, 7 Jun 2023 07:47:12 +0000 Subject: [PATCH] more comment on tts frontend --- paddlespeech/t2s/assets/__init__.py | 0 paddlespeech/t2s/exps/syn_utils.py | 33 +- paddlespeech/t2s/exps/synthesize_e2e.py | 26 ++ paddlespeech/t2s/frontend/arpabet.py | 120 +++--- paddlespeech/t2s/frontend/polyphonic.yaml | 3 +- .../t2s/{ => frontend}/ssml/__init__.py | 0 .../t2s/{ => frontend}/ssml/xml_processor.py | 3 +- paddlespeech/t2s/frontend/tone_sandhi.py | 42 +- paddlespeech/t2s/frontend/zh_frontend.py | 382 ++++++++++++------ 9 files changed, 409 insertions(+), 200 deletions(-) create mode 100644 paddlespeech/t2s/assets/__init__.py rename paddlespeech/t2s/{ => frontend}/ssml/__init__.py (100%) rename paddlespeech/t2s/{ => frontend}/ssml/xml_processor.py (99%) diff --git a/paddlespeech/t2s/assets/__init__.py b/paddlespeech/t2s/assets/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/paddlespeech/t2s/exps/syn_utils.py b/paddlespeech/t2s/exps/syn_utils.py index 57c79dee..d15a93bf 100644 --- a/paddlespeech/t2s/exps/syn_utils.py +++ b/paddlespeech/t2s/exps/syn_utils.py @@ -99,14 +99,23 @@ def norm(data, mean, std): return (data - mean) / std -def get_chunks(data, block_size: int, pad_size: int): - data_len = data.shape[1] +def get_chunks(mel, chunk_size: int, pad_size: int): + """ + Split mel by chunk size with left and right context. + + Args: + mel (paddle.Tensor): mel spectrogram, shape (B, T, D) + chunk_size (int): chunk size + pad_size (int): size for left and right context. 
+ """ + T = mel.shape[1] + n = math.ceil(T / chunk_size) + chunks = [] - n = math.ceil(data_len / block_size) for i in range(n): - start = max(0, i * block_size - pad_size) - end = min((i + 1) * block_size + pad_size, data_len) - chunks.append(data[:, start:end, :]) + start = max(0, i * chunk_size - pad_size) + end = min((i + 1) * chunk_size + pad_size, T) + chunks.append(mel[:, start:end, :]) return chunks @@ -117,14 +126,10 @@ def get_sentences(text_file: Optional[os.PathLike], lang: str='zh'): with open(text_file, 'rt', encoding='utf-8') as f: for line in f: if line.strip() != "": - items = re.split(r"\s+", line.strip(), 1) + items = re.split(r"\s+", line.strip(), maxsplit=1) + assert len(items) == 2 utt_id = items[0] - if lang in {'zh', 'canton'}: - sentence = "".join(items[1:]) - elif lang == 'en': - sentence = " ".join(items[1:]) - elif lang == 'mix': - sentence = " ".join(items[1:]) + sentence = items[1] sentences.append((utt_id, sentence)) return sentences @@ -319,6 +324,7 @@ def run_frontend( input_ids = {} if text.strip() != "" and re.match(r".*?.*?.*", text, re.DOTALL): + # using ssml input_ids = frontend.get_input_ids_ssml( text, merge_sentences=merge_sentences, @@ -359,6 +365,7 @@ def run_frontend( outs.update({'is_slurs': is_slurs}) else: print("lang should in {'zh', 'en', 'mix', 'canton', 'sing'}!") + outs.update({'phone_ids': phone_ids}) return outs diff --git a/paddlespeech/t2s/exps/synthesize_e2e.py b/paddlespeech/t2s/exps/synthesize_e2e.py index 0c7b34b0..bff329a4 100644 --- a/paddlespeech/t2s/exps/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/synthesize_e2e.py @@ -13,6 +13,7 @@ # limitations under the License. 
import argparse from pathlib import Path +from pprint import pprint import paddle import soundfile as sf @@ -78,6 +79,7 @@ def evaluate(args): # whether dygraph to static if args.inference_dir: + print("convert am and voc to static model.") # acoustic model am_inference = am_to_static( am_inference=am_inference, @@ -92,6 +94,7 @@ def evaluate(args): output_dir = Path(args.output_dir) output_dir.mkdir(parents=True, exist_ok=True) + merge_sentences = False # Avoid not stopping at the end of a sub sentence when tacotron2_ljspeech dygraph to static graph # but still not stopping in the end (NOTE by yuantian01 Feb 9 2022) @@ -102,12 +105,18 @@ def evaluate(args): if am_name == 'speedyspeech': get_tone_ids = True + # wav samples N = 0 + # inference time cost T = 0 + + # [(uid, text), ] if am_name == 'diffsinger': sentences = get_sentences_svs(text_file=args.text) else: sentences = get_sentences(text_file=args.text, lang=args.lang) + pprint(f"inputs: {sentences}") + for utt_id, sentence in sentences: with timer() as t: if am_name == "diffsinger": @@ -116,6 +125,8 @@ def evaluate(args): else: text = sentence svs_input = None + + # frontend frontend_dict = run_frontend( frontend=frontend, text=text, @@ -124,25 +135,33 @@ def evaluate(args): lang=args.lang, svs_input=svs_input) phone_ids = frontend_dict['phone_ids'] + # pprint(f"process: {utt_id} {phone_ids}") + with paddle.no_grad(): flags = 0 for i in range(len(phone_ids)): + # sub phone, split by `sp` or punctuation. 
part_phone_ids = phone_ids[i] + # acoustic model if am_name == 'fastspeech2': # multi speaker if am_dataset in {"aishell3", "vctk", "mix", "canton"}: + # multi-speaker spk_id = paddle.to_tensor(args.spk_id) mel = am_inference(part_phone_ids, spk_id) else: + # single-speaker mel = am_inference(part_phone_ids) elif am_name == 'speedyspeech': part_tone_ids = frontend_dict['tone_ids'][i] if am_dataset in {"aishell3", "vctk", "mix"}: + # multi-speaker spk_id = paddle.to_tensor(args.spk_id) mel = am_inference(part_phone_ids, part_tone_ids, spk_id) else: + # single-speaker mel = am_inference(part_phone_ids, part_tone_ids) elif am_name == 'tacotron2': mel = am_inference(part_phone_ids) @@ -155,6 +174,7 @@ def evaluate(args): note=part_note_ids, note_dur=part_note_durs, is_slur=part_is_slurs, ) + # vocoder wav = voc_inference(mel) if flags == 0: @@ -162,17 +182,23 @@ def evaluate(args): flags = 1 else: wav_all = paddle.concat([wav_all, wav]) + wav = wav_all.numpy() N += wav.size T += t.elapse + + # samples per second speed = wav.size / t.elapse + # generate one second wav need `RTF` seconds rtf = am_config.fs / speed print( f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}." ) + sf.write( str(output_dir / (utt_id + ".wav")), wav, samplerate=am_config.fs) print(f"{utt_id} done!") + print(f"generation speed: {N / T}Hz, RTF: {am_config.fs / (N / T) }") diff --git a/paddlespeech/t2s/frontend/arpabet.py b/paddlespeech/t2s/frontend/arpabet.py index 7a81b645..9b2b11b3 100644 --- a/paddlespeech/t2s/frontend/arpabet.py +++ b/paddlespeech/t2s/frontend/arpabet.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from paddlespeech.t2s.frontend.phonectic import Phonetics """ A phonology system with ARPABET symbols and limited punctuations. The G2P conversion is done by g2p_en. 
@@ -19,55 +18,68 @@ conversion is done by g2p_en. Note that g2p_en does not handle words with hypen well. So make sure the input sentence is first normalized. """ -from paddlespeech.t2s.frontend.vocab import Vocab from g2p_en import G2p +from paddlespeech.t2s.frontend.phonectic import Phonetics +from paddlespeech.t2s.frontend.vocab import Vocab + class ARPABET(Phonetics): - """A phonology for English that uses ARPABET as the phoneme vocabulary. + """A phonology for English that uses ARPABET without stress as the phoneme vocabulary. + + 47 symbols = 39 phones + 4 punctuations + 4 special tokens( ) + + The current phoneme set contains 39 phonemes, vowels carry a lexical stress marker: + 0 — No stress + 1 — Primary stress + 2 — Secondary stress + + Phoneme Set: + Phoneme Example Translation + ------- ------- ----------- + AA odd AA D + AE at AE T + AH hut HH AH T + AO ought AO T + AW cow K AW + AY hide HH AY D + B be B IY + CH cheese CH IY Z + D dee D IY + DH thee DH IY + EH Ed EH D + ER hurt HH ER T + EY ate EY T + F fee F IY + G green G R IY N + HH he HH IY + IH it IH T + IY eat IY T + JH gee JH IY + K key K IY + L lee L IY + M me M IY + N knee N IY + NG ping P IH NG + OW oat OW T + OY toy T OY + P pee P IY + R read R IY D + S sea S IY + SH she SH IY + T tea T IY + TH theta TH EY T AH + UH hood HH UH D + UW two T UW + V vee V IY + W we W IY + Y yield Y IY L D + Z zee Z IY + ZH seizure S IY ZH ER + See http://www.speech.cs.cmu.edu/cgi-bin/cmudict for more details. 
- Phoneme Example Translation - ------- ------- ----------- - AA odd AA D - AE at AE T - AH hut HH AH T - AO ought AO T - AW cow K AW - AY hide HH AY D - B be B IY - CH cheese CH IY Z - D dee D IY - DH thee DH IY - EH Ed EH D - ER hurt HH ER T - EY ate EY T - F fee F IY - G green G R IY N - HH he HH IY - IH it IH T - IY eat IY T - JH gee JH IY - K key K IY - L lee L IY - M me M IY - N knee N IY - NG ping P IH NG - OW oat OW T - OY toy T OY - P pee P IY - R read R IY D - S sea S IY - SH she SH IY - T tea T IY - TH theta TH EY T AH - UH hood HH UH D - UW two T UW - V vee V IY - W we W IY - Y yield Y IY L D - Z zee Z IY - ZH seizure S IY ZH ER """ + # 39 phonemes phonemes = [ 'AA', 'AE', 'AH', 'AO', 'AW', 'AY', 'B', 'CH', 'D', 'DH', 'EH', 'ER', 'EY', 'F', 'G', 'HH', 'IH', 'IY', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', @@ -76,6 +88,8 @@ class ARPABET(Phonetics): ] punctuations = [',', '.', '?', '!'] symbols = phonemes + punctuations + # vowels carry a lexical stress marker: + # 0 unstressed(无重音), 1 primary stress(主重音)和 2 secondary stress(次重音) _stress_to_no_stress_ = { 'AA0': 'AA', 'AA1': 'AA', @@ -124,7 +138,12 @@ class ARPABET(Phonetics): 'UW2': 'UW' } + def __repr__(self): + fmt = "ARPABETWithoutStress(phonemes: {}, punctuations: {})" + return fmt.format(len(phonemes), punctuations) + def __init__(self): + # https://github.com/Kyubyong/g2p/blob/master/g2p_en/g2p.py self.backend = G2p() self.vocab = Vocab(self.phonemes + self.punctuations) @@ -139,6 +158,7 @@ class ARPABET(Phonetics): Returns: List[str]: The list of pronunciation sequence. """ + # g2p and remove vowel stress phonemes = [ self._remove_vowels(item) for item in self.backend(sentence) ] @@ -158,6 +178,7 @@ class ARPABET(Phonetics): Returns: List[int]: The list of pronunciation id sequence. """ + # phonemes to ids ids = [self.vocab.lookup(item) for item in phonemes] return ids @@ -189,11 +210,16 @@ class ARPABET(Phonetics): def vocab_size(self): """ Vocab size. 
""" - # 47 = 39 phones + 4 punctuations + 4 special tokens + # 47 = 39 phones + 4 punctuations + 4 special tokens( ) return len(self.vocab) class ARPABETWithStress(Phonetics): + """ + A phonology for English that uses ARPABET with stress as the phoneme vocabulary. + + 77 symbols = 69 phones + 4 punctuations + 4 special tokens + """ phonemes = [ 'AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0', 'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH', 'D', @@ -206,6 +232,10 @@ class ARPABETWithStress(Phonetics): punctuations = [',', '.', '?', '!'] symbols = phonemes + punctuations + def __repr__(self): + fmt = "ARPABETWithStress(phonemes: {}, punctuations: {})" + return fmt.format(len(phonemes), punctuations) + def __init__(self): self.backend = G2p() self.vocab = Vocab(self.phonemes + self.punctuations) diff --git a/paddlespeech/t2s/frontend/polyphonic.yaml b/paddlespeech/t2s/frontend/polyphonic.yaml index 6885035e..50659afb 100644 --- a/paddlespeech/t2s/frontend/polyphonic.yaml +++ b/paddlespeech/t2s/frontend/polyphonic.yaml @@ -47,4 +47,5 @@ polyphonic: 恶行: ['e4','xing2'] 唉: ['ai4'] 扎实: ['zha1','shi2'] - 干将: ['gan4','jiang4'] \ No newline at end of file + 干将: ['gan4','jiang4'] + 陈威行: ['chen2', 'wei1', 'hang2'] \ No newline at end of file diff --git a/paddlespeech/t2s/ssml/__init__.py b/paddlespeech/t2s/frontend/ssml/__init__.py similarity index 100% rename from paddlespeech/t2s/ssml/__init__.py rename to paddlespeech/t2s/frontend/ssml/__init__.py diff --git a/paddlespeech/t2s/ssml/xml_processor.py b/paddlespeech/t2s/frontend/ssml/xml_processor.py similarity index 99% rename from paddlespeech/t2s/ssml/xml_processor.py rename to paddlespeech/t2s/frontend/ssml/xml_processor.py index 892ca371..3e713d5d 100644 --- a/paddlespeech/t2s/ssml/xml_processor.py +++ b/paddlespeech/t2s/frontend/ssml/xml_processor.py @@ -90,13 +90,14 @@ class MixTextProcessor(): dom = DomXml(in_xml) tags = dom.get_text_and_sayas_tags() ctlist.extend(tags) - + 
ctlist.append(after_xml) return ctlist else: ctlist.append(mixstr) return ctlist + class DomXml(): def __init__(self, xmlstr): self.tdom = parseString(xmlstr) #Document diff --git a/paddlespeech/t2s/frontend/tone_sandhi.py b/paddlespeech/t2s/frontend/tone_sandhi.py index 42f7b8b2..5902540c 100644 --- a/paddlespeech/t2s/frontend/tone_sandhi.py +++ b/paddlespeech/t2s/frontend/tone_sandhi.py @@ -20,6 +20,9 @@ from pypinyin import Style class ToneSandhi(): + def __repr__(self): + return "MandarinToneSandhi" + def __init__(self): self.must_neural_tone_words = { '麻烦', '麻利', '鸳鸯', '高粱', '骨头', '骆驼', '马虎', '首饰', '馒头', '馄饨', '风筝', @@ -69,6 +72,19 @@ class ToneSandhi(): } self.punc = ":,;。?!“”‘’':,;.?!" + def _split_word(self, word: str) -> List[str]: + word_list = jieba.cut_for_search(word) + word_list = sorted(word_list, key=lambda i: len(i), reverse=False) + first_subword = word_list[0] + first_begin_idx = word.find(first_subword) + if first_begin_idx == 0: + second_subword = word[len(first_subword):] + new_word_list = [first_subword, second_subword] + else: + second_subword = word[:-len(first_subword)] + new_word_list = [second_subword, first_subword] + return new_word_list + # the meaning of jieba pos tag: https://blog.csdn.net/weixin_44174352/article/details/113731041 # e.g. 
# word: "家里" @@ -154,18 +170,8 @@ class ToneSandhi(): finals[i] = finals[i][:-1] + "4" return finals - def _split_word(self, word: str) -> List[str]: - word_list = jieba.cut_for_search(word) - word_list = sorted(word_list, key=lambda i: len(i), reverse=False) - first_subword = word_list[0] - first_begin_idx = word.find(first_subword) - if first_begin_idx == 0: - second_subword = word[len(first_subword):] - new_word_list = [first_subword, second_subword] - else: - second_subword = word[:-len(first_subword)] - new_word_list = [second_subword, first_subword] - return new_word_list + def _all_tone_three(self, finals: List[str]) -> bool: + return all(x[-1] == "3" for x in finals) def _three_sandhi(self, word: str, finals: List[str]) -> List[str]: @@ -207,9 +213,6 @@ class ToneSandhi(): return finals - def _all_tone_three(self, finals: List[str]) -> bool: - return all(x[-1] == "3" for x in finals) - # merge "不" and the word behind it # if don't merge, "不" sometimes appears alone according to jieba, which may occur sandhi error def _merge_bu(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: @@ -336,6 +339,9 @@ class ToneSandhi(): def pre_merge_for_modify( self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: + """ + seg: [(word, pos), ...] 
+ """ seg = self._merge_bu(seg) seg = self._merge_yi(seg) seg = self._merge_reduplication(seg) @@ -346,7 +352,11 @@ class ToneSandhi(): def modified_tone(self, word: str, pos: str, finals: List[str]) -> List[str]: - + """ + word: 分词 + pos: 词性 + finals: 带调韵母, [final1, ..., finaln] + """ finals = self._bu_sandhi(word, finals) finals = self._yi_sandhi(word, finals) finals = self._neural_sandhi(word, pos, finals) diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py index 35b97a93..498a09fa 100644 --- a/paddlespeech/t2s/frontend/zh_frontend.py +++ b/paddlespeech/t2s/frontend/zh_frontend.py @@ -31,9 +31,9 @@ from pypinyin_dict.phrase_pinyin_data import large_pinyin from paddlespeech.t2s.frontend.g2pw import G2PWOnnxConverter from paddlespeech.t2s.frontend.generate_lexicon import generate_lexicon from paddlespeech.t2s.frontend.rhy_prediction.rhy_predictor import RhyPredictor +from paddlespeech.t2s.frontend.ssml.xml_processor import MixTextProcessor from paddlespeech.t2s.frontend.tone_sandhi import ToneSandhi from paddlespeech.t2s.frontend.zh_normalization.text_normlization import TextNormalizer -from paddlespeech.t2s.ssml.xml_processor import MixTextProcessor INITIALS = [ 'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'zh', 'ch', 'sh', @@ -49,13 +49,18 @@ def intersperse(lst, item): def insert_after_character(lst, item): + """ + insert `item` after finals. + """ result = [item] + for phone in lst: result.append(phone) if phone not in INITIALS: # finals has tones # assert phone[-1] in "12345" result.append(item) + return result @@ -85,9 +90,7 @@ class Frontend(): phone_vocab_path=None, tone_vocab_path=None, use_rhy=False): - self.mix_ssml_processor = MixTextProcessor() - self.tone_modifier = ToneSandhi() - self.text_normalizer = TextNormalizer() + self.punc = ":,;。?!“”‘’':,;.?!" 
self.rhy_phns = ['sp1', 'sp2', 'sp3', 'sp4'] self.phrases_dict = { @@ -108,28 +111,7 @@ class Frontend(): '嘞': [['lei5']], '掺和': [['chan1'], ['huo5']] } - self.use_rhy = use_rhy - if use_rhy: - self.rhy_predictor = RhyPredictor() - print("Rhythm predictor loaded.") - # g2p_model can be pypinyin and g2pM and g2pW - self.g2p_model = g2p_model - if self.g2p_model == "g2pM": - self.g2pM_model = G2pM() - self.pinyin2phone = generate_lexicon( - with_tone=True, with_erhua=False) - elif self.g2p_model == "g2pW": - # use pypinyin as backup for non polyphonic characters in g2pW - self._init_pypinyin() - self.corrector = Polyphonic() - self.g2pM_model = G2pM() - self.g2pW_model = G2PWOnnxConverter( - style='pinyin', enable_non_tradional_chinese=True) - self.pinyin2phone = generate_lexicon( - with_tone=True, with_erhua=False) - else: - self._init_pypinyin() self.must_erhua = { "小院儿", "胡同儿", "范儿", "老汉儿", "撒欢儿", "寻老礼儿", "妥妥儿", "媳妇儿" } @@ -154,13 +136,51 @@ class Frontend(): for tone, id in tone_id: self.vocab_tones[tone] = int(id) + # SSML + self.mix_ssml_processor = MixTextProcessor() + # tone sandhi + self.tone_modifier = ToneSandhi() + # TN + self.text_normalizer = TextNormalizer() + + # prosody + self.use_rhy = use_rhy + if use_rhy: + self.rhy_predictor = RhyPredictor() + print("Rhythm predictor loaded.") + + # g2p + assert g2p_model in ('pypinyin', 'g2pM', 'g2pW') + self.g2p_model = g2p_model + if self.g2p_model == "g2pM": + self.g2pM_model = G2pM() + self.pinyin2phone = generate_lexicon( + with_tone=True, with_erhua=False) + elif self.g2p_model == "g2pW": + # use pypinyin as backup for non polyphonic characters in g2pW + self._init_pypinyin() + self.corrector = Polyphonic() + self.g2pM_model = G2pM() + self.g2pW_model = G2PWOnnxConverter( + style='pinyin', enable_non_tradional_chinese=True) + self.pinyin2phone = generate_lexicon( + with_tone=True, with_erhua=False) + else: + self._init_pypinyin() + def _init_pypinyin(self): + """ + Load pypinyin G2P module. 
+ """ large_pinyin.load() load_phrases_dict(self.phrases_dict) # 调整字的拼音顺序 load_single_dict({ord(u'地'): u'de,di4'}) def _get_initials_finals(self, word: str) -> List[List[str]]: + """ + Get word initial and final by pypinyin or g2pM + """ initials = [] finals = [] if self.g2p_model == "pypinyin": @@ -171,11 +191,14 @@ class Frontend(): for c, v in zip(orig_initials, orig_finals): if re.match(r'i\d', v): if c in ['z', 'c', 's']: + # zi, ci, si v = re.sub('i', 'ii', v) elif c in ['zh', 'ch', 'sh', 'r']: + # zhi, chi, shi v = re.sub('i', 'iii', v) initials.append(c) finals.append(v) + elif self.g2p_model == "g2pM": pinyins = self.g2pM_model(word, tone=True, char_split=False) for pinyin in pinyins: @@ -192,58 +215,123 @@ class Frontend(): # If it's not pinyin (possibly punctuation) or no conversion is required initials.append(pinyin) finals.append(pinyin) + return initials, finals + def _merge_erhua(self, + initials: List[str], + finals: List[str], + word: str, + pos: str) -> List[List[str]]: + """ + Do erhua. + """ + # fix er1 + for i, phn in enumerate(finals): + if i == len(finals) - 1 and word[i] == "儿" and phn == 'er1': + finals[i] = 'er2' + + # 发音 + if word not in self.must_erhua and (word in self.not_erhua or + pos in {"a", "j", "nr"}): + return initials, finals + + # "……" 等情况直接返回 + if len(finals) != len(word): + return initials, finals + + assert len(finals) == len(word) + + # 不发音 + new_initials = [] + new_finals = [] + for i, phn in enumerate(finals): + if i == len(finals) - 1 and word[i] == "儿" and phn in { + "er2", "er5" + } and word[-2:] not in self.not_erhua and new_finals: + new_finals[-1] = new_finals[-1][:-1] + "r" + new_finals[-1][-1] + else: + new_initials.append(initials[i]) + new_finals.append(phn) + + return new_initials, new_finals + # if merge_sentences, merge all sentences into one phone sequence def _g2p(self, sentences: List[str], merge_sentences: bool=True, with_erhua: bool=True) -> List[List[str]]: + """ + Return: list of list phonemes. 
+ [['w', 'o3', 'm', 'en2', 'sp'], ...] + """ segments = sentences phones_list = [] + + # split by punctuation for seg in segments: if self.use_rhy: seg = self.rhy_predictor._clean_text(seg) - phones = [] - # Replace all English words in the sentence + + # remove all English words in the sentence seg = re.sub('[a-zA-Z]+', '', seg) + + # add prosody mark if self.use_rhy: seg = self.rhy_predictor.get_prediction(seg) + + # [(word, pos), ...] seg_cut = psg.lcut(seg) - initials = [] - finals = [] + # fix wordseg bad case for sandhi seg_cut = self.tone_modifier.pre_merge_for_modify(seg_cut) + # 为了多音词获得更好的效果,这里采用整句预测 + phones = [] + initials = [] + finals = [] if self.g2p_model == "g2pW": try: + # undo prosody if self.use_rhy: seg = self.rhy_predictor._clean_text(seg) + + # g2p pinyins = self.g2pW_model(seg)[0] except Exception: - # g2pW采用模型采用繁体输入,如果有cover不了的简体词,采用g2pM预测 + # g2pW 模型采用繁体输入,如果有cover不了的简体词,采用g2pM预测 print("[%s] not in g2pW dict,use g2pM" % seg) pinyins = self.g2pM_model(seg, tone=True, char_split=False) + + # do prosody if self.use_rhy: rhy_text = self.rhy_predictor.get_prediction(seg) final_py = self.rhy_predictor.pinyin_align(pinyins, rhy_text) pinyins = final_py + pre_word_length = 0 for word, pos in seg_cut: sub_initials = [] sub_finals = [] now_word_length = pre_word_length + len(word) + + # skip english word if pos == 'eng': pre_word_length = now_word_length continue + word_pinyins = pinyins[pre_word_length:now_word_length] - # 矫正发音 + + # 多音字消歧 word_pinyins = self.corrector.correct_pronunciation( word, word_pinyins) + for pinyin, char in zip(word_pinyins, word): if pinyin is None: pinyin = char + pinyin = pinyin.replace("u:", "v") + if pinyin in self.pinyin2phone: initial_final_list = self.pinyin2phone[ pinyin].split(" ") @@ -257,28 +345,41 @@ class Frontend(): # If it's not pinyin (possibly punctuation) or no conversion is required sub_initials.append(pinyin) sub_finals.append(pinyin) + pre_word_length = now_word_length + # tone sandhi sub_finals = 
self.tone_modifier.modified_tone(word, pos, sub_finals) + # er hua if with_erhua: sub_initials, sub_finals = self._merge_erhua( sub_initials, sub_finals, word, pos) + initials.append(sub_initials) finals.append(sub_finals) # assert len(sub_initials) == len(sub_finals) == len(word) else: + # pypinyin, g2pM for word, pos in seg_cut: if pos == 'eng': + # skip english word continue + + # g2p sub_initials, sub_finals = self._get_initials_finals(word) + # tone sandhi sub_finals = self.tone_modifier.modified_tone(word, pos, sub_finals) + # er hua if with_erhua: sub_initials, sub_finals = self._merge_erhua( sub_initials, sub_finals, word, pos) + initials.append(sub_initials) finals.append(sub_finals) # assert len(sub_initials) == len(sub_finals) == len(word) + + # sum(iterable[, start]) initials = sum(initials, []) finals = sum(finals, []) @@ -287,111 +388,34 @@ class Frontend(): # we discriminate i, ii and iii if c and c not in self.punc: phones.append(c) + # replace punctuation by `sp` if c and c in self.punc: phones.append('sp') + if v and v not in self.punc and v not in self.rhy_phns: phones.append(v) - phones_list.append(phones) - if merge_sentences: - merge_list = sum(phones_list, []) - # rm the last 'sp' to avoid the noise at the end - # cause in the training data, no 'sp' in the end - if merge_list[-1] == 'sp': - merge_list = merge_list[:-1] - phones_list = [] - phones_list.append(merge_list) - return phones_list - - def _split_word_to_char(self, words): - res = [] - for x in words: - res.append(x) - return res - - # if using ssml, have pingyin specified, assign pinyin to words - def _g2p_assign(self, - words: List[str], - pinyin_spec: List[str], - merge_sentences: bool=True) -> List[List[str]]: - phones_list = [] - initials = [] - finals = [] - words = self._split_word_to_char(words[0]) - for pinyin, char in zip(pinyin_spec, words): - sub_initials = [] - sub_finals = [] - pinyin = pinyin.replace("u:", "v") - #self.pinyin2phone: is a dict with all pinyin mapped 
with sheng_mu yun_mu - if pinyin in self.pinyin2phone: - initial_final_list = self.pinyin2phone[pinyin].split(" ") - if len(initial_final_list) == 2: - sub_initials.append(initial_final_list[0]) - sub_finals.append(initial_final_list[1]) - elif len(initial_final_list) == 1: - sub_initials.append('') - sub_finals.append(initial_final_list[1]) - else: - # If it's not pinyin (possibly punctuation) or no conversion is required - sub_initials.append(pinyin) - sub_finals.append(pinyin) - initials.append(sub_initials) - finals.append(sub_finals) + phones_list.append(phones) - initials = sum(initials, []) - finals = sum(finals, []) - phones = [] - for c, v in zip(initials, finals): - # NOTE: post process for pypinyin outputs - # we discriminate i, ii and iii - if c and c not in self.punc: - phones.append(c) - if c and c in self.punc: - phones.append('sp') - if v and v not in self.punc and v not in self.rhy_phns: - phones.append(v) - phones_list.append(phones) + # merge split sub sentence into one sentence. 
if merge_sentences: + # sub sentence phonemes merge_list = sum(phones_list, []) # rm the last 'sp' to avoid the noise at the end # cause in the training data, no 'sp' in the end if merge_list[-1] == 'sp': merge_list = merge_list[:-1] + + # sentence phonemes phones_list = [] phones_list.append(merge_list) - return phones_list - - def _merge_erhua(self, - initials: List[str], - finals: List[str], - word: str, - pos: str) -> List[List[str]]: - # fix er1 - for i, phn in enumerate(finals): - if i == len(finals) - 1 and word[i] == "儿" and phn == 'er1': - finals[i] = 'er2' - if word not in self.must_erhua and (word in self.not_erhua or - pos in {"a", "j", "nr"}): - return initials, finals - # "……" 等情况直接返回 - if len(finals) != len(word): - return initials, finals - assert len(finals) == len(word) - - new_initials = [] - new_finals = [] - for i, phn in enumerate(finals): - if i == len(finals) - 1 and word[i] == "儿" and phn in { - "er2", "er5" - } and word[-2:] not in self.not_erhua and new_finals: - new_finals[-1] = new_finals[-1][:-1] + "r" + new_finals[-1][-1] - else: - new_finals.append(phn) - new_initials.append(initials[i]) - return new_initials, new_finals + return phones_list def _p2id(self, phonemes: List[str]) -> np.ndarray: + """ + Phoneme to Index + """ # replace unk phone with sp phonemes = [ phn if phn in self.vocab_phones else "sp" for phn in phonemes @@ -400,6 +424,9 @@ class Frontend(): return np.array(phone_ids, np.int64) def _t2id(self, tones: List[str]) -> np.ndarray: + """ + Tone to Index. + """ # replace unk phone with sp tones = [tone if tone in self.vocab_tones else "0" for tone in tones] tone_ids = [self.vocab_tones[item] for item in tones] @@ -407,6 +434,9 @@ class Frontend(): def _get_phone_tone(self, phonemes: List[str], get_tone_ids: bool=False) -> List[List[str]]: + """ + Get tone from phonemes. 
+ """ phones = [] tones = [] if get_tone_ids and self.vocab_tones: @@ -423,13 +453,14 @@ class Frontend(): -1] == 'r' and phone not in self.vocab_phones and phone[: -1] in self.vocab_phones: phones.append(phone[:-1]) - phones.append("er") tones.append(tone) + phones.append("er") tones.append("2") else: phones.append(phone) tones.append(tone) else: + # initals with 0 tone. phones.append(full_phone) tones.append('0') else: @@ -443,6 +474,7 @@ class Frontend(): phones.append("er2") else: phones.append(phone) + return phones, tones def get_phonemes(self, @@ -451,10 +483,16 @@ class Frontend(): with_erhua: bool=True, robot: bool=False, print_info: bool=False) -> List[List[str]]: + """ + Main function to do G2P + """ + # TN & Text Segmentation sentences = self.text_normalizer.normalize(sentence) + # Prosody & WS & g2p & tone sandhi phonemes = self._g2p( sentences, merge_sentences=merge_sentences, with_erhua=with_erhua) - # change all tones to `1` + + # simulate robot pronunciation, change all tones to `1` if robot: new_phonemes = [] for sentence in phonemes: @@ -466,6 +504,7 @@ class Frontend(): new_sentence.append(item) new_phonemes.append(new_sentence) phonemes = new_phonemes + if print_info: print("----------------------------") print("text norm results:") @@ -476,25 +515,101 @@ class Frontend(): print("----------------------------") return phonemes - #@an added for ssml pinyin + def _split_word_to_char(self, words): + res = [] + for x in words: + res.append(x) + return res + + # if using ssml, have pingyin specified, assign pinyin to words + def _g2p_assign(self, + words: List[str], + pinyin_spec: List[str], + merge_sentences: bool=True) -> List[List[str]]: + """ + Replace phoneme by SSML + """ + phones_list = [] + initials = [] + finals = [] + + # to charactor list + words = self._split_word_to_char(words[0]) + + for pinyin, char in zip(pinyin_spec, words): + sub_initials = [] + sub_finals = [] + pinyin = pinyin.replace("u:", "v") + + #self.pinyin2phone: is a dict 
with all pinyin mapped with sheng_mu yun_mu + if pinyin in self.pinyin2phone: + initial_final_list = self.pinyin2phone[pinyin].split(" ") + if len(initial_final_list) == 2: + sub_initials.append(initial_final_list[0]) + sub_finals.append(initial_final_list[1]) + elif len(initial_final_list) == 1: + sub_initials.append('') + sub_finals.append(initial_final_list[1]) + else: + # If it's not pinyin (possibly punctuation) or no conversion is required + sub_initials.append(pinyin) + sub_finals.append(pinyin) + + initials.append(sub_initials) + finals.append(sub_finals) + + initials = sum(initials, []) + finals = sum(finals, []) + + phones = [] + for c, v in zip(initials, finals): + # NOTE: post process for pypinyin outputs + # we discriminate i, ii and iii + if c and c not in self.punc: + phones.append(c) + # replace punc to `sp` + if c and c in self.punc: + phones.append('sp') + if v and v not in self.punc and v not in self.rhy_phns: + phones.append(v) + phones_list.append(phones) + + if merge_sentences: + merge_list = sum(phones_list, []) + # rm the last 'sp' to avoid the noise at the end + # cause in the training data, no 'sp' in the end + if merge_list[-1] == 'sp': + merge_list = merge_list[:-1] + phones_list = [] + phones_list.append(merge_list) + + return phones_list + def get_phonemes_ssml(self, ssml_inputs: list, merge_sentences: bool=True, with_erhua: bool=True, robot: bool=False, print_info: bool=False) -> List[List[str]]: + """ + Main function to do G2P with SSML support. 
+ """ all_phonemes = [] for word_pinyin_item in ssml_inputs: phonemes = [] + print("ssml inputs:", word_pinyin_item) sentence, pinyin_spec = itemgetter(0, 1)(word_pinyin_item) + print('ssml g2p:', sentence, pinyin_spec) + # TN & Text Segmentation sentences = self.text_normalizer.normalize(sentence) if len(pinyin_spec) == 0: + # g2p word w/o specified phonemes = self._g2p( sentences, merge_sentences=merge_sentences, with_erhua=with_erhua) else: - # phonemes should be pinyin_spec + # word phonemes specified by phonemes = self._g2p_assign( sentences, pinyin_spec, merge_sentences=merge_sentences) @@ -523,6 +638,9 @@ class Frontend(): return [sum(all_phonemes, [])] def add_sp_if_no(self, phonemes): + """ + Prosody mark #4 added at sentence end. + """ if not phonemes[-1][-1].startswith('sp'): phonemes[-1].append('sp4') return phonemes @@ -542,8 +660,11 @@ class Frontend(): merge_sentences=merge_sentences, print_info=print_info, robot=robot) + + # add #4 for sentence end. if self.use_rhy: phonemes = self.add_sp_if_no(phonemes) + result = {} phones = [] tones = [] @@ -551,28 +672,33 @@ class Frontend(): temp_tone_ids = [] for part_phonemes in phonemes: + phones, tones = self._get_phone_tone( part_phonemes, get_tone_ids=get_tone_ids) + if add_blank: phones = insert_after_character(phones, blank_token) + if tones: tone_ids = self._t2id(tones) if to_tensor: tone_ids = paddle.to_tensor(tone_ids) temp_tone_ids.append(tone_ids) + if phones: phone_ids = self._p2id(phones) # if use paddle.to_tensor() in onnxruntime, the first time will be too low if to_tensor: phone_ids = paddle.to_tensor(phone_ids) temp_phone_ids.append(phone_ids) + if temp_tone_ids: result["tone_ids"] = temp_tone_ids if temp_phone_ids: result["phone_ids"] = temp_phone_ids + return result - # @an added for ssml def get_input_ids_ssml( self, sentence: str, @@ -584,12 +710,15 @@ class Frontend(): blank_token: str="", to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]: + # split sentence by SSML tag. 
l_inputs = MixTextProcessor.get_pinyin_split(sentence) + phonemes = self.get_phonemes_ssml( l_inputs, merge_sentences=merge_sentences, print_info=print_info, robot=robot) + result = {} phones = [] tones = [] @@ -599,21 +728,26 @@ class Frontend(): for part_phonemes in phonemes: phones, tones = self._get_phone_tone( part_phonemes, get_tone_ids=get_tone_ids) + if add_blank: phones = insert_after_character(phones, blank_token) + if tones: tone_ids = self._t2id(tones) if to_tensor: tone_ids = paddle.to_tensor(tone_ids) temp_tone_ids.append(tone_ids) + if phones: phone_ids = self._p2id(phones) # if use paddle.to_tensor() in onnxruntime, the first time will be too low if to_tensor: phone_ids = paddle.to_tensor(phone_ids) temp_phone_ids.append(phone_ids) + if temp_tone_ids: result["tone_ids"] = temp_tone_ids if temp_phone_ids: result["phone_ids"] = temp_phone_ids + return result -- GitLab