From fbe3c05137feccf27a07fdb22d59bdd0318ca521 Mon Sep 17 00:00:00 2001
From: TianYuan
Date: Thu, 30 Dec 2021 10:29:12 +0800
Subject: [PATCH] add style_melgan and hifigan in tts cli, test=tts (#1241)

---
 paddlespeech/cli/tts/infer.py              | 82 ++++++++++++++-----
 paddlespeech/t2s/exps/synthesize_e2e.py    | 42 ++++++----
 paddlespeech/t2s/frontend/phonectic.py     | 49 ++++++++---
 .../zh_normalization/text_normlization.py  |  5 +-
 4 files changed, 128 insertions(+), 50 deletions(-)

diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py
index f60f4224..c934d595 100644
--- a/paddlespeech/cli/tts/infer.py
+++ b/paddlespeech/cli/tts/infer.py
@@ -178,6 +178,32 @@ pretrained_models = {
         'speech_stats':
         'feats_stats.npy',
     },
+    # style_melgan
+    "style_melgan_csmsc-zh": {
+        'url':
+        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/style_melgan/style_melgan_csmsc_ckpt_0.1.1.zip',
+        'md5':
+        '5de2d5348f396de0c966926b8c462755',
+        'config':
+        'default.yaml',
+        'ckpt':
+        'snapshot_iter_1500000.pdz',
+        'speech_stats':
+        'feats_stats.npy',
+    },
+    # hifigan
+    "hifigan_csmsc-zh": {
+        'url':
+        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip',
+        'md5':
+        'dd40a3d88dfcf64513fba2f0f961ada6',
+        'config':
+        'default.yaml',
+        'ckpt':
+        'snapshot_iter_2500000.pdz',
+        'speech_stats':
+        'feats_stats.npy',
+    },
 }
 
 model_alias = {
@@ -199,6 +225,14 @@ model_alias = {
     "paddlespeech.t2s.models.melgan:MelGANGenerator",
     "mb_melgan_inference":
     "paddlespeech.t2s.models.melgan:MelGANInference",
+    "style_melgan":
+    "paddlespeech.t2s.models.melgan:StyleMelGANGenerator",
+    "style_melgan_inference":
+    "paddlespeech.t2s.models.melgan:StyleMelGANInference",
+    "hifigan":
+    "paddlespeech.t2s.models.hifigan:HiFiGANGenerator",
+    "hifigan_inference":
+    "paddlespeech.t2s.models.hifigan:HiFiGANInference",
 }
 
 
@@ -266,7 +300,7 @@ class TTSExecutor(BaseExecutor):
             default='pwgan_csmsc',
             choices=[
                 'pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3', 'pwgan_vctk',
-                'mb_melgan_csmsc'
+                'mb_melgan_csmsc', 'style_melgan_csmsc', 'hifigan_csmsc'
             ],
             help='Choose vocoder type of tts task.')
 
@@ -504,37 +538,47 @@ class TTSExecutor(BaseExecutor):
         am_name = am[:am.rindex('_')]
         am_dataset = am[am.rindex('_') + 1:]
         get_tone_ids = False
+        merge_sentences = False
         if am_name == 'speedyspeech':
             get_tone_ids = True
         if lang == 'zh':
             input_ids = self.frontend.get_input_ids(
-                text, merge_sentences=True, get_tone_ids=get_tone_ids)
+                text,
+                merge_sentences=merge_sentences,
+                get_tone_ids=get_tone_ids)
             phone_ids = input_ids["phone_ids"]
-            phone_ids = phone_ids[0]
             if get_tone_ids:
                 tone_ids = input_ids["tone_ids"]
-                tone_ids = tone_ids[0]
         elif lang == 'en':
-            input_ids = self.frontend.get_input_ids(text)
+            input_ids = self.frontend.get_input_ids(
+                text, merge_sentences=merge_sentences)
             phone_ids = input_ids["phone_ids"]
         else:
             print("lang should in {'zh', 'en'}!")
 
-        # am
-        if am_name == 'speedyspeech':
-            mel = self.am_inference(phone_ids, tone_ids)
-        # fastspeech2
-        else:
-            # multi speaker
-            if am_dataset in {"aishell3", "vctk"}:
-                mel = self.am_inference(
-                    phone_ids, spk_id=paddle.to_tensor(spk_id))
+        flags = 0
+        for i in range(len(phone_ids)):
+            part_phone_ids = phone_ids[i]
+            # am
+            if am_name == 'speedyspeech':
+                part_tone_ids = tone_ids[i]
+                mel = self.am_inference(part_phone_ids, part_tone_ids)
+            # fastspeech2
            else:
-                mel = self.am_inference(phone_ids)
-
-        # voc
-        wav = self.voc_inference(mel)
-        self._outputs['wav'] = wav
+                # multi speaker
+                if am_dataset in {"aishell3", "vctk"}:
+                    mel = self.am_inference(
+                        part_phone_ids, spk_id=paddle.to_tensor(spk_id))
+                else:
+                    mel = self.am_inference(part_phone_ids)
+            # voc
+            wav = self.voc_inference(mel)
+            if flags == 0:
+                wav_all = wav
+                flags = 1
+            else:
+                wav_all = paddle.concat([wav_all, wav])
+        self._outputs['wav'] = wav_all
 
     def postprocess(self, output: str='output.wav') -> Union[str, os.PathLike]:
         """
diff --git a/paddlespeech/t2s/exps/synthesize_e2e.py b/paddlespeech/t2s/exps/synthesize_e2e.py
index 9a83ec1b..fc822b21 100644
--- a/paddlespeech/t2s/exps/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/synthesize_e2e.py
@@ -196,41 +196,47 @@ def evaluate(args):
 
     output_dir = Path(args.output_dir)
     output_dir.mkdir(parents=True, exist_ok=True)
-
+    merge_sentences = False
     for utt_id, sentence in sentences:
         get_tone_ids = False
         if am_name == 'speedyspeech':
             get_tone_ids = True
         if args.lang == 'zh':
             input_ids = frontend.get_input_ids(
-                sentence, merge_sentences=True, get_tone_ids=get_tone_ids)
+                sentence, merge_sentences=merge_sentences, get_tone_ids=get_tone_ids)
             phone_ids = input_ids["phone_ids"]
-            phone_ids = phone_ids[0]
             if get_tone_ids:
                 tone_ids = input_ids["tone_ids"]
-                tone_ids = tone_ids[0]
         elif args.lang == 'en':
-            input_ids = frontend.get_input_ids(sentence)
+            input_ids = frontend.get_input_ids(sentence, merge_sentences=merge_sentences)
             phone_ids = input_ids["phone_ids"]
         else:
             print("lang should in {'zh', 'en'}!")
-
         with paddle.no_grad():
-            # acoustic model
-            if am_name == 'fastspeech2':
-                # multi speaker
-                if am_dataset in {"aishell3", "vctk"}:
-                    spk_id = paddle.to_tensor(args.spk_id)
-                    mel = am_inference(phone_ids, spk_id)
+            flags = 0
+            for i in range(len(phone_ids)):
+                part_phone_ids = phone_ids[i]
+                # acoustic model
+                if am_name == 'fastspeech2':
+                    # multi speaker
+                    if am_dataset in {"aishell3", "vctk"}:
+                        spk_id = paddle.to_tensor(args.spk_id)
+                        mel = am_inference(part_phone_ids, spk_id)
+                    else:
+                        mel = am_inference(part_phone_ids)
+                elif am_name == 'speedyspeech':
+                    part_tone_ids = tone_ids[i]
+                    mel = am_inference(part_phone_ids, part_tone_ids)
+                # vocoder
+                wav = voc_inference(mel)
+                if flags == 0:
+                    wav_all = wav
+                    flags = 1
                 else:
-                    mel = am_inference(phone_ids)
-            elif am_name == 'speedyspeech':
-                mel = am_inference(phone_ids, tone_ids)
-            # vocoder
-            wav = voc_inference(mel)
+                    wav_all = paddle.concat([wav_all, wav])
         sf.write(
             str(output_dir / (utt_id + ".wav")),
-            wav.numpy(),
+            wav_all.numpy(),
             samplerate=am_config.fs)
         print(f"{utt_id} done!")
 
diff --git a/paddlespeech/t2s/frontend/phonectic.py b/paddlespeech/t2s/frontend/phonectic.py
index fbc8fd38..25413871 100644
--- a/paddlespeech/t2s/frontend/phonectic.py
+++ b/paddlespeech/t2s/frontend/phonectic.py
@@ -13,7 +13,9 @@
 # limitations under the License.
 from abc import ABC
 from abc import abstractmethod
+from typing import List
 
+import numpy as np
 import paddle
 from g2p_en import G2p
 from g2pM import G2pM
@@ -21,6 +23,7 @@ from g2pM import G2pM
 from paddlespeech.t2s.frontend.normalizer.normalizer import normalize
 from paddlespeech.t2s.frontend.punctuation import get_punctuations
 from paddlespeech.t2s.frontend.vocab import Vocab
+from paddlespeech.t2s.frontend.zh_normalization.text_normlization import TextNormalizer
 
 # discard opencc untill we find an easy solution to install it on windows
 # from opencc import OpenCC
@@ -53,6 +56,7 @@ class English(Phonetics):
         self.vocab = Vocab(self.phonemes + self.punctuations)
         self.vocab_phones = {}
         self.punc = ":,;。?!“”‘’':,;.?!"
+        self.text_normalizer = TextNormalizer()
         if phone_vocab_path:
             with open(phone_vocab_path, 'rt') as f:
                 phn_id = [line.strip().split() for line in f.readlines()]
@@ -78,19 +82,42 @@ class English(Phonetics):
             phonemes = [item for item in phonemes if item in self.vocab.stoi]
         return phonemes
 
-    def get_input_ids(self, sentence: str) -> paddle.Tensor:
-        result = {}
-        phones = self.phoneticize(sentence)
-        # remove start_symbol and end_symbol
-        phones = phones[1:-1]
-        phones = [phn for phn in phones if not phn.isspace()]
-        phones = [
+    def _p2id(self, phonemes: List[str]) -> np.array:
+        # replace unk phone with sp
+        phonemes = [
             phn if (phn in self.vocab_phones and phn not in self.punc) else "sp"
-            for phn in phones
+            for phn in phonemes
         ]
-        phone_ids = [self.vocab_phones[phn] for phn in phones]
-        phone_ids = paddle.to_tensor(phone_ids)
-        result["phone_ids"] = phone_ids
+        phone_ids = [self.vocab_phones[item] for item in phonemes]
+        return np.array(phone_ids, np.int64)
+
+    def get_input_ids(self, sentence: str,
+                      merge_sentences: bool=False) -> paddle.Tensor:
+        result = {}
+        sentences = self.text_normalizer._split(sentence, lang="en")
+        phones_list = []
+        temp_phone_ids = []
+        for sentence in sentences:
+            phones = self.phoneticize(sentence)
+            # remove start_symbol and end_symbol
+            phones = phones[1:-1]
+            phones = [phn for phn in phones if not phn.isspace()]
+            phones_list.append(phones)
+
+        if merge_sentences:
+            merge_list = sum(phones_list, [])
+            # rm the last 'sp' to avoid the noise at the end
+            # cause in the training data, no 'sp' in the end
+            if merge_list[-1] == 'sp':
+                merge_list = merge_list[:-1]
+            phones_list = []
+            phones_list.append(merge_list)
+
+        for part_phones_list in phones_list:
+            phone_ids = self._p2id(part_phones_list)
+            phone_ids = paddle.to_tensor(phone_ids)
+            temp_phone_ids.append(phone_ids)
+        result["phone_ids"] = temp_phone_ids
         return result
 
     def numericalize(self, phonemes):
diff --git a/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py b/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py
index c68caeeb..c502d882 100644
--- a/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py
+++ b/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py
@@ -53,7 +53,7 @@ class TextNormalizer():
     def __init__(self):
         self.SENTENCE_SPLITOR = re.compile(r'([:,;。?!,;?!][”’]?)')
 
-    def _split(self, text: str) -> List[str]:
+    def _split(self, text: str, lang="zh") -> List[str]:
         """Split long text into sentences with sentence-splitting punctuations.
         Parameters
         ----------
         text : str
             The input text.
         Returns
         -------
         List[str]
             Sentences.
         """
         # Only for pure Chinese here
-        text = text.replace(" ", "")
+        if lang == "zh":
+            text = text.replace(" ", "")
         text = self.SENTENCE_SPLITOR.sub(r'\1\n', text)
         text = text.strip()
         sentences = [sentence.strip() for sentence in re.split(r'\n+', text)]
--
GitLab
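
Reviewer note: a minimal sketch of how the new vocoder choices can be exercised once this patch is applied. The keyword names mirror the argparse options touched above ('am', 'voc', 'lang') and the 'output' default visible in postprocess(), but the exact TTSExecutor.__call__ signature is an assumption here, not something this diff shows:

    # Sketch only: assumes TTSExecutor.__call__ accepts the same names as the
    # argparse options in paddlespeech/cli/tts/infer.py; verify against the
    # released paddlespeech CLI before relying on it.
    from paddlespeech.cli.tts.infer import TTSExecutor

    tts_executor = TTSExecutor()
    # 'style_melgan_csmsc' and 'hifigan_csmsc' are the vocoder choices added here.
    for voc in ['style_melgan_csmsc', 'hifigan_csmsc']:
        tts_executor(
            text='今天天气十分不错。',
            am='fastspeech2_csmsc',
            voc=voc,
            lang='zh',
            output='output_' + voc + '.wav')

Because merge_sentences now defaults to False, the executor splits the input into sentences, synthesizes each one, and concatenates the per-sentence waveforms with paddle.concat before writing the result.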