add mix tts cli, test=tts

00e9853f · 小湉湉 · 1f128a08 · 00e9853f · 00e9853f
隐藏空白更改
内联并排

Showing 2 changed files with 34 additions and 14 deletions

paddlespeech/cli/tts/infer.py paddlespeech/cli/tts/infer.py +16 -14

paddlespeech/resource/pretrained_models.py paddlespeech/resource/pretrained_models.py +18 -0

未找到文件。
--- a/paddlespeech/cli/tts/infer.py
+++ b/paddlespeech/cli/tts/infer.py
@@ -29,8 +29,7 @@ from yacs.config import CfgNode
 from ..executor import BaseExecutor
 from ..log import logger
 from ..utils import stats_wrapper
-from paddlespeech.t2s.frontend import English
+from paddlespeech.t2s.exps.syn_utils import get_frontend
-from paddlespeech.t2s.frontend.zh_frontend import Frontend
 from paddlespeech.t2s.modules.normalizer import ZScore
 __all__ = ['TTSExecutor']
@@ -54,6 +53,7 @@ class TTSExecutor(BaseExecutor):
                'fastspeech2_ljspeech',
                'fastspeech2_aishell3',
                'fastspeech2_vctk',
+                'fastspeech2_mix',
                'tacotron2_csmsc',
                'tacotron2_ljspeech',
            ],
@@ -135,7 +135,7 @@ class TTSExecutor(BaseExecutor):
            '--lang',
            type=str,
            default='zh',
-            help='Choose model language. zh or en')
+            help='Choose model language. zh or en or mix')
        self.parser.add_argument(
            '--device',
            type=str,
@@ -231,8 +231,11 @@ class TTSExecutor(BaseExecutor):
            use_pretrained_voc = True
        else:
            use_pretrained_voc = False
+        voc_lang = lang
-        voc_tag = voc + '-' + lang
+        # For now the mix acoustic model must use the ljspeech (en) vocoder.
+        if lang == 'mix':
+            voc_lang = 'en'
+        voc_tag = voc + '-' + voc_lang
        self.task_resource.set_task_model(
            model_tag=voc_tag,
            model_type=1,  # vocoder
@@ -281,13 +284,8 @@ class TTSExecutor(BaseExecutor):
            spk_num = len(spk_id)
        # frontend
-        if lang == 'zh':
+        self.frontend = get_frontend(
-            self.frontend = Frontend(
+            lang=lang, phones_dict=self.phones_dict, tones_dict=self.tones_dict)
-                phone_vocab_path=self.phones_dict,
-                tone_vocab_path=self.tones_dict)
-        elif lang == 'en':
-            self.frontend = English(phone_vocab_path=self.phones_dict)
        # acoustic model
        odim = self.am_config.n_mels
@@ -381,8 +379,12 @@ class TTSExecutor(BaseExecutor):
            input_ids = self.frontend.get_input_ids(
                text, merge_sentences=merge_sentences)
            phone_ids = input_ids["phone_ids"]
+        elif lang == 'mix':
+            input_ids = self.frontend.get_input_ids(
+                text, merge_sentences=merge_sentences)
+            phone_ids = input_ids["phone_ids"]
        else:
-            logger.error("lang should in {'zh', 'en'}!")
+            logger.error("lang should in {'zh', 'en', 'mix'}!")
        self.frontend_time = time.time() - frontend_st
        self.am_time = 0
@@ -398,7 +400,7 @@ class TTSExecutor(BaseExecutor):
            # fastspeech2
            else:
                # multi speaker
-                if am_dataset in {"aishell3", "vctk"}:
+                if am_dataset in {'aishell3', 'vctk', 'mix'}:
                    mel = self.am_inference(
                        part_phone_ids, spk_id=paddle.to_tensor(spk_id))
                else:

--- a/paddlespeech/resource/pretrained_models.py
+++ b/paddlespeech/resource/pretrained_models.py
@@ -655,6 +655,24 @@ tts_dynamic_pretrained_models = {
            'phone_id_map.txt',
        },
    },
+    "fastspeech2_mix-mix": {
+        '1.0': {
+            'url':
+            'https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_csmscljspeech_add-zhen.zip',
+            'md5':
+            '77d9d4b5a79ed6203339ead7ef6c74f9',
+            'config':
+            'default.yaml',
+            'ckpt':
+            'snapshot_iter_94000.pdz',
+            'speech_stats':
+            'speech_stats.npy',
+            'phones_dict':
+            'phone_id_map.txt',
+            'speaker_dict':
+            'speaker_id_map.txt',
+        },
+    },
    # tacotron2
    "tacotron2_csmsc-zh": {
        '1.0': {