diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py
index ade8cdd6dc5f4f255a582b40fe6a7aa336b04fa0..11a7aeea9625a0957c4ab3e8eb54a54f7680c1cf 100644
--- a/paddlespeech/cli/tts/infer.py
+++ b/paddlespeech/cli/tts/infer.py
@@ -29,8 +29,7 @@ from yacs.config import CfgNode
 from ..executor import BaseExecutor
 from ..log import logger
 from ..utils import stats_wrapper
-from paddlespeech.t2s.frontend import English
-from paddlespeech.t2s.frontend.zh_frontend import Frontend
+from paddlespeech.t2s.exps.syn_utils import get_frontend
 from paddlespeech.t2s.modules.normalizer import ZScore
 
 __all__ = ['TTSExecutor']
@@ -54,6 +53,7 @@ class TTSExecutor(BaseExecutor):
                 'fastspeech2_ljspeech',
                 'fastspeech2_aishell3',
                 'fastspeech2_vctk',
+                'fastspeech2_mix',
                 'tacotron2_csmsc',
                 'tacotron2_ljspeech',
             ],
@@ -135,7 +135,7 @@ class TTSExecutor(BaseExecutor):
             '--lang',
             type=str,
             default='zh',
-            help='Choose model language. zh or en')
+            help='Choose model language. zh or en or mix')
         self.parser.add_argument(
             '--device',
             type=str,
@@ -231,8 +231,11 @@ class TTSExecutor(BaseExecutor):
             use_pretrained_voc = True
         else:
             use_pretrained_voc = False
-
-        voc_tag = voc + '-' + lang
+        voc_lang = lang
+        # we must use ljspeech's voc for mix am now!
+        if lang == 'mix':
+            voc_lang = 'en'
+        voc_tag = voc + '-' + voc_lang
         self.task_resource.set_task_model(
             model_tag=voc_tag,
             model_type=1,  # vocoder
@@ -281,13 +284,8 @@ class TTSExecutor(BaseExecutor):
             spk_num = len(spk_id)
 
         # frontend
-        if lang == 'zh':
-            self.frontend = Frontend(
-                phone_vocab_path=self.phones_dict,
-                tone_vocab_path=self.tones_dict)
-
-        elif lang == 'en':
-            self.frontend = English(phone_vocab_path=self.phones_dict)
+        self.frontend = get_frontend(
+            lang=lang, phones_dict=self.phones_dict, tones_dict=self.tones_dict)
 
         # acoustic model
         odim = self.am_config.n_mels
@@ -381,8 +379,12 @@ class TTSExecutor(BaseExecutor):
                 input_ids = self.frontend.get_input_ids(
                     text, merge_sentences=merge_sentences)
                 phone_ids = input_ids["phone_ids"]
+            elif lang == 'mix':
+                input_ids = self.frontend.get_input_ids(
+                    text, merge_sentences=merge_sentences)
+                phone_ids = input_ids["phone_ids"]
             else:
-                logger.error("lang should in {'zh', 'en'}!")
+                logger.error("lang should in {'zh', 'en', 'mix'}!")
 
         self.frontend_time = time.time() - frontend_st
         self.am_time = 0
@@ -398,7 +400,7 @@ class TTSExecutor(BaseExecutor):
                 # fastspeech2
                 else:
                     # multi speaker
-                    if am_dataset in {"aishell3", "vctk"}:
+                    if am_dataset in {'aishell3', 'vctk', 'mix'}:
                         mel = self.am_inference(
                             part_phone_ids, spk_id=paddle.to_tensor(spk_id))
                     else:
diff --git a/paddlespeech/resource/pretrained_models.py b/paddlespeech/resource/pretrained_models.py
index 324bd3aea2ad83267d50b7647facfcc5bee27523..d7df0e48a38602bbb0e4ff2870980fb99ed7da99 100644
--- a/paddlespeech/resource/pretrained_models.py
+++ b/paddlespeech/resource/pretrained_models.py
@@ -655,6 +655,24 @@ tts_dynamic_pretrained_models = {
             'phone_id_map.txt',
         },
     },
+    "fastspeech2_mix-mix": {
+        '1.0': {
+            'url':
+            'https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_csmscljspeech_add-zhen.zip',
+            'md5':
+            '77d9d4b5a79ed6203339ead7ef6c74f9',
+            'config':
+            'default.yaml',
+            'ckpt':
+            'snapshot_iter_94000.pdz',
+            'speech_stats':
+            'speech_stats.npy',
+            'phones_dict':
+            'phone_id_map.txt',
+            'speaker_dict':
+            'speaker_id_map.txt',
+        },
+    },
     # tacotron2
     "tacotron2_csmsc-zh": {
         '1.0': {
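For reference, a minimal usage sketch of the mixed-language path this patch enables. The vocoder choice and `spk_id` value below are assumptions, not taken from the patch itself; per the inline comment, the `fastspeech2_mix` acoustic model currently has to be paired with an ljspeech (English) vocoder, and the model registers a `speaker_id_map.txt`, so any speaker id listed there should work.

```python
# Sketch only: exercises the new lang='mix' branch end to end.
from paddlespeech.cli.tts import TTSExecutor

tts = TTSExecutor()
tts(
    text='热烈欢迎, welcome to PaddleSpeech!',
    am='fastspeech2_mix',   # new mixed zh/en acoustic model added by this patch
    voc='pwgan_ljspeech',   # assumption: any *_ljspeech vocoder; mix is forced onto voc_lang='en'
    lang='mix',             # new value accepted by --lang
    spk_id=0,               # assumption: pick an id from the model's speaker_id_map.txt
    output='mix.wav')
```

The same path is reachable from the CLI via `paddlespeech tts --am fastspeech2_mix --lang mix ...`, since the executor and the argument parser changed above share the dispatch in `get_frontend`.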