diff --git a/paddlespeech/t2s/exps/synthesize_e2e.py b/paddlespeech/t2s/exps/synthesize_e2e.py
index 9f58579febf206e8ce3ecbeffb8d60ffa411c1bd..8ebfcfe7f40a999f44eabe8351bce9aacc86c3e4 100644
--- a/paddlespeech/t2s/exps/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/synthesize_e2e.py
@@ -129,7 +129,10 @@ def evaluate(args):
             idim=vocab_size, odim=odim, spk_num=spk_num, **am_config["model"])
     elif am_name == 'speedyspeech':
         am = am_class(
-            vocab_size=vocab_size, tone_size=tone_size, **am_config["model"])
+            vocab_size=vocab_size,
+            tone_size=tone_size,
+            spk_num=spk_num,
+            **am_config["model"])
     elif am_name == 'tacotron2':
         am = am_class(idim=vocab_size, odim=odim, **am_config["model"])
 
@@ -171,25 +174,31 @@ def evaluate(args):
                         InputSpec([-1], dtype=paddle.int64),
                         InputSpec([1], dtype=paddle.int64)
                     ])
-                paddle.jit.save(am_inference,
-                                os.path.join(args.inference_dir, args.am))
-                am_inference = paddle.jit.load(
-                    os.path.join(args.inference_dir, args.am))
             else:
                 am_inference = jit.to_static(
                     am_inference,
                     input_spec=[InputSpec([-1], dtype=paddle.int64)])
-                paddle.jit.save(am_inference,
-                                os.path.join(args.inference_dir, args.am))
-                am_inference = paddle.jit.load(
-                    os.path.join(args.inference_dir, args.am))
+            paddle.jit.save(am_inference,
+                            os.path.join(args.inference_dir, args.am))
+            am_inference = paddle.jit.load(
+                os.path.join(args.inference_dir, args.am))
         elif am_name == 'speedyspeech':
-            am_inference = jit.to_static(
-                am_inference,
-                input_spec=[
-                    InputSpec([-1], dtype=paddle.int64),
-                    InputSpec([-1], dtype=paddle.int64)
-                ])
+            if am_dataset in {"aishell3", "vctk"} and args.speaker_dict:
+                am_inference = jit.to_static(
+                    am_inference,
+                    input_spec=[
+                        InputSpec([-1], dtype=paddle.int64),  # text
+                        InputSpec([-1], dtype=paddle.int64),  # tone
+                        None,  # duration
+                        InputSpec([-1], dtype=paddle.int64)  # spk_id
+                    ])
+            else:
+                am_inference = jit.to_static(
+                    am_inference,
+                    input_spec=[
+                        InputSpec([-1], dtype=paddle.int64),
+                        InputSpec([-1], dtype=paddle.int64)
+                    ])
             paddle.jit.save(am_inference,
                             os.path.join(args.inference_dir, args.am))
             am_inference = paddle.jit.load(
@@ -242,7 +251,12 @@ def evaluate(args):
                         mel = am_inference(part_phone_ids)
                 elif am_name == 'speedyspeech':
                     part_tone_ids = tone_ids[i]
-                    mel = am_inference(part_phone_ids, part_tone_ids)
+                    if am_dataset in {"aishell3", "vctk"}:
+                        spk_id = paddle.to_tensor(args.spk_id)
+                        mel = am_inference(part_phone_ids, part_tone_ids,
+                                           spk_id)
+                    else:
+                        mel = am_inference(part_phone_ids, part_tone_ids)
                 elif am_name == 'tacotron2':
                     mel = am_inference(part_phone_ids)
                 # vocoder
@@ -269,8 +283,9 @@ def main():
         type=str,
         default='fastspeech2_csmsc',
         choices=[
-            'speedyspeech_csmsc', 'fastspeech2_csmsc', 'fastspeech2_ljspeech',
-            'fastspeech2_aishell3', 'fastspeech2_vctk', 'tacotron2_csmsc'
+            'speedyspeech_csmsc', 'speedyspeech_aishell3', 'fastspeech2_csmsc',
+            'fastspeech2_ljspeech', 'fastspeech2_aishell3', 'fastspeech2_vctk',
+            'tacotron2_csmsc'
         ],
         help='Choose acoustic model type of tts task.')
     parser.add_argument(
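With this change, the static graph exported for multi-speaker speedyspeech takes a speaker id in addition to phone and tone ids; the `None` entry in `input_spec` leaves `duration` at its default, so it is dropped from the exported signature. Below is a minimal sketch of driving the exported model, mirroring the runtime call this diff adds. The `inference` directory prefix and the dummy ids are assumptions for illustration only; real phone/tone ids come from the text frontend, and valid speaker indices come from the `speaker_dict` used at training time.

```python
import os

import paddle

# Hypothetical export location, matching the paddle.jit.save(...) call
# in the diff: os.path.join(args.inference_dir, args.am).
inference_dir = "inference"
am = "speedyspeech_aishell3"
am_inference = paddle.jit.load(os.path.join(inference_dir, am))

# Dummy inputs for illustration; shapes follow the InputSpec entries above.
phone_ids = paddle.to_tensor([5, 12, 33, 7], dtype=paddle.int64)  # text
tone_ids = paddle.to_tensor([0, 1, 2, 0], dtype=paddle.int64)  # tone
spk_id = paddle.to_tensor(0)  # speaker index, as in paddle.to_tensor(args.spk_id)

# Same call shape as the new runtime branch: (text, tone, spk_id),
# since duration was exported as None.
mel = am_inference(phone_ids, tone_ids, spk_id)
print(mel.shape)  # expected (num_frames, n_mels)
```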