diff --git a/examples/zh_en_tts/tts3/README.md b/examples/zh_en_tts/tts3/README.md
index ead5742930cbb23bbd3877d01b4c98f2483859df..1752d24693385994691f515df86f12538f664129 100644
--- a/examples/zh_en_tts/tts3/README.md
+++ b/examples/zh_en_tts/tts3/README.md
@@ -98,9 +98,16 @@ optional arguments:
 
 
 ### Synthesizing
-We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1) as the neural vocoder.
+We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1) as the default neural vocoder.
 Download the pretrained parallel wavegan model from [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip) and unzip it.
 
+When the speaker is `174` (csmsc), csmsc's vocoder gives better results than aishell3's, so we recommend [hifigan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip); see `stage 2` of `synthesize_e2e.sh`.
+
+If the speaker is `175` (ljspeech), we **don't** recommend ljspeech's vocoders: they are trained at a 22.05 kHz sample rate, while this acoustic model is trained at 24 kHz. You can use csmsc's vocoder instead, since ljspeech and csmsc are both female speakers.
+
+For speakers in aishell3 and vctk, we recommend aishell3's or vctk's vocoders: ljspeech and csmsc are both female speakers, so their vocoders may not perform well for the male speakers in aishell3 and vctk. You can look up speaker names and `spk_id`s in `dump/speaker_id_map.txt`, check the speakers' information (age / gender / accent / region, etc.) in [this issue](https://github.com/PaddlePaddle/PaddleSpeech/issues/1620), and choose the `spk_id` you want.
+
+
 ```bash
 unzip pwg_aishell3_ckpt_0.5.zip
 ```
diff --git a/examples/zh_en_tts/tts3/local/inference.sh b/examples/zh_en_tts/tts3/local/inference.sh
index 5d3bd09e0b3982eb7e1df25f72bbfa04704436e4..16499ed0168d2c0aa96d9ecde908f2647cda0521 100755
--- a/examples/zh_en_tts/tts3/local/inference.sh
+++ b/examples/zh_en_tts/tts3/local/inference.sh
@@ -37,3 +37,18 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --lang=mix \
         --spk_id=174
 fi
+
+# voc: hifigan_csmsc
+# when the speaker is 174 (csmsc), csmsc's vocoder works better than aishell3's
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    python3 ${BIN_DIR}/../inference.py \
+        --inference_dir=${train_output_path}/inference \
+        --am=fastspeech2_mix \
+        --voc=hifigan_csmsc \
+        --text=${BIN_DIR}/../sentences_mix.txt \
+        --output_dir=${train_output_path}/pd_infer_out \
+        --phones_dict=dump/phone_id_map.txt \
+        --speaker_dict=dump/speaker_id_map.txt \
+        --lang=mix \
+        --spk_id=174
+fi
diff --git a/examples/zh_en_tts/tts3/local/ort_predict.sh b/examples/zh_en_tts/tts3/local/ort_predict.sh
index 86dcd115408c126da7e1f74106bc67b3357f5a8b..d80da9c91b2e3d4c803ca7f96b50accc0726d487 100755
--- a/examples/zh_en_tts/tts3/local/ort_predict.sh
+++ b/examples/zh_en_tts/tts3/local/ort_predict.sh
@@ -18,9 +18,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         --device=cpu \
         --cpu_threads=4 \
         --lang=mix \
-        --spk_id=174
-
-
+        --spk_id=174
 fi
 
 
@@ -38,6 +36,19 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --device=cpu \
         --cpu_threads=4 \
         --lang=mix \
-        --spk_id=174
-
+        --spk_id=174
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    python3 ${BIN_DIR}/../ort_predict_e2e.py \
+        --inference_dir=${train_output_path}/inference_onnx \
+        --am=fastspeech2_mix \
+        --voc=hifigan_csmsc \
+        --output_dir=${train_output_path}/onnx_infer_out_e2e \
+        --text=${BIN_DIR}/../sentences_mix.txt \
+        --phones_dict=dump/phone_id_map.txt \
+        --device=cpu \
+        --cpu_threads=4 \
+        --lang=mix \
+        --spk_id=174
 fi
diff --git a/examples/zh_en_tts/tts3/local/synthesize.sh b/examples/zh_en_tts/tts3/local/synthesize.sh
index f3a0bf1504e0c011cc7e0d4b33406b6c6e010a0c..5bb947466ea9a88fc90187099f0882d7f56e8bd0 100755
--- a/examples/zh_en_tts/tts3/local/synthesize.sh
+++ b/examples/zh_en_tts/tts3/local/synthesize.sh
@@ -20,7 +20,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         --voc_config=pwg_aishell3_ckpt_0.5/default.yaml \
         --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
         --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
-        --test_metadata=dump/test/norm/metadata2.jsonl \
+        --test_metadata=dump/test/norm/metadata.jsonl \
         --output_dir=${train_output_path}/test \
         --phones_dict=dump/phone_id_map.txt \
         --speaker_dict=dump/speaker_id_map.txt
@@ -45,6 +45,3 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --phones_dict=dump/phone_id_map.txt \
         --speaker_dict=dump/speaker_id_map.txt
 fi
-
-
-
diff --git a/examples/zh_en_tts/tts3/local/synthesize_e2e.sh b/examples/zh_en_tts/tts3/local/synthesize_e2e.sh
index ae14e3cc8c999120cfabeaf8b6ec153a84402bec..f6ee04aefecb728f1ffb13a33af3f07c49bf4862 100755
--- a/examples/zh_en_tts/tts3/local/synthesize_e2e.sh
+++ b/examples/zh_en_tts/tts3/local/synthesize_e2e.sh
@@ -54,4 +54,29 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --speaker_dict=dump/speaker_id_map.txt \
         --spk_id=174 \
         --inference_dir=${train_output_path}/inference
- fi
+fi
+
+
+# voc: hifigan_csmsc
+# when the speaker is 174 (csmsc), csmsc's vocoder works better than aishell3's
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    echo "in csmsc's hifigan syn_e2e"
+    FLAGS_allocator_strategy=naive_best_fit \
+    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+    python3 ${BIN_DIR}/../synthesize_e2e.py \
+        --am=fastspeech2_mix \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/speech_stats.npy \
+        --voc=hifigan_csmsc \
+        --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
+        --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
+        --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
+        --lang=mix \
+        --text=${BIN_DIR}/../sentences_mix.txt \
+        --output_dir=${train_output_path}/test_e2e \
+        --phones_dict=dump/phone_id_map.txt \
+        --speaker_dict=dump/speaker_id_map.txt \
+        --spk_id=174 \
+        --inference_dir=${train_output_path}/inference
+fi
\ No newline at end of file
diff --git a/examples/zh_en_tts/tts3/run.sh b/examples/zh_en_tts/tts3/run.sh
index 221ed7ee2883b81478860faf5fd58391ad961971..204042b12643a835d6dcb4c0d61d148bf6de3b23 100755
--- a/examples/zh_en_tts/tts3/run.sh
+++ b/examples/zh_en_tts/tts3/run.sh
@@ -7,7 +7,7 @@
 gpus=0,1
 stage=0
 stop_stage=100
-datasets_root_dir=./datasets/
+datasets_root_dir=~/datasets
 mfa_root_dir=./mfa_results/
 conf_path=conf/default.yaml
 train_output_path=exp/default
@@ -53,11 +53,11 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
     ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx fastspeech2_mix
     # considering the balance between speed and quality, we recommend that you use hifigan as vocoder
     ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx pwgan_aishell3
-    #./local/paddle2onnx.sh ${train_output_path} inference inference_onnx hifigan_aishell3
-
+    # ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx hifigan_aishell3
+    # ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx hifigan_csmsc
 fi
 
-# inference with onnxruntime, use fastspeech2 + hifigan by default
+# inference with onnxruntime, use fastspeech2 + pwgan by default
 if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
     ./local/ort_predict.sh ${train_output_path}
 fi
diff --git a/paddlespeech/t2s/exps/fastspeech2/preprocess.py b/paddlespeech/t2s/exps/fastspeech2/preprocess.py
index 6e926d6e1f0990f694740a0d7903381ad3653421..f4acdc60b810890d7679b7a1885529f9a2b372e0 100644
--- a/paddlespeech/t2s/exps/fastspeech2/preprocess.py
+++ b/paddlespeech/t2s/exps/fastspeech2/preprocess.py
@@ -180,7 +180,6 @@ def process_sentences(config,
                 results.append(record)
 
     results.sort(key=itemgetter("utt_id"))
-                        write_metadata_method)
     with jsonlines.open(output_dir / "metadata.jsonl",
                         write_metadata_method) as writer:
         for item in results:
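
As a companion to the README hunk above, a minimal sketch of fetching the csmsc hifigan checkpoint that the new `stage 2` blocks use; the URL is the one given in the README change, and it is assumed the archive unpacks to `hifigan_csmsc_ckpt_0.1.1/`, the directory referenced by `--voc_config`, `--voc_ckpt` and `--voc_stat` in `synthesize_e2e.sh`:

```bash
# Assumption: the zip unpacks to hifigan_csmsc_ckpt_0.1.1/, matching the
# --voc_config / --voc_ckpt / --voc_stat paths in the new stage 2 blocks.
wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip
unzip hifigan_csmsc_ckpt_0.1.1.zip
```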