提交 6b4d1f80 编写于 作者: H Hui Zhang

add t2s assets

上级 83d93da8
......@@ -241,7 +241,7 @@ fastspeech2_aishell3_ckpt_1.1.0
├── speaker_id_map.txt # speaker id map file when training a multi-speaker fastspeech2
└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2
```
You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained fastspeech2 and parallel wavegan models.
You can use the following scripts to synthesize for `${BIN_DIR}/../../assets/sentences.txt` using pretrained fastspeech2 and parallel wavegan models.
```bash
source path.sh
......@@ -257,7 +257,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
--voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
--voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=exp/default/test_e2e \
--phones_dict=fastspeech2_aishell3_ckpt_1.1.0/phone_id_map.txt \
--speaker_dict=fastspeech2_aishell3_ckpt_1.1.0/speaker_id_map.txt \
......
......@@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_aishell3 \
--voc=pwgan_aishell3 \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
......@@ -22,7 +22,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_aishell3 \
--voc=hifigan_aishell3 \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
......
......@@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--inference_dir=${train_output_path}/pdlite \
--am=fastspeech2_aishell3 \
--voc=pwgan_aishell3 \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/lite_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
......@@ -24,7 +24,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--inference_dir=${train_output_path}/pdlite \
--am=fastspeech2_aishell3 \
--voc=hifigan_aishell3 \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/lite_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
......
......@@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--am=fastspeech2_aishell3 \
--voc=pwgan_aishell3 \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
--text=${BIN_DIR}/../csmsc_test.txt \
--text=${BIN_DIR}/../../assets/csmsc_test.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2 \
......@@ -24,7 +24,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--am=fastspeech2_aishell3 \
--voc=hifigan_aishell3 \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
--text=${BIN_DIR}/../csmsc_test.txt \
--text=${BIN_DIR}/../../assets/csmsc_test.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2 \
......
......@@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
--voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
......@@ -44,7 +44,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--voc_ckpt=hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
......
......@@ -196,7 +196,7 @@ python3 ${BIN_DIR}/synthesize_e2e.py \
--phones_dict=vits_aishell3_ckpt_1.1.0/phone_id_map.txt \
--speaker_dict=vits_aishell3_ckpt_1.1.0/speaker_id_map.txt \
--output_dir=exp/default/test_e2e \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--add-blank=${add_blank}
```
-->
......@@ -20,6 +20,6 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--speaker_dict=dump/speaker_id_map.txt \
--spk_id=0 \
--output_dir=${train_output_path}/test_e2e \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--add-blank=${add_blank}
fi
......@@ -102,7 +102,7 @@ Download the pretrained parallel wavegan model from [pwg_aishell3_ckpt_0.5.zip](
unzip pwg_aishell3_ckpt_0.5.zip
```
You can use the following scripts to synthesize for `${BIN_DIR}/../sentences_canton.txt` using pretrained fastspeech2 and parallel wavegan models.
You can use the following scripts to synthesize for `${BIN_DIR}/../../assets/sentences_canton.txt` using pretrained fastspeech2 and parallel wavegan models.
```bash
source path.sh
......@@ -118,7 +118,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
--voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
--voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
--lang=canton \
--text=${BIN_DIR}/../sentences_canton.txt \
--text=${BIN_DIR}/../../assets/sentences_canton.txt \
--output_dir=exp/default/test_e2e \
--phones_dict=fastspeech2_canton_ckpt_1.4.0/phone_id_map.txt \
--speaker_dict=fastspeech2_canton_ckpt_1.4.0/speaker_id_map.txt \
......
......@@ -12,7 +12,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--am=fastspeech2_canton \
--voc=pwgan_aishell3 \
--spk_id=10 \
--text=${BIN_DIR}/../sentences_canton.txt \
--text=${BIN_DIR}/../../assets/sentences_canton.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
......@@ -27,7 +27,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--am=fastspeech2_canton \
--voc=mb_melgan_csmsc \
--spk_id=10 \
--text=${BIN_DIR}/../sentences_canton.txt \
--text=${BIN_DIR}/../../assets/sentences_canton.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
......@@ -41,7 +41,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--am=fastspeech2_canton \
--voc=hifigan_csmsc \
--spk_id=10 \
--text=${BIN_DIR}/../sentences_canton.txt \
--text=${BIN_DIR}/../../assets/sentences_canton.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
......@@ -55,7 +55,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--am=fastspeech2_canton \
--voc=wavernn_csmsc \
--spk_id=10 \
--text=${BIN_DIR}/../sentences_canton.txt \
--text=${BIN_DIR}/../../assets/sentences_canton.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
......
......@@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--voc=pwgan_aishell3 \
--spk_id=10 \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
--text=${BIN_DIR}/../sentences_canton.txt \
--text=${BIN_DIR}/../../assets/sentences_canton.txt \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
--lang=canton \
......@@ -26,7 +26,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--voc=mb_melgan_csmsc \
--spk_id=10 \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
--text=${BIN_DIR}/../sentences_canton.txt \
--text=${BIN_DIR}/../../assets/sentences_canton.txt \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
--lang=canton \
......@@ -40,7 +40,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--am=fastspeech2_canton \
--voc=hifigan_csmsc \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
--text=${BIN_DIR}/../sentences_canton.txt \
--text=${BIN_DIR}/../../assets/sentences_canton.txt \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
--lang=canton \
......
......@@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
--voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
--lang=canton \
--text=${BIN_DIR}/../sentences_canton.txt \
--text=${BIN_DIR}/../../assets/sentences_canton.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
......@@ -44,7 +44,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--voc_ckpt=hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \
--lang=canton \
--text=${BIN_DIR}/../sentences_canton.txt \
--text=${BIN_DIR}/../../assets/sentences_canton.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
......
......@@ -9,7 +9,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${BIN_DIR}/inference.py \
--inference_dir=${train_output_path}/inference \
--am=jets_csmsc \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt
fi
......@@ -17,6 +17,6 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--phones_dict=dump/phone_id_map.txt \
--output_dir=${train_output_path}/test_e2e \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--inference_dir=${train_output_path}/inference
fi
......@@ -226,7 +226,7 @@ tacotron2_csmsc_ckpt_0.2.0
├── snapshot_iter_30600.pdz # model parameters and optimizer states
└── speech_stats.npy # statistics used to normalize spectrogram when training Tacotron2
```
You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained Tacotron2 and parallel wavegan models.
You can use the following scripts to synthesize for `${BIN_DIR}/../../assets/sentences.txt` using pretrained Tacotron2 and parallel wavegan models.
```bash
source path.sh
......@@ -242,7 +242,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=exp/default/test_e2e \
--inference_dir=exp/default/inference \
--phones_dict=tacotron2_csmsc_ckpt_0.2.0/phone_id_map.txt
......
......@@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--inference_dir=${train_output_path}/inference \
--am=tacotron2_csmsc \
--voc=pwgan_csmsc \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt
fi
......@@ -22,7 +22,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--inference_dir=${train_output_path}/inference \
--am=tacotron2_csmsc \
--voc=mb_melgan_csmsc \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt
fi
......@@ -33,7 +33,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--inference_dir=${train_output_path}/inference \
--am=tacotron2_csmsc \
--voc=hifigan_csmsc \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt
fi
\ No newline at end of file
......@@ -22,7 +22,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--inference_dir=${train_output_path}/inference
......@@ -44,7 +44,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
--voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--inference_dir=${train_output_path}/inference
......@@ -66,7 +66,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt
# --inference_dir=${train_output_path}/inference
......@@ -87,7 +87,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--inference_dir=${train_output_path}/inference
......@@ -108,7 +108,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
--voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
--voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--inference_dir=${train_output_path}/inference
......
......@@ -248,7 +248,7 @@ speedyspeech_csmsc_ckpt_0.2.0
├── snapshot_iter_30600.pdz # model parameters and optimizer states
└── tone_id_map.txt # tone vocabulary file when training speedyspeech
```
You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained speedyspeech and parallel wavegan models.
You can use the following scripts to synthesize for `${BIN_DIR}/../../assets/sentences.txt` using pretrained speedyspeech and parallel wavegan models.
```bash
source path.sh
......@@ -264,7 +264,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=exp/default/test_e2e \
--inference_dir=exp/default/inference \
--phones_dict=speedyspeech_csmsc_ckpt_0.2.0/phone_id_map.txt \
......
......@@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--inference_dir=${train_output_path}/inference \
--am=speedyspeech_csmsc \
--voc=pwgan_csmsc \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt
......@@ -24,7 +24,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--inference_dir=${train_output_path}/inference \
--am=speedyspeech_csmsc \
--voc=mb_melgan_csmsc \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt
......@@ -36,7 +36,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--inference_dir=${train_output_path}/inference \
--am=speedyspeech_csmsc \
--voc=hifigan_csmsc \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt
......
......@@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--inference_dir=${train_output_path}/pdlite \
--am=speedyspeech_csmsc \
--voc=pwgan_csmsc \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/lite_infer_out \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt
......@@ -24,7 +24,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--inference_dir=${train_output_path}/pdlite \
--am=speedyspeech_csmsc \
--voc=mb_melgan_csmsc \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/lite_infer_out \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt
......@@ -36,7 +36,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--inference_dir=${train_output_path}/pdlite \
--am=speedyspeech_csmsc \
--voc=hifigan_csmsc \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/lite_infer_out \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt
......
......@@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--am=speedyspeech_csmsc \
--voc=pwgan_csmsc \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
--text=${BIN_DIR}/../csmsc_test.txt \
--text=${BIN_DIR}/../../assets/csmsc_test.txt \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--device=cpu \
......@@ -23,7 +23,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--am=speedyspeech_csmsc \
--voc=mb_melgan_csmsc \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
--text=${BIN_DIR}/../csmsc_test.txt \
--text=${BIN_DIR}/../../assets/csmsc_test.txt \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--device=cpu \
......@@ -36,7 +36,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--am=speedyspeech_csmsc \
--voc=hifigan_csmsc \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
--text=${BIN_DIR}/../csmsc_test.txt \
--text=${BIN_DIR}/../../assets/csmsc_test.txt \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--device=cpu \
......
......@@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
......@@ -43,7 +43,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
--voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
......@@ -66,7 +66,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt
......@@ -87,7 +87,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
......@@ -109,7 +109,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
--voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
--voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
......
......@@ -258,7 +258,7 @@ fastspeech2_nosil_baker_ckpt_0.4
├── snapshot_iter_76000.pdz # model parameters and optimizer states
└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2
```
You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained fastspeech2 and parallel wavegan models.
You can use the following scripts to synthesize for `${BIN_DIR}/../../assets/sentences.txt` using pretrained fastspeech2 and parallel wavegan models.
If you want to use fastspeech2_conformer, you must delete this line `--inference_dir=exp/default/inference \` to skip the step of dygraph to static graph, cause we haven't tested dygraph to static graph for fastspeech2_conformer till now.
```bash
......@@ -276,7 +276,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=exp/default/test_e2e \
--inference_dir=exp/default/inference \
--phones_dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
......
......@@ -248,7 +248,7 @@ fastspeech2_nosil_baker_ckpt_0.4
├── snapshot_iter_76000.pdz # 模型参数和优化器状态
└── speech_stats.npy # 训练 fastspeech2 时用于规范化频谱图的统计数据
```
您可以使用以下脚本通过使用预训练的 fastspeech2 和 parallel wavegan 模型为 `${BIN_DIR}/../sentences.txt` 合成句子
您可以使用以下脚本通过使用预训练的 fastspeech2 和 parallel wavegan 模型为 `${BIN_DIR}/../../assets/sentences.txt` 合成句子
```bash
source path.sh
......@@ -264,7 +264,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=exp/default/test_e2e \
--inference_dir=exp/default/inference \
--phones_dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
......
......@@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_csmsc \
--voc=pwgan_csmsc \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt
fi
......@@ -23,7 +23,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_csmsc \
--voc=mb_melgan_csmsc \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt
fi
......@@ -34,7 +34,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_csmsc \
--voc=hifigan_csmsc \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt
fi
......@@ -45,7 +45,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_csmsc \
--voc=wavernn_csmsc \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt
fi
\ No newline at end of file
......@@ -12,7 +12,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--am=fastspeech2_csmsc \
--am_stat=dump/train/speech_stats.npy \
--voc=pwgan_csmsc \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out_streaming \
--phones_dict=dump/phone_id_map.txt \
--am_streaming=True
......@@ -26,7 +26,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--am=fastspeech2_csmsc \
--am_stat=dump/train/speech_stats.npy \
--voc=mb_melgan_csmsc \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out_streaming \
--phones_dict=dump/phone_id_map.txt \
--am_streaming=True
......@@ -39,7 +39,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--am=fastspeech2_csmsc \
--am_stat=dump/train/speech_stats.npy \
--voc=hifigan_csmsc \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out_streaming \
--phones_dict=dump/phone_id_map.txt \
--am_streaming=True
......
......@@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--inference_dir=${train_output_path}/pdlite \
--am=fastspeech2_csmsc \
--voc=pwgan_csmsc \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/lite_infer_out \
--phones_dict=dump/phone_id_map.txt
fi
......@@ -23,7 +23,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--inference_dir=${train_output_path}/pdlite \
--am=fastspeech2_csmsc \
--voc=mb_melgan_csmsc \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/lite_infer_out \
--phones_dict=dump/phone_id_map.txt
fi
......@@ -34,7 +34,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--inference_dir=${train_output_path}/pdlite \
--am=fastspeech2_csmsc \
--voc=hifigan_csmsc \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/lite_infer_out \
--phones_dict=dump/phone_id_map.txt
fi
......@@ -12,7 +12,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--am=fastspeech2_csmsc \
--am_stat=dump/train/speech_stats.npy \
--voc=pwgan_csmsc \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/lite_infer_out_streaming \
--phones_dict=dump/phone_id_map.txt \
--am_streaming=True
......@@ -26,7 +26,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--am=fastspeech2_csmsc \
--am_stat=dump/train/speech_stats.npy \
--voc=mb_melgan_csmsc \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/lite_infer_out_streaming \
--phones_dict=dump/phone_id_map.txt \
--am_streaming=True
......@@ -39,7 +39,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--am=fastspeech2_csmsc \
--am_stat=dump/train/speech_stats.npy \
--voc=hifigan_csmsc \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/lite_infer_out_streaming \
--phones_dict=dump/phone_id_map.txt \
--am_streaming=True
......
......@@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--am=fastspeech2_csmsc \
--voc=pwgan_csmsc \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
--text=${BIN_DIR}/../csmsc_test.txt \
--text=${BIN_DIR}/../../assets/csmsc_test.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2
......@@ -22,7 +22,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--am=fastspeech2_csmsc \
--voc=mb_melgan_csmsc \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
--text=${BIN_DIR}/../csmsc_test.txt \
--text=${BIN_DIR}/../../assets/csmsc_test.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2
......@@ -34,7 +34,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--am=fastspeech2_csmsc \
--voc=hifigan_csmsc \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
--text=${BIN_DIR}/../csmsc_test.txt \
--text=${BIN_DIR}/../../assets/csmsc_test.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2
......
......@@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--am_stat=dump/train/speech_stats.npy \
--voc=pwgan_csmsc \
--output_dir=${train_output_path}/onnx_infer_out_streaming \
--text=${BIN_DIR}/../csmsc_test.txt \
--text=${BIN_DIR}/../../assets/csmsc_test.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2 \
......@@ -25,7 +25,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--am_stat=dump/train/speech_stats.npy \
--voc=mb_melgan_csmsc \
--output_dir=${train_output_path}/onnx_infer_out_streaming \
--text=${BIN_DIR}/../csmsc_test.txt \
--text=${BIN_DIR}/../../assets/csmsc_test.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2 \
......@@ -39,7 +39,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--am_stat=dump/train/speech_stats.npy \
--voc=hifigan_csmsc \
--output_dir=${train_output_path}/onnx_infer_out_streaming \
--text=${BIN_DIR}/../csmsc_test.txt \
--text=${BIN_DIR}/../../assets/csmsc_test.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2 \
......
......@@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--inference_dir=${train_output_path}/inference
......@@ -42,7 +42,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
--voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--inference_dir=${train_output_path}/inference
......@@ -64,7 +64,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt
# --inference_dir=${train_output_path}/inference
......@@ -85,7 +85,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--inference_dir=${train_output_path}/inference
......@@ -107,7 +107,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
--voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
--voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--inference_dir=${train_output_path}/inference
......
......@@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e_streaming \
--phones_dict=dump/phone_id_map.txt \
--am_streaming=True \
......@@ -43,7 +43,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
--voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e_streaming \
--phones_dict=dump/phone_id_map.txt \
--am_streaming=True \
......@@ -66,7 +66,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e_streaming \
--phones_dict=dump/phone_id_map.txt \
--am_streaming=True
......@@ -87,7 +87,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e_streaming \
--phones_dict=dump/phone_id_map.txt \
--am_streaming=True \
......
......@@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--inference_dir=${train_output_path}/inference \
......@@ -43,7 +43,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
--voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--inference_dir=${train_output_path}/inference \
......@@ -66,7 +66,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--use_rhy=True
......@@ -88,7 +88,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--inference_dir=${train_output_path}/inference \
......@@ -111,7 +111,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
--voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
--voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--inference_dir=${train_output_path}/inference \
......
......@@ -172,6 +172,6 @@ python3 ${BIN_DIR}/synthesize_e2e.py \
--ckpt=vits_csmsc_ckpt_1.4.0/snapshot_iter_150000.pdz \
--phones_dict=vits_csmsc_ckpt_1.4.0/phone_id_map.txt \
--output_dir=exp/default/test_e2e \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--add-blank=${add_blank}
```
......@@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${BIN_DIR}/inference.py \
--inference_dir=${train_output_path}/inference \
--am=vits_csmsc \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--add-blank=${add_blank}
......
......@@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${BIN_DIR}/lite_predict.py \
--inference_dir=${train_output_path}/pdlite \
--am=vits_csmsc \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/lite_infer_out \
--phones_dict=dump/phone_id_map.txt \
--add-blank=${add_blank}
......
......@@ -18,7 +18,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--phones_dict=dump/phone_id_map.txt \
--output_dir=${train_output_path}/test_e2e \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--add-blank=${add_blank} #\
# --inference_dir=${train_output_path}/inference
fi
......@@ -239,7 +239,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
--voc_ckpt=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \
--lang=en \
--text=${BIN_DIR}/../sentences_en.txt \
--text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=exp/default/test_e2e \
--phones_dict=tacotron2_ljspeech_ckpt_0.2.0/phone_id_map.txt
```
......@@ -16,7 +16,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
--voc_ckpt=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \
--lang=en \
--text=${BIN_DIR}/../sentences_en.txt \
--text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
# --inference_dir=${train_output_path}/inference
\ No newline at end of file
......@@ -191,7 +191,7 @@ python3 ${BIN_DIR}/synthesize_e2e.py \
--transformer-tts-stat=transformer_tts_ljspeech_ckpt_0.4/speech_stats.npy \
--waveflow-config=waveflow_ljspeech_ckpt_0.3/config.yaml \
--waveflow-checkpoint=waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams \
--text=${BIN_DIR}/../sentences_en.txt \
--text=${BIN_DIR}/../../assets/sentences_en.txt \
--output-dir=exp/default/test_e2e \
--phones-dict=transformer_tts_ljspeech_ckpt_0.4/phone_id_map.txt
```
......@@ -12,6 +12,6 @@ python3 ${BIN_DIR}/synthesize_e2e.py \
--transformer-tts-stat=dump/train/speech_stats.npy \
--waveflow-config=waveflow_ljspeech_ckpt_0.3/config.yaml \
--waveflow-checkpoint=waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams \
--text=${BIN_DIR}/../sentences_en.txt \
--text=${BIN_DIR}/../../assets/sentences_en.txt \
--output-dir=${train_output_path}/test_e2e \
--phones-dict=dump/phone_id_map.txt
......@@ -254,7 +254,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
--voc_ckpt=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \
--lang=en \
--text=${BIN_DIR}/../sentences_en.txt \
--text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=exp/default/test_e2e \
--inference_dir=exp/default/inference \
--phones_dict=fastspeech2_nosil_ljspeech_ckpt_0.5/phone_id_map.txt
......
......@@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_ljspeech \
--voc=pwgan_ljspeech \
--text=${BIN_DIR}/../sentences_en.txt \
--text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--lang=en
......@@ -23,7 +23,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_ljspeech \
--voc=hifigan_ljspeech \
--text=${BIN_DIR}/../sentences_en.txt \
--text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--lang=en
......
......@@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--inference_dir=${train_output_path}/pdlite \
--am=fastspeech2_ljspeech \
--voc=pwgan_ljspeech \
--text=${BIN_DIR}/../sentences_en.txt \
--text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=${train_output_path}/lite_infer_out \
--phones_dict=dump/phone_id_map.txt \
--lang=en
......@@ -23,7 +23,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--inference_dir=${train_output_path}/pdlite \
--am=fastspeech2_ljspeech \
--voc=hifigan_ljspeech \
--text=${BIN_DIR}/../sentences_en.txt \
--text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=${train_output_path}/lite_infer_out \
--phones_dict=dump/phone_id_map.txt \
--lang=en
......
......@@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--am=fastspeech2_ljspeech \
--voc=pwgan_ljspeech\
--output_dir=${train_output_path}/onnx_infer_out_e2e \
--text=${BIN_DIR}/../sentences_en.txt \
--text=${BIN_DIR}/../../assets/sentences_en.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2 \
......@@ -24,7 +24,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--am=fastspeech2_ljspeech \
--voc=hifigan_ljspeech \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
--text=${BIN_DIR}/../sentences_en.txt \
--text=${BIN_DIR}/../../assets/sentences_en.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2 \
......
......@@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--voc_ckpt=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \
--lang=en \
--text=${BIN_DIR}/../sentences_en.txt \
--text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=${train_output_path}/test_e2e \
--inference_dir=${train_output_path}/inference \
--phones_dict=dump/phone_id_map.txt
......@@ -41,7 +41,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--voc_ckpt=hifigan_ljspeech_ckpt_0.2.0/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_ljspeech_ckpt_0.2.0/feats_stats.npy \
--lang=en \
--text=${BIN_DIR}/../sentences_en.txt \
--text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=${train_output_path}/test_e2e \
--inference_dir=${train_output_path}/inference \
--phones_dict=dump/phone_id_map.txt
......
......@@ -267,7 +267,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
--voc_ckpt=pwgan_opencpop_ckpt_1.4.0/snapshot_iter_100000.pdz \
--voc_stat=pwgan_opencpop_ckpt_1.4.0/feats_stats.npy \
--lang=sing \
--text=${BIN_DIR}/../sentences_sing.txt \
--text=${BIN_DIR}/../../assets/sentences_sing.txt \
--output_dir=exp/default/test_e2e \
--phones_dict=diffsinger_opencpop_ckpt_1.4.0/phone_id_map.txt \
--pinyin_phone=diffsinger_opencpop_ckpt_1.4.0/pinyin_to_phone.txt \
......
......@@ -271,7 +271,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
--voc_ckpt=pwgan_opencpop_ckpt_1.4.0/snapshot_iter_100000.pdz \
--voc_stat=pwgan_opencpop_ckpt_1.4.0/feats_stats.npy \
--lang=sing \
--text=${BIN_DIR}/../sentences_sing.txt \
--text=${BIN_DIR}/../../assets/sentences_sing.txt \
--output_dir=exp/default/test_e2e \
--phones_dict=diffsinger_opencpop_ckpt_1.4.0/phone_id_map.txt \
--pinyin_phone=diffsinger_opencpop_ckpt_1.4.0/pinyin_to_phone.txt \
......
......@@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--voc_ckpt=pwgan_opencpop_ckpt_1.4.0/snapshot_iter_100000.pdz \
--voc_stat=pwgan_opencpop_ckpt_1.4.0/feats_stats.npy \
--lang=sing \
--text=${BIN_DIR}/../sentences_sing.txt \
--text=${BIN_DIR}/../../assets/sentences_sing.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--speech_stretchs=dump/train/speech_stretchs.npy \
......@@ -44,7 +44,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--voc_ckpt=hifigan_opencpop_ckpt_1.4.0/snapshot_iter_625000.pdz \
--voc_stat=hifigan_opencpop_ckpt_1.4.0/feats_stats.npy \
--lang=sing \
--text=${BIN_DIR}/../sentences_sing.txt \
--text=${BIN_DIR}/../../assets/sentences_sing.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--speech_stretchs=dump/train/speech_stretchs.npy \
......
......@@ -99,7 +99,7 @@ if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
--voc_ckpt=pretrained_models/hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \
--voc_stat=pretrained_models/hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=./test_e2e/ \
--phones_dict=${dump_dir}/phone_id_map.txt \
--speaker_dict=${dump_dir}/speaker_id_map.txt \
......
......@@ -98,7 +98,7 @@ if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
--voc_ckpt=pretrained_models/hifigan_vctk_ckpt_0.2.0/snapshot_iter_2500000.pdz \
--voc_stat=pretrained_models/hifigan_vctk_ckpt_0.2.0/feats_stats.npy \
--lang=en \
--text=${BIN_DIR}/../sentences_en.txt \
--text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=./test_e2e/ \
--phones_dict=${dump_dir}/phone_id_map.txt \
--speaker_dict=${dump_dir}/speaker_id_map.txt \
......
......@@ -100,7 +100,7 @@ if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
--voc_ckpt=pretrained_models/hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \
--voc_stat=pretrained_models/hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \
--lang=mix \
--text=${BIN_DIR}/../sentences_mix.txt \
--text=${BIN_DIR}/../../assets/sentences_mix.txt \
--output_dir=./test_e2e/ \
--phones_dict=${dump_dir}/phone_id_map.txt \
--speaker_dict=${dump_dir}/speaker_id_map.txt \
......
......@@ -254,7 +254,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
--voc_ckpt=pwg_vctk_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=pwg_vctk_ckpt_0.1.1/feats_stats.npy \
--lang=en \
--text=${BIN_DIR}/../sentences_en.txt \
--text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=exp/default/test_e2e \
--phones_dict=fastspeech2_vctk_ckpt_1.2.0/phone_id_map.txt \
--speaker_dict=fastspeech2_vctk_ckpt_1.2.0/speaker_id_map.txt \
......
......@@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_vctk \
--voc=pwgan_vctk \
--text=${BIN_DIR}/../sentences_en.txt \
--text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
......@@ -23,7 +23,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_vctk \
--voc=hifigan_vctk \
--text=${BIN_DIR}/../sentences_en.txt \
--text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
......
......@@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--inference_dir=${train_output_path}/pdlite \
--am=fastspeech2_vctk \
--voc=pwgan_vctk \
--text=${BIN_DIR}/../sentences_en.txt \
--text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=${train_output_path}/lite_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
......@@ -25,7 +25,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--inference_dir=${train_output_path}/pdlite \
--am=fastspeech2_vctk \
--voc=hifigan_vctk \
--text=${BIN_DIR}/../sentences_en.txt \
--text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=${train_output_path}/lite_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
......
......@@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--am=fastspeech2_vctk \
--voc=pwgan_vctk \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
--text=${BIN_DIR}/../sentences_en.txt \
--text=${BIN_DIR}/../../assets/sentences_en.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2 \
......@@ -25,7 +25,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--am=fastspeech2_vctk \
--voc=hifigan_vctk \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
--text=${BIN_DIR}/../sentences_en.txt \
--text=${BIN_DIR}/../../assets/sentences_en.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2 \
......
......@@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--voc_ckpt=pwg_vctk_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=pwg_vctk_ckpt_0.1.1/feats_stats.npy \
--lang=en \
--text=${BIN_DIR}/../sentences_en.txt \
--text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
......@@ -43,7 +43,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--voc_ckpt=hifigan_vctk_ckpt_0.2.0/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_vctk_ckpt_0.2.0/feats_stats.npy \
--lang=en \
--text=${BIN_DIR}/../sentences_en.txt \
--text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
......
......@@ -252,8 +252,10 @@ optional arguments:
## Pretrained Model
Pretrained FastSpeech2 model with no silence in the edge of audios:
- [fastspeech2_mix_ckpt_1.2.0.zip](https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_mix_ckpt_1.2.0.zip)
- [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip)
The static model can be downloaded here:
- [fastspeech2_mix_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_mix_static_0.2.0.zip)
......@@ -285,18 +287,18 @@ FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=fastspeech2_mix \
--am_config=fastspeech2_mix_ckpt_1.2.0/default.yaml \
--am_ckpt=fastspeech2_mix_ckpt_1.2.0/snapshot_iter_99200.pdz \
--am_stat=fastspeech2_mix_ckpt_1.2.0/speech_stats.npy \
--phones_dict=fastspeech2_mix_ckpt_1.2.0/phone_id_map.txt \
--speaker_dict=fastspeech2_mix_ckpt_1.2.0/speaker_id_map.txt \
--am_config=exp/pretrain/fastspeech2_mix_ckpt_1.2.0/default.yaml \
--am_ckpt=exp/pretrain/fastspeech2_mix_ckpt_1.2.0/snapshot_iter_99200.pdz \
--am_stat=exp/pretrain/fastspeech2_mix_ckpt_1.2.0/speech_stats.npy \
--phones_dict=exp/pretrain/fastspeech2_mix_ckpt_1.2.0/phone_id_map.txt \
--speaker_dict=exp/pretrain/fastspeech2_mix_ckpt_1.2.0/speaker_id_map.txt \
--spk_id=174 \
--voc=pwgan_aishell3 \
--voc_config=pwg_aishell3_ckpt_0.5/default.yaml \
--voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
--voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
--voc_config=exp/pretrain/pwg_aishell3_ckpt_0.5/default.yaml \
--voc_ckpt=exp/pretrain/pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
--voc_stat=exp/pretrain/pwg_aishell3_ckpt_0.5/feats_stats.npy \
--lang=mix \
--text=${BIN_DIR}/../sentences_mix.txt \
--text=${BIN_DIR}/../../assets/sentences_mix.txt \
--output_dir=exp/default/test_e2e \
--inference_dir=exp/default/inference
```
......@@ -13,7 +13,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_mix \
--voc=pwgan_aishell3 \
--text=${BIN_DIR}/../sentences_mix.txt \
--text=${BIN_DIR}/../../assets/sentences_mix.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
......@@ -30,7 +30,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_mix \
--voc=hifigan_aishell3 \
--text=${BIN_DIR}/../sentences_mix.txt \
--text=${BIN_DIR}/../../assets/sentences_mix.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
......@@ -45,7 +45,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_mix \
--voc=hifigan_csmsc \
--text=${BIN_DIR}/../sentences_mix.txt \
--text=${BIN_DIR}/../../assets/sentences_mix.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
......
......@@ -8,6 +8,7 @@ mkdir -p $pretrain
pushd $pretrain
wget -c https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_mix_ckpt_1.2.0.zip &
wget -c https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip &
wait
popd
......@@ -13,7 +13,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--am=fastspeech2_mix \
--voc=pwgan_aishell3 \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
--text=${BIN_DIR}/../sentences_mix.txt \
--text=${BIN_DIR}/../../assets/sentences_mix.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=4 \
......@@ -31,7 +31,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--am=fastspeech2_mix \
--voc=hifigan_aishell3 \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
--text=${BIN_DIR}/../sentences_mix.txt \
--text=${BIN_DIR}/../../assets/sentences_mix.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=4 \
......@@ -45,7 +45,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--am=fastspeech2_mix \
--voc=hifigan_csmsc \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
--text=${BIN_DIR}/../sentences_mix.txt \
--text=${BIN_DIR}/../../assets/sentences_mix.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=4 \
......
......@@ -23,7 +23,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
--voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
--lang=mix \
--text=${BIN_DIR}/../sentences_mix.txt \
--text=${BIN_DIR}/../../assets/sentences_mix.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
......@@ -48,7 +48,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--voc_ckpt=hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \
--lang=mix \
--text=${BIN_DIR}/../sentences_mix.txt \
--text=${BIN_DIR}/../../assets/sentences_mix.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
......@@ -73,7 +73,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=mix \
--text=${BIN_DIR}/../sentences_mix.txt \
--text=${BIN_DIR}/../../assets/sentences_mix.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册