From f9f014d159e28efa788f4d241794420716d369ad Mon Sep 17 00:00:00 2001 From: TianYuan Date: Wed, 25 May 2022 10:39:28 +0000 Subject: [PATCH] add VITS readme, test=tts --- examples/aishell3/tts3/README.md | 30 ++-- examples/aishell3/voc1/README.md | 2 +- examples/aishell3/voc5/README.md | 21 +-- examples/csmsc/tts0/README.md | 30 ++-- examples/csmsc/tts2/README.md | 30 ++-- examples/csmsc/tts3/README.md | 31 ++-- examples/csmsc/tts3/README_cn.md | 30 ++-- examples/csmsc/vits/README.md | 146 ++++++++++++++++++ examples/csmsc/voc1/README.md | 2 +- examples/csmsc/voc3/README.md | 2 +- examples/csmsc/voc4/README.md | 2 +- examples/csmsc/voc5/README.md | 2 +- examples/csmsc/voc6/README.md | 2 +- examples/ljspeech/tts0/README.md | 30 ++-- examples/ljspeech/tts1/README.md | 2 +- examples/ljspeech/tts3/README.md | 30 ++-- examples/ljspeech/voc1/README.md | 2 +- examples/ljspeech/voc5/README.md | 21 +-- examples/vctk/tts3/README.md | 30 ++-- examples/vctk/voc1/README.md | 2 +- examples/vctk/voc5/README.md | 21 +-- .../t2s/exps/gan_vocoder/hifigan/train.py | 3 +- .../gan_vocoder/multi_band_melgan/train.py | 2 +- .../gan_vocoder/parallelwave_gan/train.py | 2 +- .../exps/gan_vocoder/style_melgan/train.py | 3 +- .../t2s/exps/transformer_tts/train.py | 2 +- paddlespeech/t2s/exps/vits/train.py | 5 +- paddlespeech/t2s/exps/wavernn/train.py | 3 +- 28 files changed, 285 insertions(+), 203 deletions(-) diff --git a/examples/aishell3/tts3/README.md b/examples/aishell3/tts3/README.md index d02ad1b6..93ce62c9 100644 --- a/examples/aishell3/tts3/README.md +++ b/examples/aishell3/tts3/README.md @@ -120,12 +120,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p ``` ```text usage: synthesize.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--ngpu NGPU] [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] @@ -134,11 +134,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -150,10 +149,10 @@ optional arguments: speaker id map file. --voice-cloning VOICE_CLONING whether training voice cloning model. 
- --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -169,12 +168,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp ``` ```text usage: synthesize_e2e.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--lang LANG] [--inference_dir INFERENCE_DIR] [--ngpu NGPU] @@ -184,11 +183,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -199,10 +197,10 @@ optional arguments: --speaker_dict SPEAKER_DICT speaker id map file. --spk_id SPK_ID spk id for multi speaker acoustic model - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -215,9 +213,9 @@ optional arguments: output dir. ``` 1. `--am` is acoustic model type with the format {model_name}_{dataset} -2. `--am_config`, `--am_checkpoint`, `--am_stat`, `--phones_dict` `--speaker_dict` are arguments for acoustic model, which correspond to the 5 files in the fastspeech2 pretrained model. +2. `--am_config`, `--am_ckpt`, `--am_stat`, `--phones_dict` `--speaker_dict` are arguments for acoustic model, which correspond to the 5 files in the fastspeech2 pretrained model. 3. `--voc` is vocoder type with the format {model_name}_{dataset} -4. 
`--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. +4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. 5. `--lang` is the model language, which can be `zh` or `en`. 6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. 7. `--text` is the text file, which contains sentences to synthesize. diff --git a/examples/aishell3/voc1/README.md b/examples/aishell3/voc1/README.md index eb30e7c4..503f8a19 100644 --- a/examples/aishell3/voc1/README.md +++ b/examples/aishell3/voc1/README.md @@ -75,7 +75,7 @@ Train a ParallelWaveGAN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG ParallelWaveGAN config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA diff --git a/examples/aishell3/voc5/README.md b/examples/aishell3/voc5/README.md index c957c4a3..f8f28f40 100644 --- a/examples/aishell3/voc5/README.md +++ b/examples/aishell3/voc5/README.md @@ -67,15 +67,13 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER] - [--run-benchmark RUN_BENCHMARK] - [--profiler_options PROFILER_OPTIONS] + [--ngpu NGPU] -Train a ParallelWaveGAN model. +Train a HiFiGAN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG HiFiGAN config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA @@ -83,19 +81,6 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu == 0, use cpu. - -benchmark: - arguments related to benchmark. - - --batch-size BATCH_SIZE - batch size. - --max-iter MAX_ITER train max steps. - --run-benchmark RUN_BENCHMARK - runing benchmark or not, if True, use the --batch-size - and --max-iter. - --profiler_options PROFILER_OPTIONS - The option of profiler, which should be in format - "key1=value1;key2=value2;key3=value3". ``` 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. 
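+
+As a concrete illustration, a direct invocation with the flags above might look like the sketch below; `${BIN_DIR}` and the paths follow the conventions used elsewhere in these examples and are placeholders to adjust to your setup.
+```bash
+# a minimal sketch; assumes preprocessing has already produced dump/
+python3 ${BIN_DIR}/train.py \
+    --config=conf/default.yaml \
+    --train-metadata=dump/train/norm/metadata.jsonl \
+    --dev-metadata=dump/dev/norm/metadata.jsonl \
+    --output-dir=exp/default \
+    --ngpu=1
+```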
diff --git a/examples/csmsc/tts0/README.md b/examples/csmsc/tts0/README.md index 01376bd6..a337c7d4 100644 --- a/examples/csmsc/tts0/README.md +++ b/examples/csmsc/tts0/README.md @@ -103,12 +103,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p ``` ```text usage: synthesize.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}] + [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--ngpu NGPU] [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] @@ -117,11 +117,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc} + --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -133,10 +132,10 @@ optional arguments: speaker id map file. --voice-cloning VOICE_CLONING whether training voice cloning model. - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. 
@@ -152,12 +151,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp ``` ```text usage: synthesize_e2e.py [-h] - [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}] + [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--lang LANG] [--inference_dir INFERENCE_DIR] [--ngpu NGPU] @@ -167,11 +166,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc} + --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -182,10 +180,10 @@ optional arguments: --speaker_dict SPEAKER_DICT speaker id map file. --spk_id SPK_ID spk id for multi speaker acoustic model - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -198,9 +196,9 @@ optional arguments: output dir. ``` 1. `--am` is acoustic model type with the format {model_name}_{dataset} -2. `--am_config`, `--am_checkpoint`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the Tacotron2 pretrained model. +2. `--am_config`, `--am_ckpt`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the Tacotron2 pretrained model. 3. `--voc` is vocoder type with the format {model_name}_{dataset} -4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. +4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. 5. `--lang` is the model language, which can be `zh` or `en`. 6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. 7. 
`--text` is the text file, which contains sentences to synthesize. diff --git a/examples/csmsc/tts2/README.md b/examples/csmsc/tts2/README.md index 081d8584..553a370c 100644 --- a/examples/csmsc/tts2/README.md +++ b/examples/csmsc/tts2/README.md @@ -109,12 +109,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p ``` ```text usage: synthesize.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--ngpu NGPU] [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] @@ -123,11 +123,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -139,10 +138,10 @@ optional arguments: speaker id map file. --voice-cloning VOICE_CLONING whether training voice cloning model. - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. 
@@ -158,12 +157,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp ``` ```text usage: synthesize_e2e.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--lang LANG] [--inference_dir INFERENCE_DIR] [--ngpu NGPU] @@ -173,11 +172,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -188,10 +186,10 @@ optional arguments: --speaker_dict SPEAKER_DICT speaker id map file. --spk_id SPK_ID spk id for multi speaker acoustic model - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -204,9 +202,9 @@ optional arguments: output dir. ``` 1. `--am` is acoustic model type with the format {model_name}_{dataset} -2. `--am_config`, `--am_checkpoint`, `--am_stat`, `--phones_dict` and `--tones_dict` are arguments for acoustic model, which correspond to the 5 files in the speedyspeech pretrained model. +2. `--am_config`, `--am_ckpt`, `--am_stat`, `--phones_dict` and `--tones_dict` are arguments for acoustic model, which correspond to the 5 files in the speedyspeech pretrained model. 3. `--voc` is vocoder type with the format {model_name}_{dataset} -4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. +4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. 5. `--lang` is the model language, which can be `zh` or `en`. 6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. 7. `--text` is the text file, which contains sentences to synthesize. 
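+
+For reference, the `--text` input is a plain file with one `utt_id sentence` pair per line, as the help message above states; the ids and sentences below are illustrative placeholders.
+```bash
+# create a tiny input file for --text; contents are illustrative only
+cat > sentences.txt <<'EOF'
+001 欢迎使用语音合成系统。
+002 今天天气真不错。
+EOF
+```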
diff --git a/examples/csmsc/tts3/README.md b/examples/csmsc/tts3/README.md index c734199b..be18de7d 100644 --- a/examples/csmsc/tts3/README.md +++ b/examples/csmsc/tts3/README.md @@ -111,12 +111,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p ``` ```text usage: synthesize.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--ngpu NGPU] [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] @@ -125,11 +125,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -141,10 +140,10 @@ optional arguments: speaker id map file. --voice-cloning VOICE_CLONING whether training voice cloning model. - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. 
@@ -160,12 +159,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp ``` ```text usage: synthesize_e2e.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--lang LANG] [--inference_dir INFERENCE_DIR] [--ngpu NGPU] @@ -175,11 +174,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -190,10 +188,10 @@ optional arguments: --speaker_dict SPEAKER_DICT speaker id map file. --spk_id SPK_ID spk id for multi speaker acoustic model - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -204,11 +202,12 @@ optional arguments: --text TEXT text to synthesize, a 'utt_id sentence' pair per line. --output_dir OUTPUT_DIR output dir. + ``` 1. `--am` is acoustic model type with the format {model_name}_{dataset} -2. `--am_config`, `--am_checkpoint`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the fastspeech2 pretrained model. +2. `--am_config`, `--am_ckpt`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the fastspeech2 pretrained model. 3. `--voc` is vocoder type with the format {model_name}_{dataset} -4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. +4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. 5. `--lang` is the model language, which can be `zh` or `en`. 6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. 7. `--text` is the text file, which contains sentences to synthesize. 
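+
+Tying the flags together, here is a sketch of the call that `./local/synthesize_e2e.sh` dispatches; every file path below is a placeholder for the corresponding file shipped with the pretrained models.
+```bash
+# a sketch only; substitute your actual config/checkpoint/stat paths
+python3 synthesize_e2e.py \
+    --am=fastspeech2_csmsc \
+    --am_config=path/to/am/default.yaml \
+    --am_ckpt=path/to/am/snapshot_iter_xxx.pdz \
+    --am_stat=path/to/am/speech_stats.npy \
+    --phones_dict=path/to/am/phone_id_map.txt \
+    --voc=hifigan_csmsc \
+    --voc_config=path/to/voc/default.yaml \
+    --voc_ckpt=path/to/voc/snapshot_iter_xxx.pdz \
+    --voc_stat=path/to/voc/feats_stats.npy \
+    --lang=zh \
+    --text=sentences.txt \
+    --output_dir=output \
+    --ngpu=1
+```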
diff --git a/examples/csmsc/tts3/README_cn.md b/examples/csmsc/tts3/README_cn.md index 25931ecb..a8861513 100644 --- a/examples/csmsc/tts3/README_cn.md +++ b/examples/csmsc/tts3/README_cn.md @@ -117,12 +117,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p ``` ```text usage: synthesize.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--ngpu NGPU] [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] @@ -131,11 +131,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -147,10 +146,10 @@ optional arguments: speaker id map file. --voice-cloning VOICE_CLONING whether training voice cloning model. - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. 
@@ -167,12 +166,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp ``` ```text usage: synthesize_e2e.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--lang LANG] [--inference_dir INFERENCE_DIR] [--ngpu NGPU] @@ -182,11 +181,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -197,10 +195,10 @@ optional arguments: --speaker_dict SPEAKER_DICT speaker id map file. --spk_id SPK_ID spk id for multi speaker acoustic model - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -213,9 +211,9 @@ optional arguments: output dir. ``` 1. `--am` 声学模型格式是否符合 {model_name}_{dataset} -2. `--am_config`, `--am_checkpoint`, `--am_stat` 和 `--phones_dict` 是声学模型的参数,对应于 fastspeech2 预训练模型中的 4 个文件。 +2. `--am_config`, `--am_ckpt`, `--am_stat` 和 `--phones_dict` 是声学模型的参数,对应于 fastspeech2 预训练模型中的 4 个文件。 3. `--voc` 声码器(vocoder)格式是否符合 {model_name}_{dataset} -4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` 是声码器的参数,对应于 parallel wavegan 预训练模型中的 3 个文件。 +4. `--voc_config`, `--voc_ckpt`, `--voc_stat` 是声码器的参数,对应于 parallel wavegan 预训练模型中的 3 个文件。 5. `--lang` 对应模型的语言可以是 `zh` 或 `en` 。 6. `--test_metadata` 应为 `dump` 文件夹中 `test` 下的规范化元数据文件。 7. `--text` 是文本文件,其中包含要合成的句子。 diff --git a/examples/csmsc/vits/README.md b/examples/csmsc/vits/README.md index e69de29b..0c16840a 100644 --- a/examples/csmsc/vits/README.md +++ b/examples/csmsc/vits/README.md @@ -0,0 +1,146 @@ +# VITS with CSMSC +This example contains code used to train a [VITS](https://arxiv.org/abs/2106.06103) model with [Chinese Standard Mandarin Speech Corpus](https://www.data-baker.com/open_source.html). + +## Dataset +### Download and Extract +Download CSMSC from its [Official Website](https://test.data-baker.com/data/index/source).
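+Assuming the archive downloads as `BZNSYP.rar` (the actual name and format may differ), extraction to the dataset path assumed below might look like:
+```bash
+# illustrative only; adjust the archive name and target directory
+mkdir -p ~/datasets/BZNSYP
+unrar x BZNSYP.rar ~/datasets/BZNSYP/
+```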
+ +### Get MFA Result and Extract +We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get phonemes for VITS; the durations from MFA are not needed here. +You can download it from [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) in our repo. + +## Get Started +Assume the path to the dataset is `~/datasets/BZNSYP`. +Assume the path to the MFA result of CSMSC is `./baker_alignment_tone`. +Run the command below to +1. **source path**. +2. preprocess the dataset. +3. train the model. +4. synthesize wavs. + - synthesize waveform from `metadata.jsonl`. + - synthesize waveform from a text file. + +```bash +./run.sh +``` +You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage; for example, running the following command will only preprocess the dataset. +```bash +./run.sh --stage 0 --stop-stage 0 +``` +### Data Preprocessing +```bash +./local/preprocess.sh ${conf_path} +``` +When it is done, a `dump` folder is created in the current directory. The structure of the dump folder is listed below. + +```text +dump +├── dev +│   ├── norm +│   └── raw +├── phone_id_map.txt +├── speaker_id_map.txt +├── test +│   ├── norm +│   └── raw +└── train + ├── feats_stats.npy + ├── norm + └── raw +``` +The dataset is split into 3 parts, namely `train`, `dev`, and `test`, each of which contains a `norm` and `raw` subfolder. The raw folder contains the wave and linear spectrogram of each utterance, while the norm folder contains normalized ones. The statistics used to normalize features are computed from the training set, which is located in `dump/train/feats_stats.npy`. + +Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file that contains phones, text_lengths, feats, feats_lengths, the path of linear spectrogram features, the path of raw waves, the speaker, and the id of each utterance. + +### Model Training +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} +``` +`./local/train.sh` calls `${BIN_DIR}/train.py`. +Here's the complete help message. +```text +usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] + [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] + [--ngpu NGPU] [--phones-dict PHONES_DICT] + +Train a VITS model. + +optional arguments: + -h, --help show this help message and exit + --config CONFIG config file to overwrite default config. + --train-metadata TRAIN_METADATA + training data. + --dev-metadata DEV_METADATA + dev data. + --output-dir OUTPUT_DIR + output dir. + --ngpu NGPU if ngpu == 0, use cpu. + --phones-dict PHONES_DICT + phone vocabulary file. +``` +1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. +2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. +3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory. +4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. +5. `--phones-dict` is the path of the phone vocabulary file. + +### Synthesizing + +`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
+ +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} +``` +```text +usage: synthesize.py [-h] [--config CONFIG] [--ckpt CKPT] + [--phones_dict PHONES_DICT] [--ngpu NGPU] + [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] + +Synthesize with VITS + +optional arguments: + -h, --help show this help message and exit + --config CONFIG Config of VITS. + --ckpt CKPT Checkpoint file of VITS. + --phones_dict PHONES_DICT + phone vocabulary file. + --ngpu NGPU if ngpu == 0, use cpu. + --test_metadata TEST_METADATA + test metadata. + --output_dir OUTPUT_DIR + output dir. +``` +`./local/synthesize_e2e.sh` calls `${BIN_DIR}/synthesize_e2e.py`, which can synthesize waveform from a text file. +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} +``` +```text +usage: synthesize_e2e.py [-h] [--config CONFIG] [--ckpt CKPT] + [--phones_dict PHONES_DICT] [--lang LANG] + [--inference_dir INFERENCE_DIR] [--ngpu NGPU] + [--text TEXT] [--output_dir OUTPUT_DIR] + +Synthesize with VITS + +optional arguments: + -h, --help show this help message and exit + --config CONFIG Config of VITS. + --ckpt CKPT Checkpoint file of VITS. + --phones_dict PHONES_DICT + phone vocabulary file. + --lang LANG Choose model language. zh or en + --inference_dir INFERENCE_DIR + dir to save inference models + --ngpu NGPU if ngpu == 0, use cpu. + --text TEXT text to synthesize, a 'utt_id sentence' pair per line. + --output_dir OUTPUT_DIR + output dir. +``` +1. `--config`, `--ckpt`, and `--phones_dict` are arguments for the acoustic model, which correspond to the 3 files in the VITS pretrained model. +2. `--lang` is the model language, which can be `zh` or `en`. +3. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. +4. `--text` is the text file, which contains sentences to synthesize. +5. `--output_dir` is the directory to save synthesized audio files. +6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. + +## Pretrained Model diff --git a/examples/csmsc/voc1/README.md b/examples/csmsc/voc1/README.md index 77da5b18..d19fe849 100644 --- a/examples/csmsc/voc1/README.md +++ b/examples/csmsc/voc1/README.md @@ -65,7 +65,7 @@ Train a ParallelWaveGAN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG ParallelWaveGAN config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA diff --git a/examples/csmsc/voc3/README.md b/examples/csmsc/voc3/README.md index 12adaf7f..eb771036 100644 --- a/examples/csmsc/voc3/README.md +++ b/examples/csmsc/voc3/README.md @@ -63,7 +63,7 @@ Train a Multi-Band MelGAN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG Multi-Band MelGAN config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA diff --git a/examples/csmsc/voc4/README.md b/examples/csmsc/voc4/README.md index b7add3e5..d9e86a88 100644 --- a/examples/csmsc/voc4/README.md +++ b/examples/csmsc/voc4/README.md @@ -63,7 +63,7 @@ Train a Style MelGAN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG Style MelGAN config file. --train-metadata TRAIN_METADATA training data.
--dev-metadata DEV_METADATA diff --git a/examples/csmsc/voc5/README.md b/examples/csmsc/voc5/README.md index 94f93b48..e044a0c7 100644 --- a/examples/csmsc/voc5/README.md +++ b/examples/csmsc/voc5/README.md @@ -63,7 +63,7 @@ Train a HiFiGAN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG HiFiGAN config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA diff --git a/examples/csmsc/voc6/README.md b/examples/csmsc/voc6/README.md index 7dcf133b..f1a5ec3b 100644 --- a/examples/csmsc/voc6/README.md +++ b/examples/csmsc/voc6/README.md @@ -63,7 +63,7 @@ Train a WaveRNN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG WaveRNN config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA diff --git a/examples/ljspeech/tts0/README.md b/examples/ljspeech/tts0/README.md index ba7ad619..581f7930 100644 --- a/examples/ljspeech/tts0/README.md +++ b/examples/ljspeech/tts0/README.md @@ -103,12 +103,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p ``` ```text usage: synthesize.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}] + [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--ngpu NGPU] [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] @@ -117,11 +117,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc} + --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -133,10 +132,10 @@ optional arguments: speaker id map file. --voice-cloning VOICE_CLONING whether training voice cloning model. - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. 
@@ -152,12 +151,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp ``` ```text usage: synthesize_e2e.py [-h] - [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}] + [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--lang LANG] [--inference_dir INFERENCE_DIR] [--ngpu NGPU] @@ -167,11 +166,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc} + --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -182,10 +180,10 @@ optional arguments: --speaker_dict SPEAKER_DICT speaker id map file. --spk_id SPK_ID spk id for multi speaker acoustic model - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -198,9 +196,9 @@ optional arguments: output dir. ``` 1. `--am` is acoustic model type with the format {model_name}_{dataset} -2. `--am_config`, `--am_checkpoint`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the Tacotron2 pretrained model. +2. `--am_config`, `--am_ckpt`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the Tacotron2 pretrained model. 3. `--voc` is vocoder type with the format {model_name}_{dataset} -4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. +4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. 5. `--lang` is the model language, which can be `zh` or `en`. 6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. 7. 
`--text` is the text file, which contains sentences to synthesize. diff --git a/examples/ljspeech/tts1/README.md b/examples/ljspeech/tts1/README.md index 7f32522a..f85991cb 100644 --- a/examples/ljspeech/tts1/README.md +++ b/examples/ljspeech/tts1/README.md @@ -61,7 +61,7 @@ Train a TransformerTTS model with LJSpeech TTS dataset. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG TransformerTTS config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA diff --git a/examples/ljspeech/tts3/README.md b/examples/ljspeech/tts3/README.md index e028fa05..a6724083 100644 --- a/examples/ljspeech/tts3/README.md +++ b/examples/ljspeech/tts3/README.md @@ -109,12 +109,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p ``` ```text usage: synthesize.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--ngpu NGPU] [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] @@ -123,11 +123,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -139,10 +138,10 @@ optional arguments: speaker id map file. --voice-cloning VOICE_CLONING whether training voice cloning model. - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc.
@@ -158,12 +157,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp ``` ```text usage: synthesize_e2e.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--lang LANG] [--inference_dir INFERENCE_DIR] [--ngpu NGPU] @@ -173,11 +172,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -188,10 +186,10 @@ optional arguments: --speaker_dict SPEAKER_DICT speaker id map file. --spk_id SPK_ID spk id for multi speaker acoustic model - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -204,9 +202,9 @@ optional arguments: output dir. ``` 1. `--am` is acoustic model type with the format {model_name}_{dataset} -2. `--am_config`, `--am_checkpoint`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the fastspeech2 pretrained model. +2. `--am_config`, `--am_ckpt`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the fastspeech2 pretrained model. 3. `--voc` is vocoder type with the format {model_name}_{dataset} -4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. +4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. 5. `--lang` is the model language, which can be `zh` or `en`. 6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. 7. `--text` is the text file, which contains sentences to synthesize. 
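+
+For this English model (`--lang=en`), the `--text` file uses the same `utt_id sentence` layout; the contents below are illustrative placeholders.
+```bash
+# create a tiny English input file for --text; contents are illustrative
+cat > sentences_en.txt <<'EOF'
+001 The quick brown fox jumps over the lazy dog.
+002 Speech synthesis turns text into audio.
+EOF
+```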
diff --git a/examples/ljspeech/voc1/README.md b/examples/ljspeech/voc1/README.md index 4513b2a0..6fd6cbe2 100644 --- a/examples/ljspeech/voc1/README.md +++ b/examples/ljspeech/voc1/README.md @@ -65,7 +65,7 @@ Train a ParallelWaveGAN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG ParallelWaveGAN config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA diff --git a/examples/ljspeech/voc5/README.md b/examples/ljspeech/voc5/README.md index 9b31e265..afc1bb8b 100644 --- a/examples/ljspeech/voc5/README.md +++ b/examples/ljspeech/voc5/README.md @@ -57,15 +57,13 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER] - [--run-benchmark RUN_BENCHMARK] - [--profiler_options PROFILER_OPTIONS] + [--ngpu NGPU] -Train a ParallelWaveGAN model. +Train a HiFiGAN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG HiFiGAN config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA @@ -73,19 +71,6 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu == 0, use cpu. - -benchmark: - arguments related to benchmark. - - --batch-size BATCH_SIZE - batch size. - --max-iter MAX_ITER train max steps. - --run-benchmark RUN_BENCHMARK - runing benchmark or not, if True, use the --batch-size - and --max-iter. - --profiler_options PROFILER_OPTIONS - The option of profiler, which should be in format - "key1=value1;key2=value2;key3=value3". ``` 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. diff --git a/examples/vctk/tts3/README.md b/examples/vctk/tts3/README.md index f373ca6a..379f5c0f 100644 --- a/examples/vctk/tts3/README.md +++ b/examples/vctk/tts3/README.md @@ -112,12 +112,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p ``` ```text usage: synthesize.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--ngpu NGPU] [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] @@ -126,11 +126,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3} Choose acoustic model type of tts task. 
--am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -142,10 +141,10 @@ optional arguments: speaker id map file. --voice-cloning VOICE_CLONING whether training voice cloning model. - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -161,12 +160,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp ``` ```text usage: synthesize_e2e.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--lang LANG] [--inference_dir INFERENCE_DIR] [--ngpu NGPU] @@ -176,11 +175,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -191,10 +189,10 @@ optional arguments: --speaker_dict SPEAKER_DICT speaker id map file. --spk_id SPK_ID spk id for multi speaker acoustic model - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -207,9 +205,9 @@ optional arguments: output dir. ``` 1. `--am` is acoustic model type with the format {model_name}_{dataset} -2. 
`--am_config`, `--am_checkpoint`, `--am_stat`, `--phones_dict` `--speaker_dict` are arguments for acoustic model, which correspond to the 5 files in the fastspeech2 pretrained model. +2. `--am_config`, `--am_ckpt`, `--am_stat`, `--phones_dict` `--speaker_dict` are arguments for acoustic model, which correspond to the 5 files in the fastspeech2 pretrained model. 3. `--voc` is vocoder type with the format {model_name}_{dataset} -4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. +4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. 5. `--lang` is the model language, which can be `zh` or `en`. 6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. 7. `--text` is the text file, which contains sentences to synthesize. diff --git a/examples/vctk/voc1/README.md b/examples/vctk/voc1/README.md index 1c3016f8..c4c40d1d 100644 --- a/examples/vctk/voc1/README.md +++ b/examples/vctk/voc1/README.md @@ -70,7 +70,7 @@ Train a ParallelWaveGAN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG ParallelWaveGAN config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA diff --git a/examples/vctk/voc5/README.md b/examples/vctk/voc5/README.md index 4eb25c02..c53d4632 100644 --- a/examples/vctk/voc5/README.md +++ b/examples/vctk/voc5/README.md @@ -62,15 +62,13 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER] - [--run-benchmark RUN_BENCHMARK] - [--profiler_options PROFILER_OPTIONS] + [--ngpu NGPU] -Train a ParallelWaveGAN model. +Train a HiFiGAN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG HiFiGAN config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA @@ -78,19 +76,6 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu == 0, use cpu. - -benchmark: - arguments related to benchmark. - - --batch-size BATCH_SIZE - batch size. - --max-iter MAX_ITER train max steps. - --run-benchmark RUN_BENCHMARK - runing benchmark or not, if True, use the --batch-size - and --max-iter. - --profiler_options PROFILER_OPTIONS - The option of profiler, which should be in format - "key1=value1;key2=value2;key3=value3". ``` 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. 
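Point 1 in the README hunks above says `--config` is a yaml file that overwrites the default config at `conf/default.yaml`. A minimal sketch of one way such an overwrite can work, assuming plain-YAML configs and a shallow dict merge (the actual trainers may use a richer config object with a different merge strategy):

```python
import argparse

import yaml  # PyYAML

# Load conf/default.yaml, then shallow-merge the user-supplied file over
# it. The merge strategy here is an assumption for illustration only.
def load_config(user_config_path, default_path="conf/default.yaml"):
    with open(default_path) as f:
        config = yaml.safe_load(f)
    if user_config_path is not None:
        with open(user_config_path) as f:
            overrides = yaml.safe_load(f) or {}
        config.update(overrides)  # keys in the user file win
    return config

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", type=str, help="HiFiGAN config file.")
    args = parser.parse_args()
    print(load_config(args.config))
```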
diff --git a/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py b/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py index c70821e7..4c733dc9 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py +++ b/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py @@ -243,8 +243,7 @@ def main(): # parse args and config and redirect to train_sp parser = argparse.ArgumentParser(description="Train a HiFiGAN model.") - parser.add_argument( - "--config", type=str, help="config file to overwrite default config.") + parser.add_argument("--config", type=str, help="HiFiGAN config file.") parser.add_argument("--train-metadata", type=str, help="training data.") parser.add_argument("--dev-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") diff --git a/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py b/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py index 27ffded6..3b3ebb47 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py +++ b/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py @@ -233,7 +233,7 @@ def main(): parser = argparse.ArgumentParser( description="Train a Multi-Band MelGAN model.") parser.add_argument( - "--config", type=str, help="config file to overwrite default config.") + "--config", type=str, help="Multi-Band MelGAN config file.") parser.add_argument("--train-metadata", type=str, help="training data.") parser.add_argument("--dev-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") diff --git a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py index 92de7a2c..b2640702 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py +++ b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py @@ -208,7 +208,7 @@ def main(): parser = argparse.ArgumentParser( description="Train a ParallelWaveGAN model.") parser.add_argument( - "--config", type=str, help="config file to overwrite default config.") + "--config", type=str, help="ParallelWaveGAN config file.") parser.add_argument("--train-metadata", type=str, help="training data.") parser.add_argument("--dev-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") diff --git a/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py b/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py index be3ba742..a87cc7a1 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py +++ b/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py @@ -224,8 +224,7 @@ def main(): # parse args and config and redirect to train_sp parser = argparse.ArgumentParser(description="Train a Style MelGAN model.") - parser.add_argument( - "--config", type=str, help="config file to overwrite default config.") + parser.add_argument("--config", type=str, help="Style MelGAN config file.") parser.add_argument("--train-metadata", type=str, help="training data.") parser.add_argument("--dev-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") diff --git a/paddlespeech/t2s/exps/transformer_tts/train.py b/paddlespeech/t2s/exps/transformer_tts/train.py index 45ecb269..da48b6b9 100644 --- a/paddlespeech/t2s/exps/transformer_tts/train.py +++ b/paddlespeech/t2s/exps/transformer_tts/train.py @@ -160,7 +160,7 @@ def main(): parser = argparse.ArgumentParser(description="Train a TransformerTTS " "model with LJSpeech TTS dataset.") 
parser.add_argument( - "--config", type=str, help="config file to overwrite default config.") + "--config", type=str, help="TransformerTTS config file.") parser.add_argument("--train-metadata", type=str, help="training data.") parser.add_argument("--dev-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") diff --git a/paddlespeech/t2s/exps/vits/train.py b/paddlespeech/t2s/exps/vits/train.py index b921f92a..dbda8b71 100644 --- a/paddlespeech/t2s/exps/vits/train.py +++ b/paddlespeech/t2s/exps/vits/train.py @@ -226,9 +226,8 @@ def train_sp(args, config): def main(): # parse args and config and redirect to train_sp - parser = argparse.ArgumentParser(description="Train a HiFiGAN model.") - parser.add_argument( - "--config", type=str, help="config file to overwrite default config.") + parser = argparse.ArgumentParser(description="Train a VITS model.") + parser.add_argument("--config", type=str, help="VITS config file.") parser.add_argument("--train-metadata", type=str, help="training data.") parser.add_argument("--dev-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") diff --git a/paddlespeech/t2s/exps/wavernn/train.py b/paddlespeech/t2s/exps/wavernn/train.py index 8661d311..cf24ea26 100644 --- a/paddlespeech/t2s/exps/wavernn/train.py +++ b/paddlespeech/t2s/exps/wavernn/train.py @@ -180,8 +180,7 @@ def main(): # parse args and config and redirect to train_sp parser = argparse.ArgumentParser(description="Train a WaveRNN model.") - parser.add_argument( - "--config", type=str, help="config file to overwrite default config.") + parser.add_argument("--config", type=str, help="WaveRNN config file.") parser.add_argument("--train-metadata", type=str, help="training data.") parser.add_argument("--dev-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") -- GitLab
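Taken together, the train.py hunks above converge on one `main()` skeleton per model, differing only in the parser description and the `--config` help string (the VITS hunk also fixes a copy-paste bug where the description read "Train a HiFiGAN model."). A minimal sketch of that shared skeleton, with a stub `train_sp` and plain-YAML config loading standing in for the real implementations:

```python
import argparse

import yaml  # stand-in for the real config loader

def train_sp(args, config):
    # stub; the real function builds the model and runs the training loop
    print(f"training with ngpu={args.ngpu}, output dir={args.output_dir}")

def main():
    # parse args and config and redirect to train_sp
    parser = argparse.ArgumentParser(description="Train a VITS model.")
    parser.add_argument("--config", type=str, help="VITS config file.")
    parser.add_argument("--train-metadata", type=str, help="training data.")
    parser.add_argument("--dev-metadata", type=str, help="dev data.")
    parser.add_argument("--output-dir", type=str, help="output dir.")
    parser.add_argument(
        "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
    args = parser.parse_args()

    with open(args.config) as f:
        config = yaml.safe_load(f)
    train_sp(args, config)

if __name__ == "__main__":
    main()
```

Keeping the skeletons identical and varying only the help strings, as these hunks do, means `--help` output names the model being configured instead of the generic "config file to overwrite default config.", at no cost to the shared structure.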