diff --git a/examples/aishell3/tts3/README.md b/examples/aishell3/tts3/README.md index d02ad1b6373c26f0cd0ffa4d58c3bd4af57f9e72..93ce62c9686f8869ccbc2257cfe0e7b886e7203a 100644 --- a/examples/aishell3/tts3/README.md +++ b/examples/aishell3/tts3/README.md @@ -120,12 +120,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p ``` ```text usage: synthesize.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--ngpu NGPU] [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] @@ -134,11 +134,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. 
@@ -150,10 +149,10 @@ optional arguments: speaker id map file. --voice-cloning VOICE_CLONING whether training voice cloning model. - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -169,12 +168,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp ``` ```text usage: synthesize_e2e.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--lang LANG] [--inference_dir INFERENCE_DIR] [--ngpu NGPU] @@ -184,11 +183,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am 
{speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -199,10 +197,10 @@ optional arguments: --speaker_dict SPEAKER_DICT speaker id map file. --spk_id SPK_ID spk id for multi speaker acoustic model - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -215,9 +213,9 @@ optional arguments: output dir. ``` 1. `--am` is acoustic model type with the format {model_name}_{dataset} -2. `--am_config`, `--am_checkpoint`, `--am_stat`, `--phones_dict` `--speaker_dict` are arguments for acoustic model, which correspond to the 5 files in the fastspeech2 pretrained model. +2. `--am_config`, `--am_ckpt`, `--am_stat`, `--phones_dict` and `--speaker_dict` are arguments for acoustic model, which correspond to the 5 files in the fastspeech2 pretrained model. 3. `--voc` is vocoder type with the format {model_name}_{dataset} -4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. +4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. 
5. `--lang` is the model language, which can be `zh` or `en`. 6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. 7. `--text` is the text file, which contains sentences to synthesize. diff --git a/examples/aishell3/voc1/README.md b/examples/aishell3/voc1/README.md index eb30e7c403c30dfeb1d466f558818eabda8dabfb..503f8a19dad1743145dcc4e2e599e6332dd36e9e 100644 --- a/examples/aishell3/voc1/README.md +++ b/examples/aishell3/voc1/README.md @@ -75,7 +75,7 @@ Train a ParallelWaveGAN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG ParallelWaveGAN config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA diff --git a/examples/aishell3/voc5/README.md b/examples/aishell3/voc5/README.md index c957c4a3aab385cd94adf03fc2cf12afd5bb351e..f8f28f409055616fce8be6cfb5f641e496f05af1 100644 --- a/examples/aishell3/voc5/README.md +++ b/examples/aishell3/voc5/README.md @@ -67,15 +67,13 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER] - [--run-benchmark RUN_BENCHMARK] - [--profiler_options PROFILER_OPTIONS] + [--ngpu NGPU] -Train a ParallelWaveGAN model. +Train a HiFiGAN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG HiFiGAN config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA @@ -83,19 +81,6 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu == 0, use cpu. - -benchmark: - arguments related to benchmark. - - --batch-size BATCH_SIZE - batch size. - --max-iter MAX_ITER train max steps. 
- --run-benchmark RUN_BENCHMARK - runing benchmark or not, if True, use the --batch-size - and --max-iter. - --profiler_options PROFILER_OPTIONS - The option of profiler, which should be in format - "key1=value1;key2=value2;key3=value3". ``` 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. diff --git a/examples/csmsc/tts0/README.md b/examples/csmsc/tts0/README.md index 01376bd61e08055b6da9e71b4cfb812b8e35c5c9..a337c7d4586511130400895228f78acbd2ec9901 100644 --- a/examples/csmsc/tts0/README.md +++ b/examples/csmsc/tts0/README.md @@ -103,12 +103,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p ``` ```text usage: synthesize.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}] + [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--ngpu NGPU] [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] @@ -117,11 +117,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc} + --am 
{speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -133,10 +132,10 @@ optional arguments: speaker id map file. --voice-cloning VOICE_CLONING whether training voice cloning model. - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. 
@@ -152,12 +151,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp ``` ```text usage: synthesize_e2e.py [-h] - [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}] + [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--lang LANG] [--inference_dir INFERENCE_DIR] [--ngpu NGPU] @@ -167,11 +166,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc} + --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -182,10 +180,10 @@ optional arguments: --speaker_dict SPEAKER_DICT speaker id map file. 
--spk_id SPK_ID spk id for multi speaker acoustic model - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -198,9 +196,9 @@ optional arguments: output dir. ``` 1. `--am` is acoustic model type with the format {model_name}_{dataset} -2. `--am_config`, `--am_checkpoint`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the Tacotron2 pretrained model. +2. `--am_config`, `--am_ckpt`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the Tacotron2 pretrained model. 3. `--voc` is vocoder type with the format {model_name}_{dataset} -4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. +4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. 5. `--lang` is the model language, which can be `zh` or `en`. 6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. 7. `--text` is the text file, which contains sentences to synthesize. 
diff --git a/examples/csmsc/tts2/README.md b/examples/csmsc/tts2/README.md index 081d858489fa9df9f47a57413f169e8e4752cc37..553a370c9a0df1221a2f52372b5c68c67943a7cc 100644 --- a/examples/csmsc/tts2/README.md +++ b/examples/csmsc/tts2/README.md @@ -109,12 +109,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p ``` ```text usage: synthesize.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--ngpu NGPU] [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] @@ -123,11 +123,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -139,10 +138,10 @@ optional arguments: speaker id map file. 
--voice-cloning VOICE_CLONING whether training voice cloning model. - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -158,12 +157,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp ``` ```text usage: synthesize_e2e.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--lang LANG] [--inference_dir INFERENCE_DIR] [--ngpu NGPU] @@ -173,11 +172,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech} Choose acoustic model type of tts task. 
--am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -188,10 +186,10 @@ optional arguments: --speaker_dict SPEAKER_DICT speaker id map file. --spk_id SPK_ID spk id for multi speaker acoustic model - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -204,9 +202,9 @@ optional arguments: output dir. ``` 1. `--am` is acoustic model type with the format {model_name}_{dataset} -2. `--am_config`, `--am_checkpoint`, `--am_stat`, `--phones_dict` and `--tones_dict` are arguments for acoustic model, which correspond to the 5 files in the speedyspeech pretrained model. +2. `--am_config`, `--am_ckpt`, `--am_stat`, `--phones_dict` and `--tones_dict` are arguments for acoustic model, which correspond to the 5 files in the speedyspeech pretrained model. 3. `--voc` is vocoder type with the format {model_name}_{dataset} -4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. +4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. 5. `--lang` is the model language, which can be `zh` or `en`. 6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. 7. 
`--text` is the text file, which contains sentences to synthesize. diff --git a/examples/csmsc/tts3/README.md b/examples/csmsc/tts3/README.md index c734199b46d9da2dc37482f9a4f75d7375bdaa8e..be18de7d66cf2b0318cb308e71087dee36985322 100644 --- a/examples/csmsc/tts3/README.md +++ b/examples/csmsc/tts3/README.md @@ -111,12 +111,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p ``` ```text usage: synthesize.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--ngpu NGPU] [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] @@ -125,11 +125,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. 
--am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -141,10 +140,10 @@ optional arguments: speaker id map file. --voice-cloning VOICE_CLONING whether training voice cloning model. - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -160,12 +159,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp ``` ```text usage: synthesize_e2e.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--lang LANG] [--inference_dir INFERENCE_DIR] [--ngpu NGPU] @@ -175,11 +174,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am 
{speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -190,10 +188,10 @@ optional arguments: --speaker_dict SPEAKER_DICT speaker id map file. --spk_id SPK_ID spk id for multi speaker acoustic model - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -204,11 +202,12 @@ optional arguments: --text TEXT text to synthesize, a 'utt_id sentence' pair per line. --output_dir OUTPUT_DIR output dir. + ``` 1. `--am` is acoustic model type with the format {model_name}_{dataset} -2. `--am_config`, `--am_checkpoint`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the fastspeech2 pretrained model. +2. `--am_config`, `--am_ckpt`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the fastspeech2 pretrained model. 3. `--voc` is vocoder type with the format {model_name}_{dataset} -4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. +4. 
`--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. 5. `--lang` is the model language, which can be `zh` or `en`. 6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. 7. `--text` is the text file, which contains sentences to synthesize. diff --git a/examples/csmsc/tts3/README_cn.md b/examples/csmsc/tts3/README_cn.md index 25931ecb182189e4ab76d3befb9b5185bca71caf..a88615134370a4094024f9df25810d8ca7812ac3 100644 --- a/examples/csmsc/tts3/README_cn.md +++ b/examples/csmsc/tts3/README_cn.md @@ -117,12 +117,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p ``` ```text usage: synthesize.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--ngpu NGPU] [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] @@ -131,11 +131,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am 
{speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -147,10 +146,10 @@ optional arguments: speaker id map file. --voice-cloning VOICE_CLONING whether training voice cloning model. - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. 
@@ -167,12 +166,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp ``` ```text usage: synthesize_e2e.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--lang LANG] [--inference_dir INFERENCE_DIR] [--ngpu NGPU] @@ -182,11 +181,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -197,10 +195,10 @@ optional arguments: --speaker_dict SPEAKER_DICT speaker id map file. 
--spk_id SPK_ID spk id for multi speaker acoustic model - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -213,9 +211,9 @@ optional arguments: output dir. ``` 1. `--am` 声学模型格式是否符合 {model_name}_{dataset} -2. `--am_config`, `--am_checkpoint`, `--am_stat` 和 `--phones_dict` 是声学模型的参数,对应于 fastspeech2 预训练模型中的 4 个文件。 +2. `--am_config`, `--am_ckpt`, `--am_stat` 和 `--phones_dict` 是声学模型的参数,对应于 fastspeech2 预训练模型中的 4 个文件。 3. `--voc` 声码器(vocoder)格式是否符合 {model_name}_{dataset} -4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` 是声码器的参数,对应于 parallel wavegan 预训练模型中的 3 个文件。 +4. `--voc_config`, `--voc_ckpt`, `--voc_stat` 是声码器的参数,对应于 parallel wavegan 预训练模型中的 3 个文件。 5. `--lang` 对应模型的语言可以是 `zh` 或 `en` 。 6. `--test_metadata` 应为 `dump` 文件夹中 `test` 下的规范化元数据文件、 7. `--text` 是文本文件,其中包含要合成的句子。 diff --git a/examples/csmsc/vits/README.md b/examples/csmsc/vits/README.md index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0c16840a04e32be8fefb3bae6c23fb4bd853be9f 100644 --- a/examples/csmsc/vits/README.md +++ b/examples/csmsc/vits/README.md @@ -0,0 +1,146 @@ +# VITS with CSMSC +This example contains code used to train a [VITS](https://arxiv.org/abs/2106.06103) model with [Chinese Standard Mandarin Speech Corpus](https://www.data-baker.com/open_source.html). + +## Dataset +### Download and Extract +Download CSMSC from its [Official Website](https://test.data-baker.com/data/index/source).
+ +### Get MFA Result and Extract +We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get phonemes for VITS; the durations from MFA are not needed here. +You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. + +## Get Started +Assume the path to the dataset is `~/datasets/BZNSYP`. +Assume the path to the MFA result of CSMSC is `./baker_alignment_tone`. +Run the command below to +1. **source path**. +2. preprocess the dataset. +3. train the model. +4. synthesize wavs. + - synthesize waveform from `metadata.jsonl`. + - synthesize waveform from a text file. + +```bash +./run.sh +``` +You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage. For example, running the following command will only preprocess the dataset. +```bash +./run.sh --stage 0 --stop-stage 0 +``` +### Data Preprocessing +```bash +./local/preprocess.sh ${conf_path} +``` +When it is done, a `dump` folder is created in the current directory. The structure of the dump folder is listed below. + +```text +dump +├── dev +│   ├── norm +│   └── raw +├── phone_id_map.txt +├── speaker_id_map.txt +├── test +│   ├── norm +│   └── raw +└── train + ├── feats_stats.npy + ├── norm + └── raw +``` +The dataset is split into 3 parts, namely `train`, `dev`, and `test`, each of which contains a `norm` and `raw` subfolder. The raw folder contains wave and linear spectrogram of each utterance, while the norm folder contains normalized ones. The statistics used to normalize features are computed from the training set, which is located in `dump/train/feats_stats.npy`. + +Also, there is a `metadata.jsonl` in each subfolder.
It is a table-like file that contains phones, text_lengths, feats, feats_lengths, the path of linear spectrogram features, the path of raw waves, speaker, and the id of each utterance. + +### Model Training +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} +``` +`./local/train.sh` calls `${BIN_DIR}/train.py`. +Here's the complete help message. +```text +usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] + [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] + [--ngpu NGPU] [--phones-dict PHONES_DICT] + +Train a VITS model. + +optional arguments: + -h, --help show this help message and exit + --config CONFIG config file to overwrite default config. + --train-metadata TRAIN_METADATA + training data. + --dev-metadata DEV_METADATA + dev data. + --output-dir OUTPUT_DIR + output dir. + --ngpu NGPU if ngpu == 0, use cpu. + --phones-dict PHONES_DICT + phone vocabulary file. +``` +1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. +2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. +3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory. +4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. +5. `--phones-dict` is the path of the phone vocabulary file. + +### Synthesizing + +`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`. 
+ +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} +``` +```text +usage: synthesize.py [-h] [--config CONFIG] [--ckpt CKPT] + [--phones_dict PHONES_DICT] [--ngpu NGPU] + [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] + +Synthesize with VITS + +optional arguments: + -h, --help show this help message and exit + --config CONFIG Config of VITS. + --ckpt CKPT Checkpoint file of VITS. + --phones_dict PHONES_DICT + phone vocabulary file. + --ngpu NGPU if ngpu == 0, use cpu. + --test_metadata TEST_METADATA + test metadata. + --output_dir OUTPUT_DIR + output dir. +``` +`./local/synthesize_e2e.sh` calls `${BIN_DIR}/synthesize_e2e.py`, which can synthesize waveform from text file. +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} +``` +```text +usage: synthesize_e2e.py [-h] [--config CONFIG] [--ckpt CKPT] + [--phones_dict PHONES_DICT] [--lang LANG] + [--inference_dir INFERENCE_DIR] [--ngpu NGPU] + [--text TEXT] [--output_dir OUTPUT_DIR] + +Synthesize with VITS + +optional arguments: + -h, --help show this help message and exit + --config CONFIG Config of VITS. + --ckpt CKPT Checkpoint file of VITS. + --phones_dict PHONES_DICT + phone vocabulary file. + --lang LANG Choose model language. zh or en + --inference_dir INFERENCE_DIR + dir to save inference models + --ngpu NGPU if ngpu == 0, use cpu. + --text TEXT text to synthesize, a 'utt_id sentence' pair per line. + --output_dir OUTPUT_DIR + output dir. +``` +1. `--config`, `--ckpt`, and `--phones_dict` are arguments for acoustic model, which correspond to the 3 files in the VITS pretrained model. +2. `--lang` is the model language, which can be `zh` or `en`. +3. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. +4. `--text` is the text file, which contains sentences to synthesize. +5. 
`--output_dir` is the directory to save synthesized audio files. +6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. + +## Pretrained Model diff --git a/examples/csmsc/voc1/README.md b/examples/csmsc/voc1/README.md index 77da5b185875e52c031715d1013659e4291a81c6..d19fe84979044e3cd0f9d638f3cb7dbd300b95e7 100644 --- a/examples/csmsc/voc1/README.md +++ b/examples/csmsc/voc1/README.md @@ -65,7 +65,7 @@ Train a ParallelWaveGAN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG ParallelWaveGAN config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA diff --git a/examples/csmsc/voc3/README.md b/examples/csmsc/voc3/README.md index 12adaf7f4e2098f86e75c4d155951bccc8969f5e..eb771036277ebae3eff200fa4b36579492ffaf80 100644 --- a/examples/csmsc/voc3/README.md +++ b/examples/csmsc/voc3/README.md @@ -63,7 +63,7 @@ Train a Multi-Band MelGAN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG Multi-Band MelGAN config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA diff --git a/examples/csmsc/voc4/README.md b/examples/csmsc/voc4/README.md index b7add3e574c63b61c22e75ca15289f2b6bc7ce51..d9e86a88db23398cfdde12c0f0dd9526851a99a2 100644 --- a/examples/csmsc/voc4/README.md +++ b/examples/csmsc/voc4/README.md @@ -63,7 +63,7 @@ Train a Style MelGAN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG Style MelGAN config file. --train-metadata TRAIN_METADATA training data. 
--dev-metadata DEV_METADATA diff --git a/examples/csmsc/voc5/README.md b/examples/csmsc/voc5/README.md index 94f93b48b68239c351189892f81130c3fa350769..e044a0c74bbcb1c4354e82d0db7876d29d89c003 100644 --- a/examples/csmsc/voc5/README.md +++ b/examples/csmsc/voc5/README.md @@ -63,7 +63,7 @@ Train a HiFiGAN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG HiFiGAN config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA diff --git a/examples/csmsc/voc6/README.md b/examples/csmsc/voc6/README.md index 7dcf133bdc8c379ec3f6dd03a8c2e01ee6ba55f7..f1a5ec3bb4e8725ab32a5159b6b77dea858524f9 100644 --- a/examples/csmsc/voc6/README.md +++ b/examples/csmsc/voc6/README.md @@ -63,7 +63,7 @@ Train a WaveRNN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG WaveRNN config file. --train-metadata TRAIN_METADATA training data. 
--dev-metadata DEV_METADATA diff --git a/examples/ljspeech/tts0/README.md b/examples/ljspeech/tts0/README.md index ba7ad619392c41c67d16790c99dfc8865e219e30..581f7930fb23a8d4494567a053ac2c68fa39c399 100644 --- a/examples/ljspeech/tts0/README.md +++ b/examples/ljspeech/tts0/README.md @@ -103,12 +103,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p ``` ```text usage: synthesize.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}] + [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--ngpu NGPU] [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] @@ -117,11 +117,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc} + --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. 
--am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -133,10 +132,10 @@ optional arguments: speaker id map file. --voice-cloning VOICE_CLONING whether training voice cloning model. - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -152,12 +151,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp ``` ```text usage: synthesize_e2e.py [-h] - [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}] + [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--lang LANG] [--inference_dir INFERENCE_DIR] [--ngpu NGPU] @@ -167,11 +166,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am 
{speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc} + --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -182,10 +180,10 @@ optional arguments: --speaker_dict SPEAKER_DICT speaker id map file. --spk_id SPK_ID spk id for multi speaker acoustic model - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -198,9 +196,9 @@ optional arguments: output dir. ``` 1. `--am` is acoustic model type with the format {model_name}_{dataset} -2. `--am_config`, `--am_checkpoint`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the Tacotron2 pretrained model. +2. `--am_config`, `--am_ckpt`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the Tacotron2 pretrained model. 3. `--voc` is vocoder type with the format {model_name}_{dataset} -4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. +4. 
`--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. 5. `--lang` is the model language, which can be `zh` or `en`. 6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. 7. `--text` is the text file, which contains sentences to synthesize. diff --git a/examples/ljspeech/tts1/README.md b/examples/ljspeech/tts1/README.md index 7f32522acd3f486a958d4e8640ee88275e7fbb8b..f85991cba6d9ddac1ae18992435d3cd3cab73840 100644 --- a/examples/ljspeech/tts1/README.md +++ b/examples/ljspeech/tts1/README.md @@ -61,7 +61,7 @@ Train a TransformerTTS model with LJSpeech TTS dataset. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG TransformerTTS config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA diff --git a/examples/ljspeech/tts3/README.md b/examples/ljspeech/tts3/README.md index e028fa05d5a1748fab1a4fc3231f6da741701e76..a6724083daf7e3952200ee07ba90b65d206665ef 100644 --- a/examples/ljspeech/tts3/README.md +++ b/examples/ljspeech/tts3/README.md @@ -109,12 +109,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p ``` ``text usage: synthesize.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc 
{pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--ngpu NGPU] [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] @@ -123,11 +123,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -139,10 +138,10 @@ optional arguments: speaker id map file. --voice-cloning VOICE_CLONING whether training voice cloning model. - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. 
@@ -158,12 +157,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp ``` ```text usage: synthesize_e2e.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--lang LANG] [--inference_dir INFERENCE_DIR] [--ngpu NGPU] @@ -173,11 +172,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -188,10 +186,10 @@ optional arguments: --speaker_dict SPEAKER_DICT speaker id map file. 
--spk_id SPK_ID spk id for multi speaker acoustic model - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -204,9 +202,9 @@ optional arguments: output dir. ``` 1. `--am` is acoustic model type with the format {model_name}_{dataset} -2. `--am_config`, `--am_checkpoint`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the fastspeech2 pretrained model. +2. `--am_config`, `--am_ckpt`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the fastspeech2 pretrained model. 3. `--voc` is vocoder type with the format {model_name}_{dataset} -4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. +4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. 5. `--lang` is the model language, which can be `zh` or `en`. 6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. 7. `--text` is the text file, which contains sentences to synthesize. diff --git a/examples/ljspeech/voc1/README.md b/examples/ljspeech/voc1/README.md index 4513b2a05a67342a9be8d923fc517a7738eccf83..6fd6cbe24bf1db05209df9feec80b647e611fcb1 100644 --- a/examples/ljspeech/voc1/README.md +++ b/examples/ljspeech/voc1/README.md @@ -65,7 +65,7 @@ Train a ParallelWaveGAN model. 
optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG ParallelWaveGAN config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA diff --git a/examples/ljspeech/voc5/README.md b/examples/ljspeech/voc5/README.md index 9b31e2650459c54ee1d6bed286f4f361077331ff..afc1bb8be86a1ced4a65e81f05e7ca467c65943f 100644 --- a/examples/ljspeech/voc5/README.md +++ b/examples/ljspeech/voc5/README.md @@ -57,15 +57,13 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER] - [--run-benchmark RUN_BENCHMARK] - [--profiler_options PROFILER_OPTIONS] + [--ngpu NGPU] -Train a ParallelWaveGAN model. +Train a HiFiGAN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG HiFiGAN config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA @@ -73,19 +71,6 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu == 0, use cpu. - -benchmark: - arguments related to benchmark. - - --batch-size BATCH_SIZE - batch size. - --max-iter MAX_ITER train max steps. - --run-benchmark RUN_BENCHMARK - runing benchmark or not, if True, use the --batch-size - and --max-iter. - --profiler_options PROFILER_OPTIONS - The option of profiler, which should be in format - "key1=value1;key2=value2;key3=value3". ``` 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. 
diff --git a/examples/vctk/tts3/README.md b/examples/vctk/tts3/README.md index f373ca6a387e53b8395570705f6e8576293055c0..379f5c0fd773ab5b94467a3e0f92674d26bdd92d 100644 --- a/examples/vctk/tts3/README.md +++ b/examples/vctk/tts3/README.md @@ -112,12 +112,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p ``` ```text usage: synthesize.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--ngpu NGPU] [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] @@ -126,11 +126,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -142,10 +141,10 @@ optional arguments: speaker id map file. 
--voice-cloning VOICE_CLONING whether training voice cloning model. - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -161,12 +160,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp ``` ```text usage: synthesize_e2e.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--lang LANG] [--inference_dir INFERENCE_DIR] [--ngpu NGPU] @@ -176,11 +175,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech} Choose acoustic model type of tts task. 
--am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -191,10 +189,10 @@ optional arguments: --speaker_dict SPEAKER_DICT speaker id map file. --spk_id SPK_ID spk id for multi speaker acoustic model - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -207,9 +205,9 @@ optional arguments: output dir. ``` 1. `--am` is acoustic model type with the format {model_name}_{dataset} -2. `--am_config`, `--am_checkpoint`, `--am_stat`, `--phones_dict` `--speaker_dict` are arguments for acoustic model, which correspond to the 5 files in the fastspeech2 pretrained model. +2. `--am_config`, `--am_ckpt`, `--am_stat`, `--phones_dict` `--speaker_dict` are arguments for acoustic model, which correspond to the 5 files in the fastspeech2 pretrained model. 3. `--voc` is vocoder type with the format {model_name}_{dataset} -4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. +4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. 5. `--lang` is the model language, which can be `zh` or `en`. 6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. 7. 
`--text` is the text file, which contains sentences to synthesize. diff --git a/examples/vctk/voc1/README.md b/examples/vctk/voc1/README.md index 1c3016f885de2bb15bd2af5d4866782cb0a81f80..c4c40d1d0c5d8c809403b431088e92938b961c6d 100644 --- a/examples/vctk/voc1/README.md +++ b/examples/vctk/voc1/README.md @@ -70,7 +70,7 @@ Train a ParallelWaveGAN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG ParallelWaveGAN config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA diff --git a/examples/vctk/voc5/README.md b/examples/vctk/voc5/README.md index 4eb25c02d7f97764d10ab2c6b5e871f06b61b148..c53d46325d3c010436e93196ca619c0db795cba8 100644 --- a/examples/vctk/voc5/README.md +++ b/examples/vctk/voc5/README.md @@ -62,15 +62,13 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER] - [--run-benchmark RUN_BENCHMARK] - [--profiler_options PROFILER_OPTIONS] + [--ngpu NGPU] -Train a ParallelWaveGAN model. +Train a HiFiGAN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG HiFiGAN config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA @@ -78,19 +76,6 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu == 0, use cpu. - -benchmark: - arguments related to benchmark. - - --batch-size BATCH_SIZE - batch size. - --max-iter MAX_ITER train max steps. - --run-benchmark RUN_BENCHMARK - runing benchmark or not, if True, use the --batch-size - and --max-iter. - --profiler_options PROFILER_OPTIONS - The option of profiler, which should be in format - "key1=value1;key2=value2;key3=value3". ``` 1. 
`--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. diff --git a/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py b/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py index c70821e78fe6e4063d74e8c5608ede225ed1b230..4c733dc9b05ba8880d415482d7c194c910039f92 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py +++ b/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py @@ -243,8 +243,7 @@ def main(): # parse args and config and redirect to train_sp parser = argparse.ArgumentParser(description="Train a HiFiGAN model.") - parser.add_argument( - "--config", type=str, help="config file to overwrite default config.") + parser.add_argument("--config", type=str, help="HiFiGAN config file.") parser.add_argument("--train-metadata", type=str, help="training data.") parser.add_argument("--dev-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") diff --git a/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py b/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py index 27ffded63b3621c0f2110815b27fd4420ef0bc5a..3b3ebb4788e2cd4b58119996f74c8dc0d1bfc46b 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py +++ b/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py @@ -233,7 +233,7 @@ def main(): parser = argparse.ArgumentParser( description="Train a Multi-Band MelGAN model.") parser.add_argument( - "--config", type=str, help="config file to overwrite default config.") + "--config", type=str, help="Multi-Band MelGAN config file.") parser.add_argument("--train-metadata", type=str, help="training data.") parser.add_argument("--dev-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") diff --git a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py index 
92de7a2c4e7a04ed28b7b30dfa47be4796acc93f..b26407028928cec639d47f576e2bb1f6766e1990 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py +++ b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py @@ -208,7 +208,7 @@ def main(): parser = argparse.ArgumentParser( description="Train a ParallelWaveGAN model.") parser.add_argument( - "--config", type=str, help="config file to overwrite default config.") + "--config", type=str, help="ParallelWaveGAN config file.") parser.add_argument("--train-metadata", type=str, help="training data.") parser.add_argument("--dev-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") diff --git a/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py b/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py index be3ba74251d92cf90be713651837205fa8dc582a..a87cc7a182fdc88d9770021969b4d4248d87e83a 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py +++ b/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py @@ -224,8 +224,7 @@ def main(): # parse args and config and redirect to train_sp parser = argparse.ArgumentParser(description="Train a Style MelGAN model.") - parser.add_argument( - "--config", type=str, help="config file to overwrite default config.") + parser.add_argument("--config", type=str, help="Style MelGAN config file.") parser.add_argument("--train-metadata", type=str, help="training data.") parser.add_argument("--dev-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") diff --git a/paddlespeech/t2s/exps/transformer_tts/train.py b/paddlespeech/t2s/exps/transformer_tts/train.py index 45ecb269bac033fed4287e5083ced6ce92b89f35..da48b6b99700ed49e5c815bf6b6f14c8eecfae95 100644 --- a/paddlespeech/t2s/exps/transformer_tts/train.py +++ b/paddlespeech/t2s/exps/transformer_tts/train.py @@ -160,7 +160,7 @@ def main(): parser = argparse.ArgumentParser(description="Train a TransformerTTS " "model 
with LJSpeech TTS dataset.") parser.add_argument( - "--config", type=str, help="config file to overwrite default config.") + "--config", type=str, help="TransformerTTS config file.") parser.add_argument("--train-metadata", type=str, help="training data.") parser.add_argument("--dev-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") diff --git a/paddlespeech/t2s/exps/vits/train.py b/paddlespeech/t2s/exps/vits/train.py index b921f92af75ba367d702be94050e563538e7c755..dbda8b7177bca068ecaeabe41679a93e153aba35 100644 --- a/paddlespeech/t2s/exps/vits/train.py +++ b/paddlespeech/t2s/exps/vits/train.py @@ -226,9 +226,8 @@ def train_sp(args, config): def main(): # parse args and config and redirect to train_sp - parser = argparse.ArgumentParser(description="Train a HiFiGAN model.") - parser.add_argument( - "--config", type=str, help="config file to overwrite default config.") + parser = argparse.ArgumentParser(description="Train a VITS model.") + parser.add_argument("--config", type=str, help="VITS config file") parser.add_argument("--train-metadata", type=str, help="training data.") parser.add_argument("--dev-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") diff --git a/paddlespeech/t2s/exps/wavernn/train.py b/paddlespeech/t2s/exps/wavernn/train.py index 8661d311d218bda58142a846f3dedce5a07ffabf..cf24ea26888002afcfba19449ec8ac5db6efe517 100644 --- a/paddlespeech/t2s/exps/wavernn/train.py +++ b/paddlespeech/t2s/exps/wavernn/train.py @@ -180,8 +180,7 @@ def main(): # parse args and config and redirect to train_sp parser = argparse.ArgumentParser(description="Train a WaveRNN model.") - parser.add_argument( - "--config", type=str, help="config file to overwrite default config.") + parser.add_argument("--config", type=str, help="WaveRNN config file.") parser.add_argument("--train-metadata", type=str, help="training data.") parser.add_argument("--dev-metadata", type=str, 
help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.")