diff --git a/examples/aishell3/tts3/README.md b/examples/aishell3/tts3/README.md index 9c505679c91ca9dad9cf7d5f7dee0ec7970a682d..31c99898ccc7839b835a0fbd7daec550a36de340 100644 --- a/examples/aishell3/tts3/README.md +++ b/examples/aishell3/tts3/README.md @@ -113,12 +113,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p ``` ```text usage: synthesize.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--ngpu NGPU] [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] @@ -127,11 +127,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -143,10 +142,10 @@ optional arguments: speaker id map file. --voice-cloning VOICE_CLONING whether training voice cloning model. - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. 
@@ -162,12 +161,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp ``` ```text usage: synthesize_e2e.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--lang LANG] [--inference_dir INFERENCE_DIR] [--ngpu NGPU] @@ -177,11 +176,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -192,10 +190,10 @@ optional arguments: --speaker_dict SPEAKER_DICT speaker id map file. --spk_id SPK_ID spk id for multi speaker acoustic model - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -208,9 +206,9 @@ optional arguments: output dir. ``` 1. `--am` is acoustic model type with the format {model_name}_{dataset} -2. `--am_config`, `--am_checkpoint`, `--am_stat`, `--phones_dict` `--speaker_dict` are arguments for acoustic model, which correspond to the 5 files in the fastspeech2 pretrained model. +2. `--am_config`, `--am_ckpt`, `--am_stat`, `--phones_dict` `--speaker_dict` are arguments for acoustic model, which correspond to the 5 files in the fastspeech2 pretrained model. 3. `--voc` is vocoder type with the format {model_name}_{dataset} -4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. +4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. 5. `--lang` is the model language, which can be `zh` or `en`. 6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. 7. `--text` is the text file, which contains sentences to synthesize. 
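For illustration, the flags enumerated above map onto a direct call roughly like the sketch below (normally `./local/synthesize_e2e.sh` assembles this command for you). Every file name is a placeholder; substitute the config, checkpoint, stats, and dictionary files from your own pretrained models or training run.

```bash
# Hypothetical direct invocation of synthesize_e2e.py for the multi-speaker AISHELL-3 recipe.
# All paths below are placeholders, not files shipped with this example.
python3 synthesize_e2e.py \
  --am=fastspeech2_aishell3 \
  --am_config=fastspeech2_aishell3_ckpt/default.yaml \
  --am_ckpt=fastspeech2_aishell3_ckpt/snapshot_iter_xxx.pdz \
  --am_stat=fastspeech2_aishell3_ckpt/speech_stats.npy \
  --phones_dict=fastspeech2_aishell3_ckpt/phone_id_map.txt \
  --speaker_dict=fastspeech2_aishell3_ckpt/speaker_id_map.txt \
  --spk_id=0 \
  --voc=pwgan_aishell3 \
  --voc_config=pwgan_aishell3_ckpt/default.yaml \
  --voc_ckpt=pwgan_aishell3_ckpt/snapshot_iter_xxx.pdz \
  --voc_stat=pwgan_aishell3_ckpt/feats_stats.npy \
  --lang=zh \
  --text=sentences.txt \
  --output_dir=output \
  --ngpu=1
```
`--spk_id` selects one of the speakers listed in `speaker_id_map.txt`.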
diff --git a/examples/aishell3/voc1/README.md b/examples/aishell3/voc1/README.md index e9e012d29ea95cda33eaccf8201bb0d74a90a9f5..a3daf3dfd6889e60f9d71be396bc9b3d2404fe54 100644 --- a/examples/aishell3/voc1/README.md +++ b/examples/aishell3/voc1/README.md @@ -68,7 +68,7 @@ Train a ParallelWaveGAN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG ParallelWaveGAN config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA diff --git a/examples/aishell3/voc5/README.md b/examples/aishell3/voc5/README.md index 84bcd78ef0ca56ef385f56173d866f1cf4c64bfd..c3e3197d63c0e871dabd61e67992335fc8d5d1f9 100644 --- a/examples/aishell3/voc5/README.md +++ b/examples/aishell3/voc5/README.md @@ -59,15 +59,13 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER] - [--run-benchmark RUN_BENCHMARK] - [--profiler_options PROFILER_OPTIONS] + [--ngpu NGPU] -Train a ParallelWaveGAN model. +Train a HiFiGAN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG HiFiGAN config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA @@ -75,19 +73,6 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu == 0, use cpu. - -benchmark: - arguments related to benchmark. - - --batch-size BATCH_SIZE - batch size. - --max-iter MAX_ITER train max steps. - --run-benchmark RUN_BENCHMARK - runing benchmark or not, if True, use the --batch-size - and --max-iter. - --profiler_options PROFILER_OPTIONS - The option of profiler, which should be in format - "key1=value1;key2=value2;key3=value3". ``` 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. 
diff --git a/examples/csmsc/tts0/README.md b/examples/csmsc/tts0/README.md index d62c901174ae2ed8e0bb14f93c76f97939b33499..bc7769d1572166b6e27185f282b8ca16b998f40f 100644 --- a/examples/csmsc/tts0/README.md +++ b/examples/csmsc/tts0/README.md @@ -103,12 +103,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p ``` ```text usage: synthesize.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}] + [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--ngpu NGPU] [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] @@ -117,11 +117,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc} + --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -133,10 +132,10 @@ optional arguments: speaker id map file. --voice-cloning VOICE_CLONING whether training voice cloning model. - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. 
@@ -152,12 +151,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp ``` ```text usage: synthesize_e2e.py [-h] - [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}] + [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--lang LANG] [--inference_dir INFERENCE_DIR] [--ngpu NGPU] @@ -167,11 +166,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc} + --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -182,10 +180,10 @@ optional arguments: --speaker_dict SPEAKER_DICT speaker id map file. --spk_id SPK_ID spk id for multi speaker acoustic model - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -198,9 +196,9 @@ optional arguments: output dir. ``` 1. `--am` is acoustic model type with the format {model_name}_{dataset} -2. `--am_config`, `--am_checkpoint`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the Tacotron2 pretrained model. +2. `--am_config`, `--am_ckpt`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the Tacotron2 pretrained model. 3. `--voc` is vocoder type with the format {model_name}_{dataset} -4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. +4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. 5. `--lang` is the model language, which can be `zh` or `en`. 6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. 7. 
`--text` is the text file, which contains sentences to synthesize. diff --git a/examples/csmsc/tts2/README.md b/examples/csmsc/tts2/README.md index 1bcfb383f200b6d454eabac2b85e4b23b32780d1..f45561719ba88485e6da2c20674da215f06d4093 100644 --- a/examples/csmsc/tts2/README.md +++ b/examples/csmsc/tts2/README.md @@ -109,12 +109,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p ``` ```text usage: synthesize.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--ngpu NGPU] [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] @@ -123,11 +123,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -139,10 +138,10 @@ optional arguments: speaker id map file. --voice-cloning VOICE_CLONING whether training voice cloning model. - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. 
@@ -158,12 +157,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp ``` ```text usage: synthesize_e2e.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--lang LANG] [--inference_dir INFERENCE_DIR] [--ngpu NGPU] @@ -173,11 +172,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -188,10 +186,10 @@ optional arguments: --speaker_dict SPEAKER_DICT speaker id map file. --spk_id SPK_ID spk id for multi speaker acoustic model - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -204,9 +202,9 @@ optional arguments: output dir. ``` 1. `--am` is acoustic model type with the format {model_name}_{dataset} -2. `--am_config`, `--am_checkpoint`, `--am_stat`, `--phones_dict` and `--tones_dict` are arguments for acoustic model, which correspond to the 5 files in the speedyspeech pretrained model. +2. `--am_config`, `--am_ckpt`, `--am_stat`, `--phones_dict` and `--tones_dict` are arguments for acoustic model, which correspond to the 5 files in the speedyspeech pretrained model. 3. `--voc` is vocoder type with the format {model_name}_{dataset} -4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. +4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. 5. `--lang` is the model language, which can be `zh` or `en`. 6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. 7. `--text` is the text file, which contains sentences to synthesize. 
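As an illustration of how the SpeedySpeech flags fit together, a direct call might look like the sketch below; note the extra `--tones_dict` that SpeedySpeech uses. All file names are placeholders for the files in your pretrained models or training output.

```bash
# Hypothetical direct invocation for SpeedySpeech + Parallel WaveGAN on CSMSC.
# All paths below are placeholders.
python3 synthesize_e2e.py \
  --am=speedyspeech_csmsc \
  --am_config=speedyspeech_csmsc_ckpt/default.yaml \
  --am_ckpt=speedyspeech_csmsc_ckpt/snapshot_iter_xxx.pdz \
  --am_stat=speedyspeech_csmsc_ckpt/feats_stats.npy \
  --phones_dict=speedyspeech_csmsc_ckpt/phone_id_map.txt \
  --tones_dict=speedyspeech_csmsc_ckpt/tone_id_map.txt \
  --voc=pwgan_csmsc \
  --voc_config=pwgan_csmsc_ckpt/default.yaml \
  --voc_ckpt=pwgan_csmsc_ckpt/snapshot_iter_xxx.pdz \
  --voc_stat=pwgan_csmsc_ckpt/feats_stats.npy \
  --lang=zh \
  --text=sentences.txt \
  --output_dir=output \
  --ngpu=1
```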
diff --git a/examples/csmsc/tts3/README.md b/examples/csmsc/tts3/README.md index 1f7dfa0fdd31526f1bf05bca51ed953e6c57227e..371034e772391196ab1b455b0c5f42e86df82b35 100644 --- a/examples/csmsc/tts3/README.md +++ b/examples/csmsc/tts3/README.md @@ -111,12 +111,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p ``` ```text usage: synthesize.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--ngpu NGPU] [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] @@ -125,11 +125,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -141,10 +140,10 @@ optional arguments: speaker id map file. --voice-cloning VOICE_CLONING whether training voice cloning model. - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. 
@@ -160,12 +159,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp ``` ```text usage: synthesize_e2e.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--lang LANG] [--inference_dir INFERENCE_DIR] [--ngpu NGPU] @@ -175,11 +174,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -190,10 +188,10 @@ optional arguments: --speaker_dict SPEAKER_DICT speaker id map file. --spk_id SPK_ID spk id for multi speaker acoustic model - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -204,11 +202,12 @@ optional arguments: --text TEXT text to synthesize, a 'utt_id sentence' pair per line. --output_dir OUTPUT_DIR output dir. + ``` 1. `--am` is acoustic model type with the format {model_name}_{dataset} -2. `--am_config`, `--am_checkpoint`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the fastspeech2 pretrained model. +2. `--am_config`, `--am_ckpt`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the fastspeech2 pretrained model. 3. `--voc` is vocoder type with the format {model_name}_{dataset} -4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. +4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. 5. `--lang` is the model language, which can be `zh` or `en`. 6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. 7. `--text` is the text file, which contains sentences to synthesize. 
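For the batch path, `synthesize.py` takes `--test_metadata` instead of `--text`. A hypothetical call, assuming the checkpoint and stats produced by the training stage and a separately downloaded HiFiGAN vocoder (all file names are placeholders):

```bash
# Hypothetical direct invocation of synthesize.py, reading the normalized test metadata.
# Paths are placeholders for your own training output and vocoder files.
python3 synthesize.py \
  --am=fastspeech2_csmsc \
  --am_config=conf/default.yaml \
  --am_ckpt=exp/default/checkpoints/snapshot_iter_xxx.pdz \
  --am_stat=dump/train/speech_stats.npy \
  --phones_dict=dump/phone_id_map.txt \
  --voc=hifigan_csmsc \
  --voc_config=hifigan_csmsc_ckpt/default.yaml \
  --voc_ckpt=hifigan_csmsc_ckpt/snapshot_iter_xxx.pdz \
  --voc_stat=hifigan_csmsc_ckpt/feats_stats.npy \
  --test_metadata=dump/test/norm/metadata.jsonl \
  --output_dir=exp/default/test \
  --ngpu=1
```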
diff --git a/examples/csmsc/tts3/README_cn.md b/examples/csmsc/tts3/README_cn.md index f08ca724c8a7b48bb5783c0b5e37d7e64a4d6595..1829b77063e0ec08f5bd8dcb41405ea815d3ec0e 100644 --- a/examples/csmsc/tts3/README_cn.md +++ b/examples/csmsc/tts3/README_cn.md @@ -117,12 +117,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p ``` ```text usage: synthesize.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--ngpu NGPU] [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] @@ -131,11 +131,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -147,10 +146,10 @@ optional arguments: speaker id map file. --voice-cloning VOICE_CLONING whether training voice cloning model. - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. 
@@ -167,12 +166,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp ``` ```text usage: synthesize_e2e.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--lang LANG] [--inference_dir INFERENCE_DIR] [--ngpu NGPU] @@ -182,11 +181,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -197,10 +195,10 @@ optional arguments: --speaker_dict SPEAKER_DICT speaker id map file. --spk_id SPK_ID spk id for multi speaker acoustic model - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -213,9 +211,9 @@ optional arguments: output dir. ``` 1. `--am` 声学模型格式是否符合 {model_name}_{dataset} -2. `--am_config`, `--am_checkpoint`, `--am_stat` 和 `--phones_dict` 是声学模型的参数,对应于 fastspeech2 预训练模型中的 4 个文件。 +2. `--am_config`, `--am_ckpt`, `--am_stat` 和 `--phones_dict` 是声学模型的参数,对应于 fastspeech2 预训练模型中的 4 个文件。 3. `--voc` 声码器(vocoder)格式是否符合 {model_name}_{dataset} -4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` 是声码器的参数,对应于 parallel wavegan 预训练模型中的 3 个文件。 +4. `--voc_config`, `--voc_ckpt`, `--voc_stat` 是声码器的参数,对应于 parallel wavegan 预训练模型中的 3 个文件。 5. `--lang` 对应模型的语言可以是 `zh` 或 `en` 。 6. `--test_metadata` 应为 `dump` 文件夹中 `test` 下的规范化元数据文件、 7. `--text` 是文本文件,其中包含要合成的句子。 diff --git a/examples/csmsc/vits/README.md b/examples/csmsc/vits/README.md index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0c16840a04e32be8fefb3bae6c23fb4bd853be9f 100644 --- a/examples/csmsc/vits/README.md +++ b/examples/csmsc/vits/README.md @@ -0,0 +1,146 @@ +# VITS with CSMSC +This example contains code used to train a [VITS](https://arxiv.org/abs/2106.06103) model with [Chinese Standard Mandarin Speech Corpus](https://www.data-baker.com/open_source.html).
+ +## Dataset +### Download and Extract +Download CSMSC from its [Official Website](https://test.data-baker.com/data/index/source). + +### Get MFA Result and Extract +We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get phonemes for VITS; the durations from MFA are not needed here. +You can download it from here: [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) in our repo. + +## Get Started +Assume the path to the dataset is `~/datasets/BZNSYP`. +Assume the path to the MFA result of CSMSC is `./baker_alignment_tone`. +Run the command below to +1. **source path**. +2. preprocess the dataset. +3. train the model. +4. synthesize wavs. + - synthesize waveform from `metadata.jsonl`. + - synthesize waveform from a text file. + +```bash +./run.sh +``` +You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage; for example, running the following command will only preprocess the dataset. +```bash +./run.sh --stage 0 --stop-stage 0 +``` +### Data Preprocessing +```bash +./local/preprocess.sh ${conf_path} +``` +When it is done, a `dump` folder is created in the current directory. The structure of the dump folder is listed below. + +```text +dump +├── dev +│   ├── norm +│   └── raw +├── phone_id_map.txt +├── speaker_id_map.txt +├── test +│   ├── norm +│   └── raw +└── train + ├── feats_stats.npy + ├── norm + └── raw +``` +The dataset is split into 3 parts, namely `train`, `dev`, and `test`, each of which contains a `norm` and `raw` subfolder. The raw folder contains the wave and linear spectrogram of each utterance, while the norm folder contains the normalized ones. The statistics used to normalize features are computed from the training set, which is located in `dump/train/feats_stats.npy`. + +Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file that contains phones, text_lengths, feats, feats_lengths, the path of linear spectrogram features, the path of raw waves, speaker, and the id of each utterance. + +### Model Training +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} +``` +`./local/train.sh` calls `${BIN_DIR}/train.py`. +Here's the complete help message. +```text +usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] + [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] + [--ngpu NGPU] [--phones-dict PHONES_DICT] + +Train a VITS model. + +optional arguments: + -h, --help show this help message and exit + --config CONFIG config file to overwrite default config. + --train-metadata TRAIN_METADATA + training data. + --dev-metadata DEV_METADATA + dev data. + --output-dir OUTPUT_DIR + output dir. + --ngpu NGPU if ngpu == 0, use cpu. + --phones-dict PHONES_DICT + phone vocabulary file. +``` +1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. +2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. +3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory. +4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. +5. `--phones-dict` is the path of the phone vocabulary file.
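Spelled out, the training wrapper reduces to roughly the following sketch; `exp/default` stands in for `${train_output_path}`, and the metadata and dictionary paths follow the `dump` layout shown above.

```bash
# Hypothetical direct call, roughly equivalent to:
#   CUDA_VISIBLE_DEVICES=0 ./local/train.sh conf/default.yaml exp/default
python3 ${BIN_DIR}/train.py \
  --config=conf/default.yaml \
  --train-metadata=dump/train/norm/metadata.jsonl \
  --dev-metadata=dump/dev/norm/metadata.jsonl \
  --output-dir=exp/default \
  --phones-dict=dump/phone_id_map.txt \
  --ngpu=1
```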
+ +### Synthesizing + +`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`. + +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} +``` +```text +usage: synthesize.py [-h] [--config CONFIG] [--ckpt CKPT] + [--phones_dict PHONES_DICT] [--ngpu NGPU] + [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] + +Synthesize with VITS + +optional arguments: + -h, --help show this help message and exit + --config CONFIG Config of VITS. + --ckpt CKPT Checkpoint file of VITS. + --phones_dict PHONES_DICT + phone vocabulary file. + --ngpu NGPU if ngpu == 0, use cpu. + --test_metadata TEST_METADATA + test metadata. + --output_dir OUTPUT_DIR + output dir. +``` +`./local/synthesize_e2e.sh` calls `${BIN_DIR}/synthesize_e2e.py`, which can synthesize waveform from text file. +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} +``` +```text +usage: synthesize_e2e.py [-h] [--config CONFIG] [--ckpt CKPT] + [--phones_dict PHONES_DICT] [--lang LANG] + [--inference_dir INFERENCE_DIR] [--ngpu NGPU] + [--text TEXT] [--output_dir OUTPUT_DIR] + +Synthesize with VITS + +optional arguments: + -h, --help show this help message and exit + --config CONFIG Config of VITS. + --ckpt CKPT Checkpoint file of VITS. + --phones_dict PHONES_DICT + phone vocabulary file. + --lang LANG Choose model language. zh or en + --inference_dir INFERENCE_DIR + dir to save inference models + --ngpu NGPU if ngpu == 0, use cpu. + --text TEXT text to synthesize, a 'utt_id sentence' pair per line. + --output_dir OUTPUT_DIR + output dir. +``` +1. `--config`, `--ckpt`, and `--phones_dict` are arguments for acoustic model, which correspond to the 3 files in the VITS pretrained model. +2. `--lang` is the model language, which can be `zh` or `en`. +3. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. +4. `--text` is the text file, which contains sentences to synthesize. +5. `--output_dir` is the directory to save synthesized audio files. +6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. + +## Pretrained Model diff --git a/examples/csmsc/voc1/README.md b/examples/csmsc/voc1/README.md index d5bec1cd79d21e311d1859b4ef1dde1d59400788..4646a034599a6f71e613b17d42810c267d870df8 100644 --- a/examples/csmsc/voc1/README.md +++ b/examples/csmsc/voc1/README.md @@ -65,7 +65,7 @@ Train a ParallelWaveGAN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG ParallelWaveGAN config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA diff --git a/examples/csmsc/voc3/README.md b/examples/csmsc/voc3/README.md index e188bcb35787a25ddad2f0318238c923a1ff014f..09fb8836c58c03fd000090ab40d04937dcd51c35 100644 --- a/examples/csmsc/voc3/README.md +++ b/examples/csmsc/voc3/README.md @@ -63,7 +63,7 @@ Train a Multi-Band MelGAN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG Multi-Band MelGAN config file. --train-metadata TRAIN_METADATA training data. 
--dev-metadata DEV_METADATA diff --git a/examples/csmsc/voc4/README.md b/examples/csmsc/voc4/README.md index 19836134e15c676a05ff35c8ac6cc6af3d1605a5..f1a132a84d7a83bba7cc5d94caea7aa1f0d36398 100644 --- a/examples/csmsc/voc4/README.md +++ b/examples/csmsc/voc4/README.md @@ -63,7 +63,7 @@ Train a Style MelGAN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG Style MelGAN config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA diff --git a/examples/csmsc/voc5/README.md b/examples/csmsc/voc5/README.md index 4c38b5987c9d5fed287d205e1224aa0a511449d3..ef552fd3078e00c5ac030e8a2e732043783e3f19 100644 --- a/examples/csmsc/voc5/README.md +++ b/examples/csmsc/voc5/README.md @@ -63,7 +63,7 @@ Train a HiFiGAN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG HiFiGAN config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA diff --git a/examples/csmsc/voc6/README.md b/examples/csmsc/voc6/README.md index 0e5ce633411f1e58dec48ea2141575d5a77353f9..b48c36414b607732ed66741dedb2f27270ba6e84 100644 --- a/examples/csmsc/voc6/README.md +++ b/examples/csmsc/voc6/README.md @@ -63,7 +63,7 @@ Train a WaveRNN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG WaveRNN config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA diff --git a/examples/ljspeech/tts0/README.md b/examples/ljspeech/tts0/README.md index e3292957b09b8226a01515c01e6c05eb130daca3..85d9e448b8422e32d9058a18d1ac413aff69cce7 100644 --- a/examples/ljspeech/tts0/README.md +++ b/examples/ljspeech/tts0/README.md @@ -103,12 +103,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p ``` ```text usage: synthesize.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}] + [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--ngpu NGPU] [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] @@ -117,11 +117,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc} + --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. 
--am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -133,10 +132,10 @@ optional arguments: speaker id map file. --voice-cloning VOICE_CLONING whether training voice cloning model. - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -152,12 +151,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp ``` ```text usage: synthesize_e2e.py [-h] - [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}] + [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--lang LANG] [--inference_dir INFERENCE_DIR] [--ngpu NGPU] @@ -167,11 +166,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc} + --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -182,10 +180,10 @@ optional arguments: --speaker_dict SPEAKER_DICT speaker id map file. --spk_id SPK_ID spk id for multi speaker acoustic model - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -198,9 +196,9 @@ optional arguments: output dir. ``` 1. `--am` is acoustic model type with the format {model_name}_{dataset} -2. `--am_config`, `--am_checkpoint`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the Tacotron2 pretrained model. +2. 
`--am_config`, `--am_ckpt`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the Tacotron2 pretrained model. 3. `--voc` is vocoder type with the format {model_name}_{dataset} -4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. +4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. 5. `--lang` is the model language, which can be `zh` or `en`. 6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. 7. `--text` is the text file, which contains sentences to synthesize. diff --git a/examples/ljspeech/tts1/README.md b/examples/ljspeech/tts1/README.md index 9f82185cada1d4873707bb32f36ac4b080848082..85621653f761b05ac382efe83e5f1daa694a06e6 100644 --- a/examples/ljspeech/tts1/README.md +++ b/examples/ljspeech/tts1/README.md @@ -58,7 +58,7 @@ Train a TransformerTTS model with LJSpeech TTS dataset. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG TransformerTTS config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA diff --git a/examples/ljspeech/tts3/README.md b/examples/ljspeech/tts3/README.md index 8a666193fd5bf4fbd34c01b65e4fc99717ea8686..fb1f9f4f71296656098b6788f77cf3d970d39ac2 100644 --- a/examples/ljspeech/tts3/README.md +++ b/examples/ljspeech/tts3/README.md @@ -109,12 +109,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p ``` ``text usage: synthesize.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--ngpu NGPU] [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] @@ -123,11 +123,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -139,10 +138,10 @@ optional arguments: speaker id map file. --voice-cloning VOICE_CLONING whether training voice cloning model. 
- --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -158,12 +157,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp ``` ```text usage: synthesize_e2e.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--lang LANG] [--inference_dir INFERENCE_DIR] [--ngpu NGPU] @@ -173,11 +172,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -188,10 +186,10 @@ optional arguments: --speaker_dict SPEAKER_DICT speaker id map file. --spk_id SPK_ID spk id for multi speaker acoustic model - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -204,9 +202,9 @@ optional arguments: output dir. ``` 1. `--am` is acoustic model type with the format {model_name}_{dataset} -2. `--am_config`, `--am_checkpoint`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the fastspeech2 pretrained model. +2. `--am_config`, `--am_ckpt`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the fastspeech2 pretrained model. 3. `--voc` is vocoder type with the format {model_name}_{dataset} -4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. +4. 
`--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. 5. `--lang` is the model language, which can be `zh` or `en`. 6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. 7. `--text` is the text file, which contains sentences to synthesize. diff --git a/examples/ljspeech/voc1/README.md b/examples/ljspeech/voc1/README.md index 491444261d32df254271e319dc283ab91742f1ba..d16c0e35fb2fcf13e1ec52ef608db925dc945f51 100644 --- a/examples/ljspeech/voc1/README.md +++ b/examples/ljspeech/voc1/README.md @@ -65,7 +65,7 @@ Train a ParallelWaveGAN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG ParallelWaveGAN config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA diff --git a/examples/ljspeech/voc5/README.md b/examples/ljspeech/voc5/README.md index 8305150422b523155e9d4b62a434e08890eedaa7..d856cfecfdebed32cd33b6ef90285c3c1ec5299a 100644 --- a/examples/ljspeech/voc5/README.md +++ b/examples/ljspeech/voc5/README.md @@ -57,15 +57,13 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER] - [--run-benchmark RUN_BENCHMARK] - [--profiler_options PROFILER_OPTIONS] + [--ngpu NGPU] -Train a ParallelWaveGAN model. +Train a HiFiGAN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG HiFiGAN config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA @@ -73,19 +71,6 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu == 0, use cpu. - -benchmark: - arguments related to benchmark. - - --batch-size BATCH_SIZE - batch size. - --max-iter MAX_ITER train max steps. - --run-benchmark RUN_BENCHMARK - runing benchmark or not, if True, use the --batch-size - and --max-iter. - --profiler_options PROFILER_OPTIONS - The option of profiler, which should be in format - "key1=value1;key2=value2;key3=value3". ``` 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. 
diff --git a/examples/vctk/tts3/README.md b/examples/vctk/tts3/README.md index 8864910878a8292d2afacdbd3442b9de101931ed..0b0ce09349dbf11e0823392e7ace9aeb9c1033cc 100644 --- a/examples/vctk/tts3/README.md +++ b/examples/vctk/tts3/README.md @@ -112,12 +112,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p ``` ```text usage: synthesize.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--ngpu NGPU] [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] @@ -126,11 +126,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -142,10 +141,10 @@ optional arguments: speaker id map file. --voice-cloning VOICE_CLONING whether training voice cloning model. - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. 
@@ -161,12 +160,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp ``` ```text usage: synthesize_e2e.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--lang LANG] [--inference_dir INFERENCE_DIR] [--ngpu NGPU] @@ -176,11 +175,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -191,10 +189,10 @@ optional arguments: --speaker_dict SPEAKER_DICT speaker id map file. --spk_id SPK_ID spk id for multi speaker acoustic model - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -207,9 +205,9 @@ optional arguments: output dir. ``` 1. `--am` is acoustic model type with the format {model_name}_{dataset} -2. `--am_config`, `--am_checkpoint`, `--am_stat`, `--phones_dict` `--speaker_dict` are arguments for acoustic model, which correspond to the 5 files in the fastspeech2 pretrained model. +2. `--am_config`, `--am_ckpt`, `--am_stat`, `--phones_dict` `--speaker_dict` are arguments for acoustic model, which correspond to the 5 files in the fastspeech2 pretrained model. 3. `--voc` is vocoder type with the format {model_name}_{dataset} -4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. +4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. 5. `--lang` is the model language, which can be `zh` or `en`. 6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. 7. `--text` is the text file, which contains sentences to synthesize. 
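The notes above enumerate the flags that `synthesize_e2e.py` expects. For illustration, a hypothetical VCTK invocation (FastSpeech2 acoustic model plus the newly listed `hifigan_vctk` vocoder) could be assembled as follows; every path is a placeholder, so substitute the files produced by your own training run or downloaded pretrained models.

```python
# Hypothetical driver that assembles a synthesize_e2e.py command line from the
# flags documented in the help message above. Every path is a placeholder for
# your own trained (or downloaded pretrained) model files.
import subprocess

cmd = [
    "python3", "synthesize_e2e.py",
    "--am", "fastspeech2_vctk",                 # {model_name}_{dataset}
    "--am_config", "path/to/fastspeech2_vctk/default.yaml",
    "--am_ckpt", "path/to/fastspeech2_vctk/checkpoint.pdz",
    "--am_stat", "path/to/fastspeech2_vctk/speech_stats.npy",
    "--phones_dict", "path/to/fastspeech2_vctk/phone_id_map.txt",
    "--speaker_dict", "path/to/fastspeech2_vctk/speaker_id_map.txt",
    "--spk_id", "0",
    "--voc", "hifigan_vctk",                    # {model_name}_{dataset}
    "--voc_config", "path/to/hifigan_vctk/default.yaml",
    "--voc_ckpt", "path/to/hifigan_vctk/checkpoint.pdz",
    "--voc_stat", "path/to/hifigan_vctk/feats_stats.npy",
    "--lang", "en",
    "--text", "path/to/sentences_en.txt",
    "--output_dir", "path/to/test_e2e",
    "--ngpu", "1",
]
subprocess.run(cmd, check=True)
```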
diff --git a/examples/vctk/voc1/README.md b/examples/vctk/voc1/README.md index 45ba51013e1399a2f6916d0f1b7c0723e22618d2..a0e06a4206d8c214119b187164396fe9a0b1711b 100644 --- a/examples/vctk/voc1/README.md +++ b/examples/vctk/voc1/README.md @@ -70,7 +70,7 @@ Train a ParallelWaveGAN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG ParallelWaveGAN config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA diff --git a/examples/vctk/voc5/README.md b/examples/vctk/voc5/README.md index 514af4679a13242cd9b7cb4663e520805a1275e1..f2cbf27d21706d0702e46a20ff57aabf737de6a2 100644 --- a/examples/vctk/voc5/README.md +++ b/examples/vctk/voc5/README.md @@ -62,15 +62,13 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER] - [--run-benchmark RUN_BENCHMARK] - [--profiler_options PROFILER_OPTIONS] + [--ngpu NGPU] -Train a ParallelWaveGAN model. +Train a HiFiGAN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG HiFiGAN config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA @@ -78,19 +76,6 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu == 0, use cpu. - -benchmark: - arguments related to benchmark. - - --batch-size BATCH_SIZE - batch size. - --max-iter MAX_ITER train max steps. - --run-benchmark RUN_BENCHMARK - runing benchmark or not, if True, use the --batch-size - and --max-iter. - --profiler_options PROFILER_OPTIONS - The option of profiler, which should be in format - "key1=value1;key2=value2;key3=value3". ``` 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. 
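The vocoder help texts above document `--ngpu` as "if ngpu == 0, use cpu". A minimal sketch of that device selection is shown below, assuming the generic `paddle.set_device` API; the real trainers additionally handle multi-GPU / distributed launch, which is omitted here.

```python
# Minimal sketch of the documented "--ngpu == 0 means CPU" behaviour.
# Assumes the standard paddle.set_device API; distributed training setup
# used by the real train.py scripts is omitted.
import paddle


def select_device(ngpu: int) -> None:
    if ngpu == 0:
        paddle.set_device("cpu")
    elif ngpu > 0:
        paddle.set_device("gpu")
    else:
        raise ValueError(f"ngpu must be >= 0, got {ngpu}")


# Example: select_device(args.ngpu)
```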
diff --git a/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py b/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py index c70821e78fe6e4063d74e8c5608ede225ed1b230..4c733dc9b05ba8880d415482d7c194c910039f92 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py +++ b/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py @@ -243,8 +243,7 @@ def main(): # parse args and config and redirect to train_sp parser = argparse.ArgumentParser(description="Train a HiFiGAN model.") - parser.add_argument( - "--config", type=str, help="config file to overwrite default config.") + parser.add_argument("--config", type=str, help="HiFiGAN config file.") parser.add_argument("--train-metadata", type=str, help="training data.") parser.add_argument("--dev-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") diff --git a/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py b/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py index 27ffded63b3621c0f2110815b27fd4420ef0bc5a..3b3ebb4788e2cd4b58119996f74c8dc0d1bfc46b 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py +++ b/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py @@ -233,7 +233,7 @@ def main(): parser = argparse.ArgumentParser( description="Train a Multi-Band MelGAN model.") parser.add_argument( - "--config", type=str, help="config file to overwrite default config.") + "--config", type=str, help="Multi-Band MelGAN config file.") parser.add_argument("--train-metadata", type=str, help="training data.") parser.add_argument("--dev-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") diff --git a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py index 92de7a2c4e7a04ed28b7b30dfa47be4796acc93f..b26407028928cec639d47f576e2bb1f6766e1990 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py +++ b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py @@ -208,7 +208,7 @@ def main(): parser = argparse.ArgumentParser( description="Train a ParallelWaveGAN model.") parser.add_argument( - "--config", type=str, help="config file to overwrite default config.") + "--config", type=str, help="ParallelWaveGAN config file.") parser.add_argument("--train-metadata", type=str, help="training data.") parser.add_argument("--dev-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") diff --git a/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py b/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py index be3ba74251d92cf90be713651837205fa8dc582a..a87cc7a182fdc88d9770021969b4d4248d87e83a 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py +++ b/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py @@ -224,8 +224,7 @@ def main(): # parse args and config and redirect to train_sp parser = argparse.ArgumentParser(description="Train a Style MelGAN model.") - parser.add_argument( - "--config", type=str, help="config file to overwrite default config.") + parser.add_argument("--config", type=str, help="Style MelGAN config file.") parser.add_argument("--train-metadata", type=str, help="training data.") parser.add_argument("--dev-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") diff --git a/paddlespeech/t2s/exps/transformer_tts/train.py b/paddlespeech/t2s/exps/transformer_tts/train.py index 
45ecb269bac033fed4287e5083ced6ce92b89f35..da48b6b99700ed49e5c815bf6b6f14c8eecfae95 100644 --- a/paddlespeech/t2s/exps/transformer_tts/train.py +++ b/paddlespeech/t2s/exps/transformer_tts/train.py @@ -160,7 +160,7 @@ def main(): parser = argparse.ArgumentParser(description="Train a TransformerTTS " "model with LJSpeech TTS dataset.") parser.add_argument( - "--config", type=str, help="config file to overwrite default config.") + "--config", type=str, help="TransformerTTS config file.") parser.add_argument("--train-metadata", type=str, help="training data.") parser.add_argument("--dev-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") diff --git a/paddlespeech/t2s/exps/vits/train.py b/paddlespeech/t2s/exps/vits/train.py index b921f92af75ba367d702be94050e563538e7c755..dbda8b7177bca068ecaeabe41679a93e153aba35 100644 --- a/paddlespeech/t2s/exps/vits/train.py +++ b/paddlespeech/t2s/exps/vits/train.py @@ -226,9 +226,8 @@ def train_sp(args, config): def main(): # parse args and config and redirect to train_sp - parser = argparse.ArgumentParser(description="Train a HiFiGAN model.") - parser.add_argument( - "--config", type=str, help="config file to overwrite default config.") + parser = argparse.ArgumentParser(description="Train a VITS model.") + parser.add_argument("--config", type=str, help="VITS config file") parser.add_argument("--train-metadata", type=str, help="training data.") parser.add_argument("--dev-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") diff --git a/paddlespeech/t2s/exps/wavernn/train.py b/paddlespeech/t2s/exps/wavernn/train.py index 8661d311d218bda58142a846f3dedce5a07ffabf..cf24ea26888002afcfba19449ec8ac5db6efe517 100644 --- a/paddlespeech/t2s/exps/wavernn/train.py +++ b/paddlespeech/t2s/exps/wavernn/train.py @@ -180,8 +180,7 @@ def main(): # parse args and config and redirect to train_sp parser = argparse.ArgumentParser(description="Train a WaveRNN model.") - parser.add_argument( - "--config", type=str, help="config file to overwrite default config.") + parser.add_argument("--config", type=str, help="WaveRNN config file.") parser.add_argument("--train-metadata", type=str, help="training data.") parser.add_argument("--dev-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.")
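All of the trainer scripts touched above expose the same small CLI, and the change only rewords each `--config` help string to name its model (HiFiGAN, Multi-Band MelGAN, ParallelWaveGAN, Style MelGAN, TransformerTTS, VITS, WaveRNN). A condensed sketch of that shared argparse scaffold follows; it is illustrative rather than any single script verbatim, and the `--ngpu` default value is assumed.

```python
# Condensed sketch of the argparse scaffold shared by the vocoder / TTS
# trainers edited above; each real script instantiates it with its own model
# name in the description and --config help string.
import argparse


def build_parser(model_name: str) -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(description=f"Train a {model_name} model.")
    parser.add_argument("--config", type=str, help=f"{model_name} config file.")
    parser.add_argument("--train-metadata", type=str, help="training data.")
    parser.add_argument("--dev-metadata", type=str, help="dev data.")
    parser.add_argument("--output-dir", type=str, help="output dir.")
    # Default value assumed for this sketch; the help text matches the READMEs.
    parser.add_argument(
        "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
    return parser


# Example: args = build_parser("HiFiGAN").parse_args()
```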