diff --git a/.pre-commit-hooks/copyright-check.hook b/.pre-commit-hooks/copyright-check.hook
index 26044c29e4fdc827abb4ba2d415db66c780fd366..761edbc018bf840626367d4bed5b0083e0021275 100644
--- a/.pre-commit-hooks/copyright-check.hook
+++ b/.pre-commit-hooks/copyright-check.hook
@@ -19,7 +19,7 @@ import subprocess
import platform
COPYRIGHT = '''
-Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
diff --git a/README.md b/README.md
index a43e21bd2e4970b4da004e98a3c2e2c076e275d1..c9d4796c8c7c77ff1e0526c3f46a39a24d8794a3 100644
--- a/README.md
+++ b/README.md
@@ -24,6 +24,8 @@
| Documents
| Models List
| AIStudio Courses
+ | Paper
+ | Gitee
diff --git a/README_cn.md b/README_cn.md
index ed5c6a90dcf2d773002691ed4186282ae67386e8..c751b061dcbbfd4ee8c66bf494e006028a0a9ae1 100644
--- a/README_cn.md
+++ b/README_cn.md
@@ -25,6 +25,8 @@
| 教程文档
| 模型列表
| AIStudio 课程
+ | 论文
+ | Gitee
diff --git a/demos/README.md b/demos/README.md
index 8abd67249d7ad939db6d79d7b8160b8efa7cb8ba..2a306df6b1e1b3648b7306506adc01d5e0ffcdf2 100644
--- a/demos/README.md
+++ b/demos/README.md
@@ -2,14 +2,14 @@
([简体中文](./README_cn.md)|English)
-The directory containes many speech applications in multi scenarios.
+This directory contains many speech applications in multiple scenarios.
* audio searching - mass audio similarity retrieval
* audio tagging - multi-label tagging of an audio file
-* automatic_video_subtitiles - generate subtitles from a video
+* automatic_video_subtitles - generate subtitles from a video
* metaverse - 2D AR with TTS
* punctuation_restoration - restore punctuation from raw text
-* speech recogintion - recognize text of an audio file
+* speech recognition - recognize text of an audio file
-* speech server - Server for Speech Task, e.g. ASR,TTS,CLS
-* streaming asr server - receive audio stream from websocket, and recognize to transcript.
+* speech server - server for speech tasks, e.g. ASR, TTS, CLS
+* streaming asr server - receive an audio stream from a websocket and recognize it into a transcript.
* speech translation - end to end speech translation
diff --git a/demos/audio_searching/src/encode.py b/demos/audio_searching/src/encode.py
index c89a11c1f86b86a32dd70477a27e3842dffccfe9..f6bcb00adda6fb7b6d07327a860b7a6e877d352e 100644
--- a/demos/audio_searching/src/encode.py
+++ b/demos/audio_searching/src/encode.py
@@ -14,7 +14,7 @@
import numpy as np
from logs import LOGGER
-from paddlespeech.cli import VectorExecutor
+from paddlespeech.cli.vector import VectorExecutor
vector_executor = VectorExecutor()
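
For context, the new per-task import is used as below; a minimal sketch, assuming the `audio_file` and `device` keywords from the speaker_verification demo (exact keyword names may vary across versions):

```python
import paddle
from paddlespeech.cli.vector import VectorExecutor

# Extract a speaker embedding from a 16 kHz wav file.
vector_executor = VectorExecutor()
audio_emb = vector_executor(
    audio_file='./85236145389.wav',  # sample wav used in the demos
    device=paddle.get_device())
print(audio_emb.shape)  # a 1-D embedding, e.g. (192,) for ECAPA-TDNN
```
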
diff --git a/demos/audio_tagging/README.md b/demos/audio_tagging/README.md
index 9d4af0be6c286ecb5ea09baa9178217d67c3f20c..fc4a334ea053d30402cf112848ae1cd81f9f1f57 100644
--- a/demos/audio_tagging/README.md
+++ b/demos/audio_tagging/README.md
@@ -57,7 +57,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/cat.wav https://paddlespe
- Python API
```python
import paddle
- from paddlespeech.cli import CLSExecutor
+ from paddlespeech.cli.cls import CLSExecutor
cls_executor = CLSExecutor()
result = cls_executor(
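
The truncated call above completes roughly as follows; a sketch assuming the `topk`, `audio_file`, and `device` keywords shown in the audio_tagging demo:

```python
import paddle
from paddlespeech.cli.cls import CLSExecutor

cls_executor = CLSExecutor()
# Multi-label tagging of an audio clip: returns the top-k labels with scores.
result = cls_executor(
    topk=10,                      # number of labels to return
    audio_file='./cat.wav',       # clip fetched by the wget command above
    device=paddle.get_device())
print('CLS Result:\n{}'.format(result))
```
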
diff --git a/demos/audio_tagging/README_cn.md b/demos/audio_tagging/README_cn.md
index 79f87bf8c224e4d0c57368130d04fbc7b32d811e..36b5d8aaf231342f5addd4ba31129881ea1f19f5 100644
--- a/demos/audio_tagging/README_cn.md
+++ b/demos/audio_tagging/README_cn.md
@@ -57,7 +57,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/cat.wav https://paddlespe
- Python API
```python
import paddle
- from paddlespeech.cli import CLSExecutor
+ from paddlespeech.cli.cls import CLSExecutor
cls_executor = CLSExecutor()
result = cls_executor(
diff --git a/demos/automatic_video_subtitiles/README.md b/demos/automatic_video_subtitiles/README.md
index db6da40db0add1b61d95aecc389d645f001b735f..b815425ec426b1b6b1bc66e1235b52afb7dffa0e 100644
--- a/demos/automatic_video_subtitiles/README.md
+++ b/demos/automatic_video_subtitiles/README.md
@@ -28,7 +28,8 @@ ffmpeg -i subtitle_demo1.mp4 -ac 1 -ar 16000 -vn input.wav
- Python API
```python
import paddle
- from paddlespeech.cli import ASRExecutor, TextExecutor
+ from paddlespeech.cli.asr import ASRExecutor
+ from paddlespeech.cli.text import TextExecutor
asr_executor = ASRExecutor()
text_executor = TextExecutor()
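
Together the two executors form the subtitle pipeline; a minimal sketch, assuming the `audio_file`, `text`, and `task` keywords used by recognize.py (treat them as assumptions if your version differs):

```python
import paddle
from paddlespeech.cli.asr import ASRExecutor
from paddlespeech.cli.text import TextExecutor

asr_executor = ASRExecutor()
text_executor = TextExecutor()

# 1) Transcribe the 16 kHz mono wav produced by the ffmpeg command above.
text = asr_executor(audio_file='input.wav', device=paddle.get_device())
# 2) Restore punctuation on the raw transcript to get a readable subtitle.
subtitle = text_executor(text=text, task='punc', device=paddle.get_device())
print(subtitle)
```
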
diff --git a/demos/automatic_video_subtitiles/README_cn.md b/demos/automatic_video_subtitiles/README_cn.md
index fc7b2cf6a04203dee389f4e2d640d3a3de57cc3a..990ff6dbdf0cb2e0c31ed296b44783a4b6254711 100644
--- a/demos/automatic_video_subtitiles/README_cn.md
+++ b/demos/automatic_video_subtitiles/README_cn.md
@@ -23,7 +23,8 @@ ffmpeg -i subtitle_demo1.mp4 -ac 1 -ar 16000 -vn input.wav
- Python API
```python
import paddle
- from paddlespeech.cli import ASRExecutor, TextExecutor
+ from paddlespeech.cli.asr import ASRExecutor
+ from paddlespeech.cli.text import TextExecutor
asr_executor = ASRExecutor()
text_executor = TextExecutor()
diff --git a/demos/automatic_video_subtitiles/recognize.py b/demos/automatic_video_subtitiles/recognize.py
index 72e3c3a8597283916762d92dcc9f5e95c6261d2f..304599d1990c4da5e37df803afb8f65aef033c30 100644
--- a/demos/automatic_video_subtitiles/recognize.py
+++ b/demos/automatic_video_subtitiles/recognize.py
@@ -16,8 +16,8 @@ import os
import paddle
-from paddlespeech.cli import ASRExecutor
-from paddlespeech.cli import TextExecutor
+from paddlespeech.cli.asr import ASRExecutor
+from paddlespeech.cli.text import TextExecutor
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
diff --git a/demos/punctuation_restoration/README.md b/demos/punctuation_restoration/README.md
index 518d437dc20a34251e24f58298b0f49dcd57e967..458ab92f9b230069f11464afe1ff2b5331a08c73 100644
--- a/demos/punctuation_restoration/README.md
+++ b/demos/punctuation_restoration/README.md
@@ -42,7 +42,7 @@ The input of this demo should be a text of the specific language that can be pas
- Python API
```python
import paddle
- from paddlespeech.cli import TextExecutor
+ from paddlespeech.cli.text import TextExecutor
text_executor = TextExecutor()
result = text_executor(
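
For reference, the call typically completes along these lines; a sketch assuming the `text` and `task` keywords from the punctuation_restoration demo:

```python
from paddlespeech.cli.text import TextExecutor

text_executor = TextExecutor()
# Restore punctuation for a raw Chinese sentence; 'punc' is the task name
# used by the demo (an assumption to verify against your version).
result = text_executor(
    text='今天的天气真不错啊你下午有空吗我想约你一起去吃饭',
    task='punc')
print(result)
```
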
diff --git a/demos/punctuation_restoration/README_cn.md b/demos/punctuation_restoration/README_cn.md
index 9d4be8bf08a8d512cd4189367cca9cd8590a60f4..f25acdadbd31385e13d64677ea7a39f3ce5f22c9 100644
--- a/demos/punctuation_restoration/README_cn.md
+++ b/demos/punctuation_restoration/README_cn.md
@@ -44,7 +44,7 @@
- Python API
```python
import paddle
- from paddlespeech.cli import TextExecutor
+ from paddlespeech.cli.text import TextExecutor
text_executor = TextExecutor()
result = text_executor(
diff --git a/demos/speaker_verification/README.md b/demos/speaker_verification/README.md
index 63dc9294ec7a8da7abe11465502c466e9ca40e8a..900b5ae4088a4839866a22c094bf159f4a07af6f 100644
--- a/demos/speaker_verification/README.md
+++ b/demos/speaker_verification/README.md
@@ -96,7 +96,7 @@ wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav
- Python API
```python
- from paddlespeech.cli import VectorExecutor
+ from paddlespeech.cli.vector import VectorExecutor
vector_executor = VectorExecutor()
audio_emb = vector_executor(
diff --git a/demos/speaker_verification/README_cn.md b/demos/speaker_verification/README_cn.md
index 07eeac2ee8a469df4b86b258a14df0f3430d7e58..f6afa86ac832e88fa07d6673041c97e1668bc12e 100644
--- a/demos/speaker_verification/README_cn.md
+++ b/demos/speaker_verification/README_cn.md
@@ -95,7 +95,7 @@ wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav
- Python API
```python
import paddle
- from paddlespeech.cli import VectorExecutor
+ from paddlespeech.cli.vector import VectorExecutor
vector_executor = VectorExecutor()
audio_emb = vector_executor(
diff --git a/demos/speech_recognition/README.md b/demos/speech_recognition/README.md
index 6493e8e613800ea163b8669842c93a7dd82d68ac..c815a88af2de7876404dd6f929e27f37bb3f2edb 100644
--- a/demos/speech_recognition/README.md
+++ b/demos/speech_recognition/README.md
@@ -58,7 +58,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee
- Python API
```python
import paddle
- from paddlespeech.cli import ASRExecutor
+ from paddlespeech.cli.asr import ASRExecutor
asr_executor = ASRExecutor()
text = asr_executor(
diff --git a/demos/speech_recognition/README_cn.md b/demos/speech_recognition/README_cn.md
index 8d631d89ca1d61196cbf167b3f263cfd478fb571..13aa9f27755482294d4aed8fc25d75f485289224 100644
--- a/demos/speech_recognition/README_cn.md
+++ b/demos/speech_recognition/README_cn.md
@@ -56,7 +56,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee
- Python API
```python
import paddle
- from paddlespeech.cli import ASRExecutor
+ from paddlespeech.cli.asr import ASRExecutor
asr_executor = ASRExecutor()
text = asr_executor(
diff --git a/demos/speech_translation/README.md b/demos/speech_translation/README.md
index f675a4eda0a57b9c33633d9b9a9c619a99e8e712..00a9c79324a15f54326454c13cf0c715472ac3a2 100644
--- a/demos/speech_translation/README.md
+++ b/demos/speech_translation/README.md
@@ -47,7 +47,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee
- Python API
```python
import paddle
- from paddlespeech.cli import STExecutor
+ from paddlespeech.cli.st import STExecutor
st_executor = STExecutor()
text = st_executor(
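
A sketch of the completed call, assuming the `audio_file`, `src_lang`, and `tgt_lang` keywords from the speech_translation demo:

```python
import paddle
from paddlespeech.cli.st import STExecutor

st_executor = STExecutor()
# End-to-end speech translation: English audio in, Chinese text out.
text = st_executor(
    audio_file='./en.wav',       # wav fetched by the wget command above
    src_lang='en',
    tgt_lang='zh',
    device=paddle.get_device())
print(text)
```
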
diff --git a/demos/speech_translation/README_cn.md b/demos/speech_translation/README_cn.md
index bad9b392f9122d02c9fa3c365d5fc54bb82ed492..5119bf9f4eea5a164966ad0a5c9e1c71768ad55e 100644
--- a/demos/speech_translation/README_cn.md
+++ b/demos/speech_translation/README_cn.md
@@ -47,7 +47,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee
- Python API
```python
import paddle
- from paddlespeech.cli import STExecutor
+ from paddlespeech.cli.st import STExecutor
st_executor = STExecutor()
text = st_executor(
diff --git a/demos/text_to_speech/README.md b/demos/text_to_speech/README.md
index 2df72a82dec88ddc55505c9575721aee2de09536..389847a129b8d10787679c4442b8a1999ca044c5 100644
--- a/demos/text_to_speech/README.md
+++ b/demos/text_to_speech/README.md
@@ -77,7 +77,7 @@ The input of this demo should be a text of the specific language that can be pas
- Python API
```python
import paddle
- from paddlespeech.cli import TTSExecutor
+ from paddlespeech.cli.tts import TTSExecutor
tts_executor = TTSExecutor()
wav_file = tts_executor(
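
A sketch of the completed call, assuming the `text`, `output`, `am`, and `voc` keywords from the text_to_speech demo:

```python
import paddle
from paddlespeech.cli.tts import TTSExecutor

tts_executor = TTSExecutor()
# Synthesize a wav from text; `am` and `voc` follow the
# {model_name}_{dataset} convention used throughout the examples.
wav_file = tts_executor(
    text='今天的天气不错啊',
    output='output.wav',
    am='fastspeech2_csmsc',
    voc='pwgan_csmsc',
    lang='zh',
    device=paddle.get_device())
print('Wave file has been generated: {}'.format(wav_file))
```
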
diff --git a/demos/text_to_speech/README_cn.md b/demos/text_to_speech/README_cn.md
index 7e02b962483b4b0959fa9b9fe0c082bb0a6fdc3e..f967d3d4da47647037ac7a035b3c0ca930762691 100644
--- a/demos/text_to_speech/README_cn.md
+++ b/demos/text_to_speech/README_cn.md
@@ -80,7 +80,7 @@
- Python API
```python
import paddle
- from paddlespeech.cli import TTSExecutor
+ from paddlespeech.cli.tts import TTSExecutor
tts_executor = TTSExecutor()
wav_file = tts_executor(
diff --git a/examples/aishell3/tts3/README.md b/examples/aishell3/tts3/README.md
index 9c505679c91ca9dad9cf7d5f7dee0ec7970a682d..31c99898ccc7839b835a0fbd7daec550a36de340 100644
--- a/examples/aishell3/tts3/README.md
+++ b/examples/aishell3/tts3/README.md
@@ -113,12 +113,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p
```
```text
usage: synthesize.py [-h]
- [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}]
+ [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}]
[--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
[--am_stat AM_STAT] [--phones_dict PHONES_DICT]
[--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT]
[--voice-cloning VOICE_CLONING]
- [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}]
+ [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}]
[--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
[--voc_stat VOC_STAT] [--ngpu NGPU]
[--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR]
@@ -127,11 +127,10 @@ Synthesize with acoustic model & vocoder
optional arguments:
-h, --help show this help message and exit
- --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}
+ --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}
Choose acoustic model type of tts task.
--am_config AM_CONFIG
- Config of acoustic model. Use deault config when it is
- None.
+ Config of acoustic model.
--am_ckpt AM_CKPT Checkpoint file of acoustic model.
--am_stat AM_STAT mean and standard deviation used to normalize
spectrogram when training acoustic model.
@@ -143,10 +142,10 @@ optional arguments:
speaker id map file.
--voice-cloning VOICE_CLONING
whether training voice cloning model.
- --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}
+ --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}
Choose vocoder type of tts task.
--voc_config VOC_CONFIG
- Config of voc. Use deault config when it is None.
+ Config of voc.
--voc_ckpt VOC_CKPT Checkpoint file of voc.
--voc_stat VOC_STAT mean and standard deviation used to normalize
spectrogram when training voc.
@@ -162,12 +161,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp
```
```text
usage: synthesize_e2e.py [-h]
- [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}]
+ [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}]
[--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
[--am_stat AM_STAT] [--phones_dict PHONES_DICT]
[--tones_dict TONES_DICT]
[--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID]
- [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}]
+ [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}]
[--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
[--voc_stat VOC_STAT] [--lang LANG]
[--inference_dir INFERENCE_DIR] [--ngpu NGPU]
@@ -177,11 +176,10 @@ Synthesize with acoustic model & vocoder
optional arguments:
-h, --help show this help message and exit
- --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}
+ --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}
Choose acoustic model type of tts task.
--am_config AM_CONFIG
- Config of acoustic model. Use deault config when it is
- None.
+ Config of acoustic model.
--am_ckpt AM_CKPT Checkpoint file of acoustic model.
--am_stat AM_STAT mean and standard deviation used to normalize
spectrogram when training acoustic model.
@@ -192,10 +190,10 @@ optional arguments:
--speaker_dict SPEAKER_DICT
speaker id map file.
--spk_id SPK_ID spk id for multi speaker acoustic model
- --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}
+ --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}
Choose vocoder type of tts task.
--voc_config VOC_CONFIG
- Config of voc. Use deault config when it is None.
+ Config of voc.
--voc_ckpt VOC_CKPT Checkpoint file of voc.
--voc_stat VOC_STAT mean and standard deviation used to normalize
spectrogram when training voc.
@@ -208,9 +206,9 @@ optional arguments:
output dir.
```
1. `--am` is acoustic model type with the format {model_name}_{dataset}
-2. `--am_config`, `--am_checkpoint`, `--am_stat`, `--phones_dict` `--speaker_dict` are arguments for acoustic model, which correspond to the 5 files in the fastspeech2 pretrained model.
+2. `--am_config`, `--am_ckpt`, `--am_stat`, `--phones_dict` `--speaker_dict` are arguments for acoustic model, which correspond to the 5 files in the fastspeech2 pretrained model.
3. `--voc` is vocoder type with the format {model_name}_{dataset}
-4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
+4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
5. `--lang` is the model language, which can be `zh` or `en`.
6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder.
7. `--text` is the text file, which contains sentences to synthesize.
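
A note on items 1-4 above: the same `{model_name}_{dataset}` identifiers are accepted by the CLI's Python API; a sketch assuming TTSExecutor takes `am`, `voc`, and `spk_id` keywords (verify against your installed version):

```python
from paddlespeech.cli.tts import TTSExecutor

tts = TTSExecutor()
# Pick a multi-speaker acoustic model and a matching vocoder using the
# same {model_name}_{dataset} names listed in the --am/--voc choices.
tts(text='欢迎使用飞桨语音合成系统',
    am='fastspeech2_aishell3',   # acoustic model trained on AISHELL-3
    voc='pwgan_aishell3',        # vocoder trained on the same dataset
    spk_id=0,                    # speaker id for the multi-speaker model
    lang='zh',
    output='output.wav')
```
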
diff --git a/examples/aishell3/voc1/README.md b/examples/aishell3/voc1/README.md
index e9e012d29ea95cda33eaccf8201bb0d74a90a9f5..a3daf3dfd6889e60f9d71be396bc9b3d2404fe54 100644
--- a/examples/aishell3/voc1/README.md
+++ b/examples/aishell3/voc1/README.md
@@ -68,7 +68,7 @@ Train a ParallelWaveGAN model.
optional arguments:
-h, --help show this help message and exit
- --config CONFIG config file to overwrite default config.
+ --config CONFIG ParallelWaveGAN config file.
--train-metadata TRAIN_METADATA
training data.
--dev-metadata DEV_METADATA
diff --git a/examples/aishell3/voc5/README.md b/examples/aishell3/voc5/README.md
index 84bcd78ef0ca56ef385f56173d866f1cf4c64bfd..c3e3197d63c0e871dabd61e67992335fc8d5d1f9 100644
--- a/examples/aishell3/voc5/README.md
+++ b/examples/aishell3/voc5/README.md
@@ -59,15 +59,13 @@ Here's the complete help message.
```text
usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
[--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
- [--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER]
- [--run-benchmark RUN_BENCHMARK]
- [--profiler_options PROFILER_OPTIONS]
+ [--ngpu NGPU]
-Train a ParallelWaveGAN model.
+Train a HiFiGAN model.
optional arguments:
-h, --help show this help message and exit
- --config CONFIG config file to overwrite default config.
+ --config CONFIG HiFiGAN config file.
--train-metadata TRAIN_METADATA
training data.
--dev-metadata DEV_METADATA
@@ -75,19 +73,6 @@ optional arguments:
--output-dir OUTPUT_DIR
output dir.
--ngpu NGPU if ngpu == 0, use cpu.
-
-benchmark:
- arguments related to benchmark.
-
- --batch-size BATCH_SIZE
- batch size.
- --max-iter MAX_ITER train max steps.
- --run-benchmark RUN_BENCHMARK
- runing benchmark or not, if True, use the --batch-size
- and --max-iter.
- --profiler_options PROFILER_OPTIONS
- The option of profiler, which should be in format
- "key1=value1;key2=value2;key3=value3".
```
1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
diff --git a/examples/csmsc/tts0/README.md b/examples/csmsc/tts0/README.md
index d62c901174ae2ed8e0bb14f93c76f97939b33499..bc7769d1572166b6e27185f282b8ca16b998f40f 100644
--- a/examples/csmsc/tts0/README.md
+++ b/examples/csmsc/tts0/README.md
@@ -103,12 +103,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p
```
```text
usage: synthesize.py [-h]
- [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}]
+ [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}]
[--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
[--am_stat AM_STAT] [--phones_dict PHONES_DICT]
[--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT]
[--voice-cloning VOICE_CLONING]
- [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}]
+ [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}]
[--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
[--voc_stat VOC_STAT] [--ngpu NGPU]
[--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR]
@@ -117,11 +117,10 @@ Synthesize with acoustic model & vocoder
optional arguments:
-h, --help show this help message and exit
- --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}
+ --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}
Choose acoustic model type of tts task.
--am_config AM_CONFIG
- Config of acoustic model. Use deault config when it is
- None.
+ Config of acoustic model.
--am_ckpt AM_CKPT Checkpoint file of acoustic model.
--am_stat AM_STAT mean and standard deviation used to normalize
spectrogram when training acoustic model.
@@ -133,10 +132,10 @@ optional arguments:
speaker id map file.
--voice-cloning VOICE_CLONING
whether training voice cloning model.
- --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}
+ --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}
Choose vocoder type of tts task.
--voc_config VOC_CONFIG
- Config of voc. Use deault config when it is None.
+ Config of voc.
--voc_ckpt VOC_CKPT Checkpoint file of voc.
--voc_stat VOC_STAT mean and standard deviation used to normalize
spectrogram when training voc.
@@ -152,12 +151,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp
```
```text
usage: synthesize_e2e.py [-h]
- [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}]
+ [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}]
[--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
[--am_stat AM_STAT] [--phones_dict PHONES_DICT]
[--tones_dict TONES_DICT]
[--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID]
- [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc}]
+ [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}]
[--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
[--voc_stat VOC_STAT] [--lang LANG]
[--inference_dir INFERENCE_DIR] [--ngpu NGPU]
@@ -167,11 +166,10 @@ Synthesize with acoustic model & vocoder
optional arguments:
-h, --help show this help message and exit
- --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}
+ --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}
Choose acoustic model type of tts task.
--am_config AM_CONFIG
- Config of acoustic model. Use deault config when it is
- None.
+ Config of acoustic model.
--am_ckpt AM_CKPT Checkpoint file of acoustic model.
--am_stat AM_STAT mean and standard deviation used to normalize
spectrogram when training acoustic model.
@@ -182,10 +180,10 @@ optional arguments:
--speaker_dict SPEAKER_DICT
speaker id map file.
--spk_id SPK_ID spk id for multi speaker acoustic model
- --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc}
+ --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}
Choose vocoder type of tts task.
--voc_config VOC_CONFIG
- Config of voc. Use deault config when it is None.
+ Config of voc.
--voc_ckpt VOC_CKPT Checkpoint file of voc.
--voc_stat VOC_STAT mean and standard deviation used to normalize
spectrogram when training voc.
@@ -198,9 +196,9 @@ optional arguments:
output dir.
```
1. `--am` is acoustic model type with the format {model_name}_{dataset}
-2. `--am_config`, `--am_checkpoint`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the Tacotron2 pretrained model.
+2. `--am_config`, `--am_ckpt`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the Tacotron2 pretrained model.
3. `--voc` is vocoder type with the format {model_name}_{dataset}
-4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
+4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
5. `--lang` is the model language, which can be `zh` or `en`.
6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder.
7. `--text` is the text file, which contains sentences to synthesize.
diff --git a/examples/csmsc/tts2/README.md b/examples/csmsc/tts2/README.md
index 1bcfb383f200b6d454eabac2b85e4b23b32780d1..f45561719ba88485e6da2c20674da215f06d4093 100644
--- a/examples/csmsc/tts2/README.md
+++ b/examples/csmsc/tts2/README.md
@@ -109,12 +109,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p
```
```text
usage: synthesize.py [-h]
- [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}]
+ [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}]
[--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
[--am_stat AM_STAT] [--phones_dict PHONES_DICT]
[--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT]
[--voice-cloning VOICE_CLONING]
- [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}]
+ [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}]
[--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
[--voc_stat VOC_STAT] [--ngpu NGPU]
[--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR]
@@ -123,11 +123,10 @@ Synthesize with acoustic model & vocoder
optional arguments:
-h, --help show this help message and exit
- --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}
+ --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}
Choose acoustic model type of tts task.
--am_config AM_CONFIG
- Config of acoustic model. Use deault config when it is
- None.
+ Config of acoustic model.
--am_ckpt AM_CKPT Checkpoint file of acoustic model.
--am_stat AM_STAT mean and standard deviation used to normalize
spectrogram when training acoustic model.
@@ -139,10 +138,10 @@ optional arguments:
speaker id map file.
--voice-cloning VOICE_CLONING
whether training voice cloning model.
- --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}
+ --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}
Choose vocoder type of tts task.
--voc_config VOC_CONFIG
- Config of voc. Use deault config when it is None.
+ Config of voc.
--voc_ckpt VOC_CKPT Checkpoint file of voc.
--voc_stat VOC_STAT mean and standard deviation used to normalize
spectrogram when training voc.
@@ -158,12 +157,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp
```
```text
usage: synthesize_e2e.py [-h]
- [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}]
+ [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}]
[--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
[--am_stat AM_STAT] [--phones_dict PHONES_DICT]
[--tones_dict TONES_DICT]
[--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID]
- [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}]
+ [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}]
[--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
[--voc_stat VOC_STAT] [--lang LANG]
[--inference_dir INFERENCE_DIR] [--ngpu NGPU]
@@ -173,11 +172,10 @@ Synthesize with acoustic model & vocoder
optional arguments:
-h, --help show this help message and exit
- --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}
+ --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}
Choose acoustic model type of tts task.
--am_config AM_CONFIG
- Config of acoustic model. Use deault config when it is
- None.
+ Config of acoustic model.
--am_ckpt AM_CKPT Checkpoint file of acoustic model.
--am_stat AM_STAT mean and standard deviation used to normalize
spectrogram when training acoustic model.
@@ -188,10 +186,10 @@ optional arguments:
--speaker_dict SPEAKER_DICT
speaker id map file.
--spk_id SPK_ID spk id for multi speaker acoustic model
- --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}
+ --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}
Choose vocoder type of tts task.
--voc_config VOC_CONFIG
- Config of voc. Use deault config when it is None.
+ Config of voc.
--voc_ckpt VOC_CKPT Checkpoint file of voc.
--voc_stat VOC_STAT mean and standard deviation used to normalize
spectrogram when training voc.
@@ -204,9 +202,9 @@ optional arguments:
output dir.
```
1. `--am` is acoustic model type with the format {model_name}_{dataset}
-2. `--am_config`, `--am_checkpoint`, `--am_stat`, `--phones_dict` and `--tones_dict` are arguments for acoustic model, which correspond to the 5 files in the speedyspeech pretrained model.
+2. `--am_config`, `--am_ckpt`, `--am_stat`, `--phones_dict` and `--tones_dict` are arguments for acoustic model, which correspond to the 5 files in the speedyspeech pretrained model.
3. `--voc` is vocoder type with the format {model_name}_{dataset}
-4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
+4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
5. `--lang` is the model language, which can be `zh` or `en`.
6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder.
7. `--text` is the text file, which contains sentences to synthesize.
diff --git a/examples/csmsc/tts3/README.md b/examples/csmsc/tts3/README.md
index 1f7dfa0fdd31526f1bf05bca51ed953e6c57227e..371034e772391196ab1b455b0c5f42e86df82b35 100644
--- a/examples/csmsc/tts3/README.md
+++ b/examples/csmsc/tts3/README.md
@@ -111,12 +111,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p
```
```text
usage: synthesize.py [-h]
- [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}]
+ [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}]
[--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
[--am_stat AM_STAT] [--phones_dict PHONES_DICT]
[--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT]
[--voice-cloning VOICE_CLONING]
- [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}]
+ [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}]
[--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
[--voc_stat VOC_STAT] [--ngpu NGPU]
[--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR]
@@ -125,11 +125,10 @@ Synthesize with acoustic model & vocoder
optional arguments:
-h, --help show this help message and exit
- --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}
+ --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}
Choose acoustic model type of tts task.
--am_config AM_CONFIG
- Config of acoustic model. Use deault config when it is
- None.
+ Config of acoustic model.
--am_ckpt AM_CKPT Checkpoint file of acoustic model.
--am_stat AM_STAT mean and standard deviation used to normalize
spectrogram when training acoustic model.
@@ -141,10 +140,10 @@ optional arguments:
speaker id map file.
--voice-cloning VOICE_CLONING
whether training voice cloning model.
- --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}
+ --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}
Choose vocoder type of tts task.
--voc_config VOC_CONFIG
- Config of voc. Use deault config when it is None.
+ Config of voc.
--voc_ckpt VOC_CKPT Checkpoint file of voc.
--voc_stat VOC_STAT mean and standard deviation used to normalize
spectrogram when training voc.
@@ -160,12 +159,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp
```
```text
usage: synthesize_e2e.py [-h]
- [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}]
+ [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}]
[--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
[--am_stat AM_STAT] [--phones_dict PHONES_DICT]
[--tones_dict TONES_DICT]
[--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID]
- [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}]
+ [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}]
[--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
[--voc_stat VOC_STAT] [--lang LANG]
[--inference_dir INFERENCE_DIR] [--ngpu NGPU]
@@ -175,11 +174,10 @@ Synthesize with acoustic model & vocoder
optional arguments:
-h, --help show this help message and exit
- --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}
+ --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}
Choose acoustic model type of tts task.
--am_config AM_CONFIG
- Config of acoustic model. Use deault config when it is
- None.
+ Config of acoustic model.
--am_ckpt AM_CKPT Checkpoint file of acoustic model.
--am_stat AM_STAT mean and standard deviation used to normalize
spectrogram when training acoustic model.
@@ -190,10 +188,10 @@ optional arguments:
--speaker_dict SPEAKER_DICT
speaker id map file.
--spk_id SPK_ID spk id for multi speaker acoustic model
- --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}
+ --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}
Choose vocoder type of tts task.
--voc_config VOC_CONFIG
- Config of voc. Use deault config when it is None.
+ Config of voc.
--voc_ckpt VOC_CKPT Checkpoint file of voc.
--voc_stat VOC_STAT mean and standard deviation used to normalize
spectrogram when training voc.
@@ -204,11 +202,12 @@ optional arguments:
--text TEXT text to synthesize, a 'utt_id sentence' pair per line.
--output_dir OUTPUT_DIR
output dir.
+
```
1. `--am` is acoustic model type with the format {model_name}_{dataset}
-2. `--am_config`, `--am_checkpoint`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the fastspeech2 pretrained model.
+2. `--am_config`, `--am_ckpt`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the fastspeech2 pretrained model.
3. `--voc` is vocoder type with the format {model_name}_{dataset}
-4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
+4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
5. `--lang` is the model language, which can be `zh` or `en`.
6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder.
7. `--text` is the text file, which contains sentences to synthesize.
diff --git a/examples/csmsc/tts3/README_cn.md b/examples/csmsc/tts3/README_cn.md
index f08ca724c8a7b48bb5783c0b5e37d7e64a4d6595..1829b77063e0ec08f5bd8dcb41405ea815d3ec0e 100644
--- a/examples/csmsc/tts3/README_cn.md
+++ b/examples/csmsc/tts3/README_cn.md
@@ -117,12 +117,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p
```
```text
usage: synthesize.py [-h]
- [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}]
+ [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}]
[--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
[--am_stat AM_STAT] [--phones_dict PHONES_DICT]
[--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT]
[--voice-cloning VOICE_CLONING]
- [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}]
+ [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}]
[--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
[--voc_stat VOC_STAT] [--ngpu NGPU]
[--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR]
@@ -131,11 +131,10 @@ Synthesize with acoustic model & vocoder
optional arguments:
-h, --help show this help message and exit
- --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}
+ --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}
Choose acoustic model type of tts task.
--am_config AM_CONFIG
- Config of acoustic model. Use deault config when it is
- None.
+ Config of acoustic model.
--am_ckpt AM_CKPT Checkpoint file of acoustic model.
--am_stat AM_STAT mean and standard deviation used to normalize
spectrogram when training acoustic model.
@@ -147,10 +146,10 @@ optional arguments:
speaker id map file.
--voice-cloning VOICE_CLONING
whether training voice cloning model.
- --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}
+ --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}
Choose vocoder type of tts task.
--voc_config VOC_CONFIG
- Config of voc. Use deault config when it is None.
+ Config of voc.
--voc_ckpt VOC_CKPT Checkpoint file of voc.
--voc_stat VOC_STAT mean and standard deviation used to normalize
spectrogram when training voc.
@@ -167,12 +166,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp
```
```text
usage: synthesize_e2e.py [-h]
- [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}]
+ [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}]
[--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
[--am_stat AM_STAT] [--phones_dict PHONES_DICT]
[--tones_dict TONES_DICT]
[--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID]
- [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}]
+ [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}]
[--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
[--voc_stat VOC_STAT] [--lang LANG]
[--inference_dir INFERENCE_DIR] [--ngpu NGPU]
@@ -182,11 +181,10 @@ Synthesize with acoustic model & vocoder
optional arguments:
-h, --help show this help message and exit
- --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}
+ --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}
Choose acoustic model type of tts task.
--am_config AM_CONFIG
- Config of acoustic model. Use deault config when it is
- None.
+ Config of acoustic model.
--am_ckpt AM_CKPT Checkpoint file of acoustic model.
--am_stat AM_STAT mean and standard deviation used to normalize
spectrogram when training acoustic model.
@@ -197,10 +195,10 @@ optional arguments:
--speaker_dict SPEAKER_DICT
speaker id map file.
--spk_id SPK_ID spk id for multi speaker acoustic model
- --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}
+ --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}
Choose vocoder type of tts task.
--voc_config VOC_CONFIG
- Config of voc. Use deault config when it is None.
+ Config of voc.
--voc_ckpt VOC_CKPT Checkpoint file of voc.
--voc_stat VOC_STAT mean and standard deviation used to normalize
spectrogram when training voc.
@@ -213,9 +211,9 @@ optional arguments:
output dir.
```
1. `--am` 声学模型格式是否符合 {model_name}_{dataset}
-2. `--am_config`, `--am_checkpoint`, `--am_stat` 和 `--phones_dict` 是声学模型的参数,对应于 fastspeech2 预训练模型中的 4 个文件。
+2. `--am_config`, `--am_ckpt`, `--am_stat` 和 `--phones_dict` 是声学模型的参数,对应于 fastspeech2 预训练模型中的 4 个文件。
3. `--voc` 声码器(vocoder)格式是否符合 {model_name}_{dataset}
-4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` 是声码器的参数,对应于 parallel wavegan 预训练模型中的 3 个文件。
+4. `--voc_config`, `--voc_ckpt`, `--voc_stat` 是声码器的参数,对应于 parallel wavegan 预训练模型中的 3 个文件。
5. `--lang` 对应模型的语言可以是 `zh` 或 `en` 。
6. `--test_metadata` 应为 `dump` 文件夹中 `test` 下的规范化元数据文件、
7. `--text` 是文本文件,其中包含要合成的句子。
diff --git a/examples/csmsc/vits/README.md b/examples/csmsc/vits/README.md
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0c16840a04e32be8fefb3bae6c23fb4bd853be9f 100644
--- a/examples/csmsc/vits/README.md
+++ b/examples/csmsc/vits/README.md
@@ -0,0 +1,146 @@
+# VITS with CSMSC
+This example contains code used to train a [VITS](https://arxiv.org/abs/2106.06103) model with [Chinese Standard Mandarin Speech Corpus](https://www.data-baker.com/open_source.html).
+
+## Dataset
+### Download and Extract
+Download CSMSC from its [Official Website](https://test.data-baker.com/data/index/source).
+
+### Get MFA Result and Extract
+We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get phonemes for VITS; the durations from MFA are not needed here.
+You can download [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz) directly, or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) in our repo.
+
+## Get Started
+Assume the path to the dataset is `~/datasets/BZNSYP`.
+Assume the path to the MFA result of CSMSC is `./baker_alignment_tone`.
+Run the command below to
+1. **source path**.
+2. preprocess the dataset.
+3. train the model.
+4. synthesize wavs.
+ - synthesize waveform from `metadata.jsonl`.
+ - synthesize waveform from a text file.
+
+```bash
+./run.sh
+```
+You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage. For example, running the following command will only preprocess the dataset.
+```bash
+./run.sh --stage 0 --stop-stage 0
+```
+### Data Preprocessing
+```bash
+./local/preprocess.sh ${conf_path}
+```
+When it is done, a `dump` folder is created in the current directory. The structure of the dump folder is listed below.
+
+```text
+dump
+├── dev
+│ ├── norm
+│ └── raw
+├── phone_id_map.txt
+├── speaker_id_map.txt
+├── test
+│ ├── norm
+│ └── raw
+└── train
+ ├── feats_stats.npy
+ ├── norm
+ └── raw
+```
+The dataset is split into 3 parts, namely `train`, `dev`, and `test`, each of which contains a `norm` and `raw` subfolder. The raw folder contains the wave and linear spectrogram of each utterance, while the norm folder contains the normalized ones. The statistics used to normalize features are computed from the training set, which is located in `dump/train/feats_stats.npy`.
+
+Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file that contains phones, text_lengths, feats, feats_lengths, the path of linear spectrogram features, the path of raw waves, speaker, and the id of each utterance.
+
+### Model Training
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
+```
+`./local/train.sh` calls `${BIN_DIR}/train.py`.
+Here's the complete help message.
+```text
+usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
+ [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
+ [--ngpu NGPU] [--phones-dict PHONES_DICT]
+
+Train a VITS model.
+
+optional arguments:
+ -h, --help show this help message and exit
+ --config CONFIG config file to overwrite default config.
+ --train-metadata TRAIN_METADATA
+ training data.
+ --dev-metadata DEV_METADATA
+ dev data.
+ --output-dir OUTPUT_DIR
+ output dir.
+ --ngpu NGPU if ngpu == 0, use cpu.
+ --phones-dict PHONES_DICT
+ phone vocabulary file.
+```
+1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
+2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder.
+3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory.
+4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
+5. `--phones-dict` is the path of the phone vocabulary file.
+
+### Synthesizing
+
+`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
+
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
+```
+```text
+usage: synthesize.py [-h] [--config CONFIG] [--ckpt CKPT]
+ [--phones_dict PHONES_DICT] [--ngpu NGPU]
+ [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR]
+
+Synthesize with VITS
+
+optional arguments:
+ -h, --help show this help message and exit
+ --config CONFIG Config of VITS.
+ --ckpt CKPT Checkpoint file of VITS.
+ --phones_dict PHONES_DICT
+ phone vocabulary file.
+ --ngpu NGPU if ngpu == 0, use cpu.
+ --test_metadata TEST_METADATA
+ test metadata.
+ --output_dir OUTPUT_DIR
+ output dir.
+```
+`./local/synthesize_e2e.sh` calls `${BIN_DIR}/synthesize_e2e.py`, which can synthesize waveform from a text file.
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name}
+```
+```text
+usage: synthesize_e2e.py [-h] [--config CONFIG] [--ckpt CKPT]
+ [--phones_dict PHONES_DICT] [--lang LANG]
+ [--inference_dir INFERENCE_DIR] [--ngpu NGPU]
+ [--text TEXT] [--output_dir OUTPUT_DIR]
+
+Synthesize with VITS
+
+optional arguments:
+ -h, --help show this help message and exit
+ --config CONFIG Config of VITS.
+ --ckpt CKPT Checkpoint file of VITS.
+ --phones_dict PHONES_DICT
+ phone vocabulary file.
+ --lang LANG Choose model language. zh or en
+ --inference_dir INFERENCE_DIR
+ dir to save inference models
+ --ngpu NGPU if ngpu == 0, use cpu.
+ --text TEXT text to synthesize, a 'utt_id sentence' pair per line.
+ --output_dir OUTPUT_DIR
+ output dir.
+```
+1. `--config`, `--ckpt`, and `--phones_dict` are arguments for acoustic model, which correspond to the 3 files in the VITS pretrained model.
+2. `--lang` is the model language, which can be `zh` or `en`.
+3. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder.
+4. `--text` is the text file, which contains sentences to synthesize; its format is shown in the sketch below.
+5. `--output_dir` is the directory to save synthesized audio files.
+6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
+
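+For reference, `--text` expects one `utt_id sentence` pair per line; a minimal sketch that writes such a file (the ids and sentences are illustrative only):
+```python
+# Build a sentences file for --text: one "utt_id sentence" pair per line.
+lines = [
+    '001 欢迎使用飞桨语音合成系统',
+    '002 今天的天气真不错',
+]
+with open('sentences.txt', 'w', encoding='utf-8') as f:
+    f.write('\n'.join(lines) + '\n')
+```
+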
+## Pretrained Model
diff --git a/examples/csmsc/voc1/README.md b/examples/csmsc/voc1/README.md
index d5bec1cd79d21e311d1859b4ef1dde1d59400788..4646a034599a6f71e613b17d42810c267d870df8 100644
--- a/examples/csmsc/voc1/README.md
+++ b/examples/csmsc/voc1/README.md
@@ -65,7 +65,7 @@ Train a ParallelWaveGAN model.
optional arguments:
-h, --help show this help message and exit
- --config CONFIG config file to overwrite default config.
+ --config CONFIG ParallelWaveGAN config file.
--train-metadata TRAIN_METADATA
training data.
--dev-metadata DEV_METADATA
diff --git a/examples/csmsc/voc3/README.md b/examples/csmsc/voc3/README.md
index e188bcb35787a25ddad2f0318238c923a1ff014f..09fb8836c58c03fd000090ab40d04937dcd51c35 100644
--- a/examples/csmsc/voc3/README.md
+++ b/examples/csmsc/voc3/README.md
@@ -63,7 +63,7 @@ Train a Multi-Band MelGAN model.
optional arguments:
-h, --help show this help message and exit
- --config CONFIG config file to overwrite default config.
+ --config CONFIG Multi-Band MelGAN config file.
--train-metadata TRAIN_METADATA
training data.
--dev-metadata DEV_METADATA
diff --git a/examples/csmsc/voc4/README.md b/examples/csmsc/voc4/README.md
index 19836134e15c676a05ff35c8ac6cc6af3d1605a5..f1a132a84d7a83bba7cc5d94caea7aa1f0d36398 100644
--- a/examples/csmsc/voc4/README.md
+++ b/examples/csmsc/voc4/README.md
@@ -63,7 +63,7 @@ Train a Style MelGAN model.
optional arguments:
-h, --help show this help message and exit
- --config CONFIG config file to overwrite default config.
+ --config CONFIG Style MelGAN config file.
--train-metadata TRAIN_METADATA
training data.
--dev-metadata DEV_METADATA
diff --git a/examples/csmsc/voc5/README.md b/examples/csmsc/voc5/README.md
index 4c38b5987c9d5fed287d205e1224aa0a511449d3..ef552fd3078e00c5ac030e8a2e732043783e3f19 100644
--- a/examples/csmsc/voc5/README.md
+++ b/examples/csmsc/voc5/README.md
@@ -63,7 +63,7 @@ Train a HiFiGAN model.
optional arguments:
-h, --help show this help message and exit
- --config CONFIG config file to overwrite default config.
+ --config CONFIG HiFiGAN config file.
--train-metadata TRAIN_METADATA
training data.
--dev-metadata DEV_METADATA
diff --git a/examples/csmsc/voc6/README.md b/examples/csmsc/voc6/README.md
index 0e5ce633411f1e58dec48ea2141575d5a77353f9..b48c36414b607732ed66741dedb2f27270ba6e84 100644
--- a/examples/csmsc/voc6/README.md
+++ b/examples/csmsc/voc6/README.md
@@ -63,7 +63,7 @@ Train a WaveRNN model.
optional arguments:
-h, --help show this help message and exit
- --config CONFIG config file to overwrite default config.
+ --config CONFIG WaveRNN config file.
--train-metadata TRAIN_METADATA
training data.
--dev-metadata DEV_METADATA
diff --git a/examples/ljspeech/tts0/README.md b/examples/ljspeech/tts0/README.md
index e3292957b09b8226a01515c01e6c05eb130daca3..85d9e448b8422e32d9058a18d1ac413aff69cce7 100644
--- a/examples/ljspeech/tts0/README.md
+++ b/examples/ljspeech/tts0/README.md
@@ -103,12 +103,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p
```
```text
usage: synthesize.py [-h]
- [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}]
+ [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}]
[--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
[--am_stat AM_STAT] [--phones_dict PHONES_DICT]
[--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT]
[--voice-cloning VOICE_CLONING]
- [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}]
+ [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}]
[--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
[--voc_stat VOC_STAT] [--ngpu NGPU]
[--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR]
@@ -117,11 +117,10 @@ Synthesize with acoustic model & vocoder
optional arguments:
-h, --help show this help message and exit
- --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}
+ --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}
Choose acoustic model type of tts task.
--am_config AM_CONFIG
- Config of acoustic model. Use deault config when it is
- None.
+ Config of acoustic model.
--am_ckpt AM_CKPT Checkpoint file of acoustic model.
--am_stat AM_STAT mean and standard deviation used to normalize
spectrogram when training acoustic model.
@@ -133,10 +132,10 @@ optional arguments:
speaker id map file.
--voice-cloning VOICE_CLONING
whether training voice cloning model.
- --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}
+ --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}
Choose vocoder type of tts task.
--voc_config VOC_CONFIG
- Config of voc. Use deault config when it is None.
+ Config of voc.
--voc_ckpt VOC_CKPT Checkpoint file of voc.
--voc_stat VOC_STAT mean and standard deviation used to normalize
spectrogram when training voc.
@@ -152,12 +151,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp
```
```text
usage: synthesize_e2e.py [-h]
- [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}]
+ [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}]
[--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
[--am_stat AM_STAT] [--phones_dict PHONES_DICT]
[--tones_dict TONES_DICT]
[--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID]
- [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc}]
+ [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}]
[--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
[--voc_stat VOC_STAT] [--lang LANG]
[--inference_dir INFERENCE_DIR] [--ngpu NGPU]
@@ -167,11 +166,10 @@ Synthesize with acoustic model & vocoder
optional arguments:
-h, --help show this help message and exit
- --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}
+ --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}
Choose acoustic model type of tts task.
--am_config AM_CONFIG
- Config of acoustic model. Use deault config when it is
- None.
+ Config of acoustic model.
--am_ckpt AM_CKPT Checkpoint file of acoustic model.
--am_stat AM_STAT mean and standard deviation used to normalize
spectrogram when training acoustic model.
@@ -182,10 +180,10 @@ optional arguments:
--speaker_dict SPEAKER_DICT
speaker id map file.
--spk_id SPK_ID spk id for multi speaker acoustic model
- --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc}
+ --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}
Choose vocoder type of tts task.
--voc_config VOC_CONFIG
- Config of voc. Use deault config when it is None.
+ Config of voc.
--voc_ckpt VOC_CKPT Checkpoint file of voc.
--voc_stat VOC_STAT mean and standard deviation used to normalize
spectrogram when training voc.
@@ -198,9 +196,9 @@ optional arguments:
output dir.
```
1. `--am` is acoustic model type with the format {model_name}_{dataset}
-2. `--am_config`, `--am_checkpoint`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the Tacotron2 pretrained model.
+2. `--am_config`, `--am_ckpt`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the Tacotron2 pretrained model.
3. `--voc` is vocoder type with the format {model_name}_{dataset}
-4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
+4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
5. `--lang` is the model language, which can be `zh` or `en`.
6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder.
7. `--text` is the text file, which contains sentences to synthesize.
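Both `--am` and `--voc` follow the `{model_name}_{dataset}` naming convention from notes 1 and 3, and model names can themselves contain underscores (e.g. `mb_melgan`), so the tag has to be split on its last underscore. A minimal sketch of that split; the helper name is ours, not part of the repo:

```python
# Minimal sketch: split a "{model_name}_{dataset}" tag on its LAST
# underscore, because model names may themselves contain underscores.
def split_model_tag(tag: str):
    i = tag.rindex('_')          # index of the last underscore
    return tag[:i], tag[i + 1:]  # (model_name, dataset)

assert split_model_tag('fastspeech2_csmsc') == ('fastspeech2', 'csmsc')
assert split_model_tag('mb_melgan_csmsc') == ('mb_melgan', 'csmsc')
```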
diff --git a/examples/ljspeech/tts1/README.md b/examples/ljspeech/tts1/README.md
index 9f82185cada1d4873707bb32f36ac4b080848082..85621653f761b05ac382efe83e5f1daa694a06e6 100644
--- a/examples/ljspeech/tts1/README.md
+++ b/examples/ljspeech/tts1/README.md
@@ -58,7 +58,7 @@ Train a TransformerTTS model with LJSpeech TTS dataset.
optional arguments:
-h, --help show this help message and exit
- --config CONFIG config file to overwrite default config.
+ --config CONFIG TransformerTTS config file.
--train-metadata TRAIN_METADATA
training data.
--dev-metadata DEV_METADATA
diff --git a/examples/ljspeech/tts3/README.md b/examples/ljspeech/tts3/README.md
index 8a666193fd5bf4fbd34c01b65e4fc99717ea8686..fb1f9f4f71296656098b6788f77cf3d970d39ac2 100644
--- a/examples/ljspeech/tts3/README.md
+++ b/examples/ljspeech/tts3/README.md
@@ -109,12 +109,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p
```
```text
usage: synthesize.py [-h]
- [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}]
+ [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}]
[--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
[--am_stat AM_STAT] [--phones_dict PHONES_DICT]
[--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT]
[--voice-cloning VOICE_CLONING]
- [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}]
+ [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}]
[--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
[--voc_stat VOC_STAT] [--ngpu NGPU]
[--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR]
@@ -123,11 +123,10 @@ Synthesize with acoustic model & vocoder
optional arguments:
-h, --help show this help message and exit
- --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}
+ --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}
Choose acoustic model type of tts task.
--am_config AM_CONFIG
- Config of acoustic model. Use deault config when it is
- None.
+ Config of acoustic model.
--am_ckpt AM_CKPT Checkpoint file of acoustic model.
--am_stat AM_STAT mean and standard deviation used to normalize
spectrogram when training acoustic model.
@@ -139,10 +138,10 @@ optional arguments:
speaker id map file.
--voice-cloning VOICE_CLONING
whether training voice cloning model.
- --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}
+ --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}
Choose vocoder type of tts task.
--voc_config VOC_CONFIG
- Config of voc. Use deault config when it is None.
+ Config of voc.
--voc_ckpt VOC_CKPT Checkpoint file of voc.
--voc_stat VOC_STAT mean and standard deviation used to normalize
spectrogram when training voc.
@@ -158,12 +157,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp
```
```text
usage: synthesize_e2e.py [-h]
- [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}]
+ [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}]
[--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
[--am_stat AM_STAT] [--phones_dict PHONES_DICT]
[--tones_dict TONES_DICT]
[--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID]
- [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}]
+ [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}]
[--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
[--voc_stat VOC_STAT] [--lang LANG]
[--inference_dir INFERENCE_DIR] [--ngpu NGPU]
@@ -173,11 +172,10 @@ Synthesize with acoustic model & vocoder
optional arguments:
-h, --help show this help message and exit
- --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}
+ --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}
Choose acoustic model type of tts task.
--am_config AM_CONFIG
- Config of acoustic model. Use deault config when it is
- None.
+ Config of acoustic model.
--am_ckpt AM_CKPT Checkpoint file of acoustic model.
--am_stat AM_STAT mean and standard deviation used to normalize
spectrogram when training acoustic model.
@@ -188,10 +186,10 @@ optional arguments:
--speaker_dict SPEAKER_DICT
speaker id map file.
--spk_id SPK_ID spk id for multi speaker acoustic model
- --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}
+ --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}
Choose vocoder type of tts task.
--voc_config VOC_CONFIG
- Config of voc. Use deault config when it is None.
+ Config of voc.
--voc_ckpt VOC_CKPT Checkpoint file of voc.
--voc_stat VOC_STAT mean and standard deviation used to normalize
spectrogram when training voc.
@@ -204,9 +202,9 @@ optional arguments:
output dir.
```
1. `--am` is acoustic model type with the format {model_name}_{dataset}
-2. `--am_config`, `--am_checkpoint`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the fastspeech2 pretrained model.
+2. `--am_config`, `--am_ckpt`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the fastspeech2 pretrained model.
3. `--voc` is vocoder type with the format {model_name}_{dataset}
-4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
+4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
5. `--lang` is the model language, which can be `zh` or `en`.
6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder.
7. `--text` is the text file, which contains sentences to synthesize.
diff --git a/examples/ljspeech/voc1/README.md b/examples/ljspeech/voc1/README.md
index 491444261d32df254271e319dc283ab91742f1ba..d16c0e35fb2fcf13e1ec52ef608db925dc945f51 100644
--- a/examples/ljspeech/voc1/README.md
+++ b/examples/ljspeech/voc1/README.md
@@ -65,7 +65,7 @@ Train a ParallelWaveGAN model.
optional arguments:
-h, --help show this help message and exit
- --config CONFIG config file to overwrite default config.
+ --config CONFIG ParallelWaveGAN config file.
--train-metadata TRAIN_METADATA
training data.
--dev-metadata DEV_METADATA
diff --git a/examples/ljspeech/voc5/README.md b/examples/ljspeech/voc5/README.md
index 8305150422b523155e9d4b62a434e08890eedaa7..d856cfecfdebed32cd33b6ef90285c3c1ec5299a 100644
--- a/examples/ljspeech/voc5/README.md
+++ b/examples/ljspeech/voc5/README.md
@@ -57,15 +57,13 @@ Here's the complete help message.
```text
usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
[--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
- [--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER]
- [--run-benchmark RUN_BENCHMARK]
- [--profiler_options PROFILER_OPTIONS]
+ [--ngpu NGPU]
-Train a ParallelWaveGAN model.
+Train a HiFiGAN model.
optional arguments:
-h, --help show this help message and exit
- --config CONFIG config file to overwrite default config.
+ --config CONFIG HiFiGAN config file.
--train-metadata TRAIN_METADATA
training data.
--dev-metadata DEV_METADATA
@@ -73,19 +71,6 @@ optional arguments:
--output-dir OUTPUT_DIR
output dir.
--ngpu NGPU if ngpu == 0, use cpu.
-
-benchmark:
- arguments related to benchmark.
-
- --batch-size BATCH_SIZE
- batch size.
- --max-iter MAX_ITER train max steps.
- --run-benchmark RUN_BENCHMARK
- runing benchmark or not, if True, use the --batch-size
- and --max-iter.
- --profiler_options PROFILER_OPTIONS
- The option of profiler, which should be in format
- "key1=value1;key2=value2;key3=value3".
```
1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
diff --git a/examples/vctk/tts3/README.md b/examples/vctk/tts3/README.md
index 8864910878a8292d2afacdbd3442b9de101931ed..0b0ce09349dbf11e0823392e7ace9aeb9c1033cc 100644
--- a/examples/vctk/tts3/README.md
+++ b/examples/vctk/tts3/README.md
@@ -112,12 +112,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p
```
```text
usage: synthesize.py [-h]
- [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}]
+ [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}]
[--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
[--am_stat AM_STAT] [--phones_dict PHONES_DICT]
[--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT]
[--voice-cloning VOICE_CLONING]
- [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}]
+ [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}]
[--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
[--voc_stat VOC_STAT] [--ngpu NGPU]
[--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR]
@@ -126,11 +126,10 @@ Synthesize with acoustic model & vocoder
optional arguments:
-h, --help show this help message and exit
- --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}
+ --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}
Choose acoustic model type of tts task.
--am_config AM_CONFIG
- Config of acoustic model. Use deault config when it is
- None.
+ Config of acoustic model.
--am_ckpt AM_CKPT Checkpoint file of acoustic model.
--am_stat AM_STAT mean and standard deviation used to normalize
spectrogram when training acoustic model.
@@ -142,10 +141,10 @@ optional arguments:
speaker id map file.
--voice-cloning VOICE_CLONING
whether training voice cloning model.
- --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}
+ --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}
Choose vocoder type of tts task.
--voc_config VOC_CONFIG
- Config of voc. Use deault config when it is None.
+ Config of voc.
--voc_ckpt VOC_CKPT Checkpoint file of voc.
--voc_stat VOC_STAT mean and standard deviation used to normalize
spectrogram when training voc.
@@ -161,12 +160,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp
```
```text
usage: synthesize_e2e.py [-h]
- [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}]
+ [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}]
[--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
[--am_stat AM_STAT] [--phones_dict PHONES_DICT]
[--tones_dict TONES_DICT]
[--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID]
- [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}]
+ [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}]
[--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
[--voc_stat VOC_STAT] [--lang LANG]
[--inference_dir INFERENCE_DIR] [--ngpu NGPU]
@@ -176,11 +175,10 @@ Synthesize with acoustic model & vocoder
optional arguments:
-h, --help show this help message and exit
- --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}
+ --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}
Choose acoustic model type of tts task.
--am_config AM_CONFIG
- Config of acoustic model. Use deault config when it is
- None.
+ Config of acoustic model.
--am_ckpt AM_CKPT Checkpoint file of acoustic model.
--am_stat AM_STAT mean and standard deviation used to normalize
spectrogram when training acoustic model.
@@ -191,10 +189,10 @@ optional arguments:
--speaker_dict SPEAKER_DICT
speaker id map file.
--spk_id SPK_ID spk id for multi speaker acoustic model
- --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}
+ --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}
Choose vocoder type of tts task.
--voc_config VOC_CONFIG
- Config of voc. Use deault config when it is None.
+ Config of voc.
--voc_ckpt VOC_CKPT Checkpoint file of voc.
--voc_stat VOC_STAT mean and standard deviation used to normalize
spectrogram when training voc.
@@ -207,9 +205,9 @@ optional arguments:
output dir.
```
1. `--am` is acoustic model type with the format {model_name}_{dataset}
-2. `--am_config`, `--am_checkpoint`, `--am_stat`, `--phones_dict` `--speaker_dict` are arguments for acoustic model, which correspond to the 5 files in the fastspeech2 pretrained model.
+2. `--am_config`, `--am_ckpt`, `--am_stat`, `--phones_dict` and `--speaker_dict` are arguments for acoustic model, which correspond to the 5 files in the fastspeech2 pretrained model.
3. `--voc` is vocoder type with the format {model_name}_{dataset}
-4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
+4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
5. `--lang` is the model language, which can be `zh` or `en`.
6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder.
7. `--text` is the text file, which contains sentences to synthesize.
diff --git a/examples/vctk/voc1/README.md b/examples/vctk/voc1/README.md
index 45ba51013e1399a2f6916d0f1b7c0723e22618d2..a0e06a4206d8c214119b187164396fe9a0b1711b 100644
--- a/examples/vctk/voc1/README.md
+++ b/examples/vctk/voc1/README.md
@@ -70,7 +70,7 @@ Train a ParallelWaveGAN model.
optional arguments:
-h, --help show this help message and exit
- --config CONFIG config file to overwrite default config.
+ --config CONFIG ParallelWaveGAN config file.
--train-metadata TRAIN_METADATA
training data.
--dev-metadata DEV_METADATA
diff --git a/examples/vctk/voc5/README.md b/examples/vctk/voc5/README.md
index 514af4679a13242cd9b7cb4663e520805a1275e1..f2cbf27d21706d0702e46a20ff57aabf737de6a2 100644
--- a/examples/vctk/voc5/README.md
+++ b/examples/vctk/voc5/README.md
@@ -62,15 +62,13 @@ Here's the complete help message.
```text
usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
[--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
- [--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER]
- [--run-benchmark RUN_BENCHMARK]
- [--profiler_options PROFILER_OPTIONS]
+ [--ngpu NGPU]
-Train a ParallelWaveGAN model.
+Train a HiFiGAN model.
optional arguments:
-h, --help show this help message and exit
- --config CONFIG config file to overwrite default config.
+ --config CONFIG HiFiGAN config file.
--train-metadata TRAIN_METADATA
training data.
--dev-metadata DEV_METADATA
@@ -78,19 +76,6 @@ optional arguments:
--output-dir OUTPUT_DIR
output dir.
--ngpu NGPU if ngpu == 0, use cpu.
-
-benchmark:
- arguments related to benchmark.
-
- --batch-size BATCH_SIZE
- batch size.
- --max-iter MAX_ITER train max steps.
- --run-benchmark RUN_BENCHMARK
- runing benchmark or not, if True, use the --batch-size
- and --max-iter.
- --profiler_options PROFILER_OPTIONS
- The option of profiler, which should be in format
- "key1=value1;key2=value2;key3=value3".
```
1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
diff --git a/examples/wenetspeech/asr1/local/extract_meta.py b/examples/wenetspeech/asr1/local/extract_meta.py
index 0e1b2727838052740e5e89593dcdab04ffe387c9..2cad977bee913d6c0db7719ae7d86803e48ac0ff 100644
--- a/examples/wenetspeech/asr1/local/extract_meta.py
+++ b/examples/wenetspeech/asr1/local/extract_meta.py
@@ -1,18 +1,7 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
# Copyright 2021 Xiaomi Corporation (Author: Yongqing Wang)
# Mobvoi Inc(Author: Di Wu, Binbin Zhang)
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
@@ -24,6 +13,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+
import argparse
import json
import os
diff --git a/paddlespeech/cli/__init__.py b/paddlespeech/cli/__init__.py
index ddf0359bc5fcb7ff80b437a65112869d7faa12eb..ca6993f2b003054062cb99f37675ad7009f70d32 100644
--- a/paddlespeech/cli/__init__.py
+++ b/paddlespeech/cli/__init__.py
@@ -13,14 +13,7 @@
# limitations under the License.
import _locale
-from .asr import ASRExecutor
from .base_commands import BaseCommand
from .base_commands import HelpCommand
-from .cls import CLSExecutor
-from .st import STExecutor
-from .stats import StatsExecutor
-from .text import TextExecutor
-from .tts import TTSExecutor
-from .vector import VectorExecutor
_locale._getdefaultlocale = (lambda *args: ['en_US', 'utf8'])
diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py
index 2d74afa6d72e166aacbe98003ba4db3e80c4b130..09e8202fd7d54e59b9c535b6b5a598123147f539 100644
--- a/paddlespeech/cli/asr/infer.py
+++ b/paddlespeech/cli/asr/infer.py
@@ -29,7 +29,6 @@ from yacs.config import CfgNode
from ..download import get_path_from_url
from ..executor import BaseExecutor
from ..log import logger
-from ..utils import cli_register
from ..utils import CLI_TIMER
from ..utils import MODEL_HOME
from ..utils import stats_wrapper
@@ -45,8 +44,6 @@ __all__ = ['ASRExecutor']
@timer_register
-@cli_register(
- name='paddlespeech.asr', description='Speech to text infer command.')
class ASRExecutor(BaseExecutor):
def __init__(self):
super().__init__()
diff --git a/paddlespeech/cli/base_commands.py b/paddlespeech/cli/base_commands.py
index 0a26b12030a0b25fe169be5ad3bc61e82c500fa7..4d4d2cc69b0eec34b626a84ee237c1c4c4c540a2 100644
--- a/paddlespeech/cli/base_commands.py
+++ b/paddlespeech/cli/base_commands.py
@@ -15,6 +15,7 @@ from typing import List
from .entry import commands
from .utils import cli_register
+from .utils import explicit_command_register
from .utils import get_command
__all__ = [
@@ -73,3 +74,20 @@ class VersionCommand:
print(msg)
return True
+
+
+# Dynamic import when running a specific command
+_commands = {
+ 'asr': ['Speech to text infer command.', 'ASRExecutor'],
+ 'cls': ['Audio classification infer command.', 'CLSExecutor'],
+ 'st': ['Speech translation infer command.', 'STExecutor'],
+ 'text': ['Text command.', 'TextExecutor'],
+ 'tts': ['Text to Speech infer command.', 'TTSExecutor'],
+ 'vector': ['Speech to vector embedding infer command.', 'VectorExecutor'],
+}
+
+for com, info in _commands.items():
+ explicit_command_register(
+ name='paddlespeech.{}'.format(com),
+ description=info[0],
+ cls='paddlespeech.cli.{}.{}'.format(com, info[1]))
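The `_commands` table above replaces the per-module `@cli_register` decorators removed elsewhere in this diff: each executor is registered as a dotted-path string, and nothing heavy is imported until the command actually runs. A self-contained sketch of the registry shape, assuming the recursively nestable dict that the walk in `explicit_command_register` implies:

```python
# Self-contained sketch of the nested registry that
# explicit_command_register walks: 'paddlespeech.asr' lands at
# commands['paddlespeech']['asr'], with the executor kept as a
# dotted-path string under '_entry'.
from collections import defaultdict

def _tree():
    return defaultdict(_tree)  # a dict that nests on demand

commands = _tree()

def explicit_command_register(name: str, description: str='', cls: str=''):
    com = commands
    for item in name.split('.'):
        com = com[item]
    com['_entry'] = cls
    if description:
        com['_description'] = description

explicit_command_register(
    name='paddlespeech.asr',
    description='Speech to text infer command.',
    cls='paddlespeech.cli.asr.ASRExecutor')
assert commands['paddlespeech']['asr']['_entry'].endswith('ASRExecutor')
```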
diff --git a/paddlespeech/cli/cls/infer.py b/paddlespeech/cli/cls/infer.py
index 40072d9974e5798dc5d74b921efb905230e06246..3d807b60b3d03d4620875582b41f26f5b699c45b 100644
--- a/paddlespeech/cli/cls/infer.py
+++ b/paddlespeech/cli/cls/infer.py
@@ -27,7 +27,6 @@ from paddlespeech.utils.dynamic_import import dynamic_import
from ..executor import BaseExecutor
from ..log import logger
-from ..utils import cli_register
from ..utils import stats_wrapper
from .pretrained_models import model_alias
from .pretrained_models import pretrained_models
@@ -36,8 +35,6 @@ from .pretrained_models import pretrained_models
__all__ = ['CLSExecutor']
-@cli_register(
- name='paddlespeech.cls', description='Audio classification infer command.')
class CLSExecutor(BaseExecutor):
def __init__(self):
super().__init__()
@@ -246,4 +243,4 @@ class CLSExecutor(BaseExecutor):
self.infer()
res = self.postprocess(topk) # Retrieve result of cls.
- return res
\ No newline at end of file
+ return res
diff --git a/paddlespeech/cli/entry.py b/paddlespeech/cli/entry.py
index 32123ece750457dac8ca90aff1a8731fea569188..e0c306d62a7d55b8a48a147fa7d13dcda866ab79 100644
--- a/paddlespeech/cli/entry.py
+++ b/paddlespeech/cli/entry.py
@@ -34,6 +34,11 @@ def _execute():
# The method 'execute' of a command instance returns 'True' for a success
# while 'False' for a failure. Here converts this result into a exit status
# in bash: 0 for a success and 1 for a failure.
+ if not callable(com['_entry']):
+ i = com['_entry'].rindex('.')
+ module, cls = com['_entry'][:i], com['_entry'][i + 1:]
+ exec("from {} import {}".format(module, cls))
+ com['_entry'] = locals()[cls]
status = 0 if com['_entry']().execute(sys.argv[idx:]) else 1
return status
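The new branch above turns a registered dotted-path string back into a class at dispatch time, using `exec` plus `locals()`. The same resolution can be written with `importlib`; this is our equivalent sketch, not the code in this PR:

```python
# Equivalent of the exec()-based resolution above: split
# 'pkg.module.Class' at the last dot, import the module, fetch the
# class, and hand it back so the caller can cache it in '_entry'.
import importlib

def resolve_entry(entry):
    if callable(entry):            # already resolved to a class
        return entry
    module_name, cls_name = entry.rsplit('.', 1)
    module = importlib.import_module(module_name)
    return getattr(module, cls_name)

# e.g. resolve_entry('paddlespeech.cli.asr.ASRExecutor') -> ASRExecutor
```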
diff --git a/paddlespeech/cli/st/infer.py b/paddlespeech/cli/st/infer.py
index 4f210fbe685df50236379e196639395b2ce4adf2..ae188b349632bd7af811b363143ed845db012392 100644
--- a/paddlespeech/cli/st/infer.py
+++ b/paddlespeech/cli/st/infer.py
@@ -28,7 +28,6 @@ from yacs.config import CfgNode
from ..executor import BaseExecutor
from ..log import logger
-from ..utils import cli_register
from ..utils import download_and_decompress
from ..utils import MODEL_HOME
from ..utils import stats_wrapper
@@ -42,8 +41,6 @@ from paddlespeech.utils.dynamic_import import dynamic_import
__all__ = ["STExecutor"]
-@cli_register(
- name="paddlespeech.st", description="Speech translation infer command.")
class STExecutor(BaseExecutor):
def __init__(self):
super().__init__()
diff --git a/paddlespeech/cli/text/infer.py b/paddlespeech/cli/text/infer.py
index 97f3bbe21346dfa6651c773e44eb293c4baa841a..be5b5a10d474c2a50d448d955ae8cead3a13202b 100644
--- a/paddlespeech/cli/text/infer.py
+++ b/paddlespeech/cli/text/infer.py
@@ -23,7 +23,6 @@ import paddle
from ..executor import BaseExecutor
from ..log import logger
-from ..utils import cli_register
from ..utils import stats_wrapper
from .pretrained_models import model_alias
from .pretrained_models import pretrained_models
@@ -33,7 +32,6 @@ from paddlespeech.utils.dynamic_import import dynamic_import
__all__ = ['TextExecutor']
-@cli_register(name='paddlespeech.text', description='Text infer command.')
class TextExecutor(BaseExecutor):
def __init__(self):
super().__init__()
diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py
index efab9cb258a34c8ee7f003fd6f1a21fe8e77a126..5fa9b3ed0f0a32ced3e28672210a5d3318a16d69 100644
--- a/paddlespeech/cli/tts/infer.py
+++ b/paddlespeech/cli/tts/infer.py
@@ -28,7 +28,6 @@ from yacs.config import CfgNode
from ..executor import BaseExecutor
from ..log import logger
-from ..utils import cli_register
from ..utils import stats_wrapper
from .pretrained_models import model_alias
from .pretrained_models import pretrained_models
@@ -40,8 +39,6 @@ from paddlespeech.utils.dynamic_import import dynamic_import
__all__ = ['TTSExecutor']
-@cli_register(
- name='paddlespeech.tts', description='Text to Speech infer command.')
class TTSExecutor(BaseExecutor):
def __init__(self):
super().__init__()
diff --git a/paddlespeech/cli/utils.py b/paddlespeech/cli/utils.py
index e7b499f728c3d93ecdfc3bd8fdf92559ce59845a..128767e627091dc636da2900e5e65b58bdd650ca 100644
--- a/paddlespeech/cli/utils.py
+++ b/paddlespeech/cli/utils.py
@@ -41,6 +41,7 @@ requests.adapters.DEFAULT_RETRIES = 3
__all__ = [
'timer_register',
'cli_register',
+ 'explicit_command_register',
'get_command',
'download_and_decompress',
'load_state_dict_from_url',
@@ -70,6 +71,16 @@ def cli_register(name: str, description: str='') -> Any:
return _warpper
+def explicit_command_register(name: str, description: str='', cls: str=''):
+ items = name.split('.')
+ com = commands
+ for item in items:
+ com = com[item]
+ com['_entry'] = cls
+ if description:
+ com['_description'] = description
+
+
def get_command(name: str) -> Any:
items = name.split('.')
com = commands
diff --git a/paddlespeech/cli/vector/infer.py b/paddlespeech/cli/vector/infer.py
index cc664369fa2a82f929e8112610955e1aa727a6d2..07fb73a4c839864a63a1561324a67da55e9df80d 100644
--- a/paddlespeech/cli/vector/infer.py
+++ b/paddlespeech/cli/vector/infer.py
@@ -28,7 +28,6 @@ from yacs.config import CfgNode
from ..executor import BaseExecutor
from ..log import logger
-from ..utils import cli_register
from ..utils import stats_wrapper
from .pretrained_models import model_alias
from .pretrained_models import pretrained_models
@@ -37,9 +36,6 @@ from paddlespeech.vector.io.batch import feature_normalize
from paddlespeech.vector.modules.sid_model import SpeakerIdetification
-@cli_register(
- name="paddlespeech.vector",
- description="Speech to vector embedding infer command.")
class VectorExecutor(BaseExecutor):
def __init__(self):
super().__init__()
@@ -476,4 +472,4 @@ class VectorExecutor(BaseExecutor):
else:
logger.info("The audio file format is right")
- return True
\ No newline at end of file
+ return True
diff --git a/paddlespeech/kws/exps/mdtc/compute_det.py b/paddlespeech/kws/exps/mdtc/compute_det.py
index e43a953dbc35246f295a79790b2da55837318114..853056966376cd7551228ebbabdd7bddf334189e 100644
--- a/paddlespeech/kws/exps/mdtc/compute_det.py
+++ b/paddlespeech/kws/exps/mdtc/compute_det.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2021 Binbin Zhang(binbzha@qq.com)
+# 2022 Shaoqing Yu(954793264@qq.com)
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/paddlespeech/kws/exps/mdtc/plot_det_curve.py b/paddlespeech/kws/exps/mdtc/plot_det_curve.py
index a3ea21eff9773f922c789cccaf6b5d5f02fed5c5..4960281eed3b87fe1fd43374382a09526799a663 100644
--- a/paddlespeech/kws/exps/mdtc/plot_det_curve.py
+++ b/paddlespeech/kws/exps/mdtc/plot_det_curve.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2021 Binbin Zhang(binbzha@qq.com)
+# Menglong Xu
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/paddlespeech/kws/exps/mdtc/score.py b/paddlespeech/kws/exps/mdtc/score.py
index 1b5e1e2967f62c5f6259ddc40ad8929fd61d1a7c..556455ca19e135b2c9eee4a2523e42c0a2ad1d0e 100644
--- a/paddlespeech/kws/exps/mdtc/score.py
+++ b/paddlespeech/kws/exps/mdtc/score.py
@@ -1,4 +1,6 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2021 Binbin Zhang(binbzha@qq.com)
+# 2022 Shaoqing Yu(954793264@qq.com)
+# 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
diff --git a/paddlespeech/kws/models/loss.py b/paddlespeech/kws/models/loss.py
index 64c9a32c9f128338081999ebd209719e90fcf98c..bda77f2ba54cc26523a47b33f6ced1be748a2671 100644
--- a/paddlespeech/kws/models/loss.py
+++ b/paddlespeech/kws/models/loss.py
@@ -1,3 +1,4 @@
+# Copyright (c) 2021 Binbin Zhang
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/paddlespeech/kws/models/mdtc.py b/paddlespeech/kws/models/mdtc.py
index 5d2e5de649c5adc43536e15f1867d19e75b589f6..c605a02b6d0167f9a07267ab9d9f3e5b981e0e5f 100644
--- a/paddlespeech/kws/models/mdtc.py
+++ b/paddlespeech/kws/models/mdtc.py
@@ -1,3 +1,4 @@
+# Copyright (c) 2021 Jingyong Hou (houjingyong@gmail.com)
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/paddlespeech/s2t/__init__.py b/paddlespeech/s2t/__init__.py
index 2365071f31fbb51b0c8dec6950a8eb7521d92693..2da68435c27fac821a7daed05c03a78b8914eed3 100644
--- a/paddlespeech/s2t/__init__.py
+++ b/paddlespeech/s2t/__init__.py
@@ -189,25 +189,6 @@ if not hasattr(paddle.Tensor, 'contiguous'):
paddle.static.Variable.contiguous = contiguous
-def size(xs: paddle.Tensor, *args: int) -> paddle.Tensor:
- nargs = len(args)
- assert (nargs <= 1)
- s = paddle.shape(xs)
- if nargs == 1:
- return s[args[0]]
- else:
- return s
-
-
-#`to_static` do not process `size` property, maybe some `paddle` api dependent on it.
-logger.debug(
- "override size of paddle.Tensor "
- "(`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!"
-)
-paddle.Tensor.size = size
-paddle.static.Variable.size = size
-
-
def view(xs: paddle.Tensor, *args: int) -> paddle.Tensor:
return xs.reshape(args)
@@ -219,7 +200,7 @@ if not hasattr(paddle.Tensor, 'view'):
def view_as(xs: paddle.Tensor, ys: paddle.Tensor) -> paddle.Tensor:
- return xs.reshape(ys.size())
+ return xs.reshape(paddle.shape(ys))
if not hasattr(paddle.Tensor, 'view_as'):
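The hunk above removes the `size` monkey-patch entirely; every former `x.size(i)` call in this diff becomes `paddle.shape(x)[i]`, which stays valid under `paddle.jit.to_static`, where `Tensor.shape` may carry `-1` placeholders. A minimal check of the patched `view_as`, assuming a stock paddle install:

```python
# Minimal check of the patched view_as: reshape xs to ys's runtime
# shape via paddle.shape(ys), which paddle.reshape accepts as a
# 1-D int Tensor and which also works under to_static.
import paddle

xs = paddle.arange(12, dtype='float32')  # shape [12]
ys = paddle.zeros([3, 4])

out = xs.reshape(paddle.shape(ys))       # same call as the new view_as
assert out.shape == [3, 4]
```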
diff --git a/paddlespeech/s2t/decoders/beam_search/beam_search.py b/paddlespeech/s2t/decoders/beam_search/beam_search.py
index f331cb1c93e1331aa25600e6b5b819212ed6f096..f6a2b4b0a422ea7ee5a31745bf603866db0528f4 100644
--- a/paddlespeech/s2t/decoders/beam_search/beam_search.py
+++ b/paddlespeech/s2t/decoders/beam_search/beam_search.py
@@ -194,7 +194,7 @@ class BeamSearch(paddle.nn.Layer):
Args:
hyp (Hypothesis): Hypothesis with prefix tokens to score
- ids (paddle.Tensor): 1D tensor of new partial tokens to score,
+ ids (paddle.Tensor): 1D tensor of new partial tokens to score,
len(ids) < n_vocab
x (paddle.Tensor): Corresponding input feature, (T, D)
@@ -224,14 +224,14 @@ class BeamSearch(paddle.nn.Layer):
ids (paddle.Tensor): The partial token ids(Global) to compute topk.
Returns:
- Tuple[paddle.Tensor, paddle.Tensor]:
+ Tuple[paddle.Tensor, paddle.Tensor]:
The topk full token ids and partial token ids.
Their shapes are `(self.beam_size,)`.
i.e. (global ids, global relative local ids).
"""
# no pre beam performed, `ids` equal to `weighted_scores`
- if weighted_scores.size(0) == ids.size(0):
+ if paddle.shape(weighted_scores)[0] == paddle.shape(ids)[0]:
top_ids = weighted_scores.topk(
self.beam_size)[1] # index in n_vocab
return top_ids, top_ids
@@ -370,13 +370,13 @@ class BeamSearch(paddle.nn.Layer):
"""
# set length bounds
if maxlenratio == 0:
- maxlen = x.shape[0]
+ maxlen = paddle.shape(x)[0]
elif maxlenratio < 0:
maxlen = -1 * int(maxlenratio)
else:
- maxlen = max(1, int(maxlenratio * x.size(0)))
- minlen = int(minlenratio * x.size(0))
- logger.info("decoder input length: " + str(x.shape[0]))
+ maxlen = max(1, int(maxlenratio * paddle.shape(x)[0]))
+ minlen = int(minlenratio * paddle.shape(x)[0])
+ logger.info("decoder input length: " + str(paddle.shape(x)[0]))
logger.info("max output length: " + str(maxlen))
logger.info("min output length: " + str(minlen))
diff --git a/paddlespeech/s2t/decoders/scorers/ctc.py b/paddlespeech/s2t/decoders/scorers/ctc.py
index 81d8b078392eb0282d59cfbefbb72a2583647aae..3c1d4cf8076d2af5399e636aa4ab7f894aeaae5c 100644
--- a/paddlespeech/s2t/decoders/scorers/ctc.py
+++ b/paddlespeech/s2t/decoders/scorers/ctc.py
@@ -69,7 +69,7 @@ class CTCPrefixScorer(BatchPartialScorerInterface):
return sc[i], st[i]
else: # for CTCPrefixScorePD (need new_id > 0)
r, log_psi, f_min, f_max, scoring_idmap = state
- s = log_psi[i, new_id].expand(log_psi.size(1))
+ s = log_psi[i, new_id].expand(paddle.shape(log_psi)[1])
if scoring_idmap is not None:
return r[:, :, i, scoring_idmap[i, new_id]], s, f_min, f_max
else:
@@ -107,7 +107,7 @@ class CTCPrefixScorer(BatchPartialScorerInterface):
"""
logp = self.ctc.log_softmax(x.unsqueeze(0)) # assuming batch_size = 1
- xlen = paddle.to_tensor([logp.size(1)])
+ xlen = paddle.to_tensor([paddle.shape(logp)[1]])
self.impl = CTCPrefixScorePD(logp, xlen, 0, self.eos)
return None
diff --git a/paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py b/paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py
index 78b8fe36c8c0383d642740cab252ba7c89ba2ec0..d8ca5ccde6842274fd2eea6f3a34c36f404ae717 100644
--- a/paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py
+++ b/paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py
@@ -33,9 +33,9 @@ class CTCPrefixScorePD():
self.logzero = -10000000000.0
self.blank = blank
self.eos = eos
- self.batch = x.size(0)
- self.input_length = x.size(1)
- self.odim = x.size(2)
+ self.batch = paddle.shape(x)[0]
+ self.input_length = paddle.shape(x)[1]
+ self.odim = paddle.shape(x)[2]
self.dtype = x.dtype
# Pad the rest of posteriors in the batch
@@ -76,8 +76,7 @@ class CTCPrefixScorePD():
last_ids = [yi[-1] for yi in y] # last output label ids
n_bh = len(last_ids) # batch * hyps
n_hyps = n_bh // self.batch # assuming each utterance has the same # of hyps
- self.scoring_num = scoring_ids.size(
- -1) if scoring_ids is not None else 0
+ self.scoring_num = paddle.shape(scoring_ids)[-1] if scoring_ids is not None else 0
# prepare state info
if state is None:
r_prev = paddle.full(
@@ -153,7 +152,7 @@ class CTCPrefixScorePD():
# compute forward probabilities log(r_t^n(h)) and log(r_t^b(h))
for t in range(start, end):
- rp = r[t - 1] # (2 x BW x O')
+ rp = r[t - 1] # (2 x BW x O')
rr = paddle.stack([rp[0], log_phi[t - 1], rp[0], rp[1]]).view(
2, 2, n_bh, snum) # (2,2,BW,O')
r[t] = paddle.logsumexp(rr, 1) + x_[:, t]
@@ -227,7 +226,7 @@ class CTCPrefixScorePD():
if self.x.shape[1] < x.shape[1]: # self.x (2,T,B,O); x (B,T,O)
# Pad the rest of posteriors in the batch
# TODO(takaaki-hori): need a better way without for-loops
- xlens = [x.size(1)]
+ xlens = [paddle.shape(x)[1]]
for i, l in enumerate(xlens):
if l < self.input_length:
x[i, l:, :] = self.logzero
@@ -237,7 +236,7 @@ class CTCPrefixScorePD():
xb = xn[:, :, self.blank].unsqueeze(2).expand(-1, -1, self.odim)
self.x = paddle.stack([xn, xb]) # (2, T, B, O)
self.x[:, :tmp_x.shape[1], :, :] = tmp_x
- self.input_length = x.size(1)
+ self.input_length = paddle.shape(x)[1]
self.end_frames = paddle.to_tensor(xlens) - 1
def extend_state(self, state):
@@ -318,16 +317,16 @@ class CTCPrefixScore():
r[0, 0] = xs[0]
r[0, 1] = self.logzero
else:
- # Although the code does not exactly follow Algorithm 2,
- # we don't have to change it because we can assume
- # r_t(h)=0 for t < |h| in CTC forward computation
+ # Although the code does not exactly follow Algorithm 2,
+ # we don't have to change it because we can assume
+ # r_t(h)=0 for t < |h| in CTC forward computation
# (Note: we assume here that index t starts with 0).
# The purpose of this difference is to reduce the number of for-loops.
# https://github.com/espnet/espnet/pull/3655
- # where we start to accumulate r_t(h) from t=|h|
- # and iterate r_t(h) = (r_{t-1}(h) + ...) to T-1,
+ # where we start to accumulate r_t(h) from t=|h|
+ # and iterate r_t(h) = (r_{t-1}(h) + ...) to T-1,
# avoiding accumulating zeros for t=1~|h|-1.
- # Thus, we need to set r_{|h|-1}(h) = 0,
+ # Thus, we need to set r_{|h|-1}(h) = 0,
# i.e., r[output_length-1] = logzero, for initialization.
# This is just for reducing the computation.
r[output_length - 1] = self.logzero
diff --git a/paddlespeech/s2t/io/dataset.py b/paddlespeech/s2t/io/dataset.py
index 0e94f047bce7ad053ecd566f4a8d8c83a1b10a7c..9987b511051a302ab76ca52440c0260a6981af5a 100644
--- a/paddlespeech/s2t/io/dataset.py
+++ b/paddlespeech/s2t/io/dataset.py
@@ -1,4 +1,5 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright 2021 Mobvoi Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
diff --git a/paddlespeech/s2t/models/lm/transformer.py b/paddlespeech/s2t/models/lm/transformer.py
index 85bd7c2329fbf416d254bd9eabcaaf181fe7db01..d14f99563fcfc7ff8a73e131119fea96ea1d7d07 100644
--- a/paddlespeech/s2t/models/lm/transformer.py
+++ b/paddlespeech/s2t/models/lm/transformer.py
@@ -90,7 +90,7 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
def _target_mask(self, ys_in_pad):
ys_mask = ys_in_pad != 0
- m = subsequent_mask(ys_mask.size(-1)).unsqueeze(0)
+ m = subsequent_mask(paddle.shape(ys_mask)[-1]).unsqueeze(0)
return ys_mask.unsqueeze(-2) & m
def forward(self, x: paddle.Tensor, t: paddle.Tensor
@@ -112,7 +112,7 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
         in perplexity: p(t)^{-1/n} = exp(-log p(t) / n)
"""
- batch_size = x.size(0)
+ batch_size = paddle.shape(x)[0]
xm = x != 0
xlen = xm.sum(axis=1)
if self.embed_drop is not None:
@@ -122,7 +122,7 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
h, _ = self.encoder(emb, xlen)
y = self.decoder(h)
loss = F.cross_entropy(
- y.view(-1, y.shape[-1]), t.view(-1), reduction="none")
+ y.view(-1, paddle.shape(y)[-1]), t.view(-1), reduction="none")
mask = xm.to(loss.dtype)
logp = loss * mask.view(-1)
nll = logp.view(batch_size, -1).sum(-1)
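The docstring above defines perplexity as exponentiated per-token negative log-likelihood, p(t)^{-1/n} = exp(-log p(t) / n), which is exactly what dividing `nll` by the token count and exponentiating yields. A small numeric sketch of that relation in plain Python:

```python
# Perplexity as exponentiated per-token negative log-likelihood:
# ppl = p(t)^(-1/n) = exp(nll / n), where nll = -log p(t).
import math

token_logps = [-1.2, -0.7, -2.3, -0.9]  # log p of each of n tokens
nll = -sum(token_logps)                 # total negative log-likelihood
ppl = math.exp(nll / len(token_logps))  # perplexity
print(f"nll={nll:.2f}  ppl={ppl:.2f}")  # nll=5.10  ppl=3.58
```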
diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py
index 530840d0f161dcd03ac6873f5217be9a90597150..b4b61666f24f0fe67ea85d92565916617d5d20b2 100644
--- a/paddlespeech/s2t/models/u2/u2.py
+++ b/paddlespeech/s2t/models/u2/u2.py
@@ -1,3 +1,4 @@
+# Copyright 2021 Mobvoi Inc. All Rights Reserved.
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -775,7 +776,7 @@ class U2DecodeModel(U2BaseModel):
"""
self.eval()
x = paddle.to_tensor(x).unsqueeze(0)
- ilen = x.size(1)
+ ilen = paddle.shape(x)[1]
enc_output, _ = self._forward_encoder(x, ilen)
return enc_output.squeeze(0)
diff --git a/paddlespeech/s2t/models/u2/updater.py b/paddlespeech/s2t/models/u2/updater.py
index c59090a84ee4d416353eff3d6049ff3451cf0dae..898a50bf0a6c241f6b7cdcdbab78310731ead00a 100644
--- a/paddlespeech/s2t/models/u2/updater.py
+++ b/paddlespeech/s2t/models/u2/updater.py
@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-# Modified from wenet(https://github.com/wenet-e2e/wenet)
+
from contextlib import nullcontext
import paddle
diff --git a/paddlespeech/s2t/modules/decoder.py b/paddlespeech/s2t/modules/decoder.py
index 42ac119b44540a1931408b1b86aa75e8b1413597..ccc8482d5372cd77bcae29bdc5b876e8620e2c77 100644
--- a/paddlespeech/s2t/modules/decoder.py
+++ b/paddlespeech/s2t/modules/decoder.py
@@ -242,7 +242,7 @@ class TransformerDecoder(BatchScorerInterface, nn.Layer):
]
# batch decoding
- ys_mask = subsequent_mask(ys.size(-1)).unsqueeze(0) # (B,L,L)
+ ys_mask = subsequent_mask(paddle.shape(ys)[-1]).unsqueeze(0) # (B,L,L)
xs_mask = make_xs_mask(xs).unsqueeze(1) # (B,1,T)
logp, states = self.forward_one_step(
xs, xs_mask, ys, ys_mask, cache=batch_state)
diff --git a/paddlespeech/s2t/modules/embedding.py b/paddlespeech/s2t/modules/embedding.py
index 596f61b78a4e449b2998b3544dd4204371aa8a2b..51e558eb8adbd801e0c1a0d563062e5a3a3f2104 100644
--- a/paddlespeech/s2t/modules/embedding.py
+++ b/paddlespeech/s2t/modules/embedding.py
@@ -115,7 +115,7 @@ class PositionalEncoding(nn.Layer, PositionalEncodingInterface):
assert offset + x.shape[
1] < self.max_len, "offset: {} + x.shape[1]: {} is larger than the max_len: {}".format(
offset, x.shape[1], self.max_len)
- #TODO(Hui Zhang): using T = x.size(1), __getitem__ not support Tensor
+ #TODO(Hui Zhang): using T = paddle.shape(x)[1], __getitem__ not support Tensor
pos_emb = self.pe[:, offset:offset + T]
x = x * self.xscale + pos_emb
return self.dropout(x), self.dropout(pos_emb)
@@ -165,6 +165,6 @@ class RelPositionalEncoding(PositionalEncoding):
1] < self.max_len, "offset: {} + x.shape[1]: {} is larger than the max_len: {}".format(
offset, x.shape[1], self.max_len)
x = x * self.xscale
- #TODO(Hui Zhang): using x.size(1), __getitem__ not support Tensor
+ #TODO(Hui Zhang): using paddle.shape(x)[1], __getitem__ not support Tensor
pos_emb = self.pe[:, offset:offset + x.shape[1]]
return self.dropout(x), self.dropout(pos_emb)
diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py
index 669a12d656947f0446eba3d228832964e8c1d7b0..4d31acf1a73201cfb8f1e49b020c6635792ae638 100644
--- a/paddlespeech/s2t/modules/encoder.py
+++ b/paddlespeech/s2t/modules/encoder.py
@@ -218,7 +218,7 @@ class BaseEncoder(nn.Layer):
assert xs.shape[0] == 1 # batch size must be one
# tmp_masks is just for interface compatibility
# TODO(Hui Zhang): stride_slice not support bool tensor
- # tmp_masks = paddle.ones([1, xs.size(1)], dtype=paddle.bool)
+ # tmp_masks = paddle.ones([1, paddle.shape(xs)[1]], dtype=paddle.bool)
tmp_masks = paddle.ones([1, xs.shape[1]], dtype=paddle.int32)
tmp_masks = tmp_masks.unsqueeze(1) #[B=1, C=1, T]
diff --git a/paddlespeech/s2t/utils/ctc_utils.py b/paddlespeech/s2t/utils/ctc_utils.py
index 886b72033605e9080ebc7ae06e0a32054325be71..42564d8e137705dbe6a6c1992bc4e6452ebe6ede 100644
--- a/paddlespeech/s2t/utils/ctc_utils.py
+++ b/paddlespeech/s2t/utils/ctc_utils.py
@@ -1,3 +1,4 @@
+# Copyright 2021 Mobvoi Inc. All Rights Reserved.
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/paddlespeech/s2t/utils/tensor_utils.py b/paddlespeech/s2t/utils/tensor_utils.py
index 0dbaa0b6b77031d4b8e8aa29fcc9246458b8ab99..bc557b13083f5b4bf80b365d3ce3c2d6848361c0 100644
--- a/paddlespeech/s2t/utils/tensor_utils.py
+++ b/paddlespeech/s2t/utils/tensor_utils.py
@@ -58,7 +58,7 @@ def pad_sequence(sequences: List[paddle.Tensor],
>>> a = paddle.ones(25, 300)
>>> b = paddle.ones(22, 300)
>>> c = paddle.ones(15, 300)
- >>> pad_sequence([a, b, c]).size()
+ >>> pad_sequence([a, b, c]).shape
paddle.Tensor([25, 3, 300])
Note:
@@ -79,10 +79,10 @@ def pad_sequence(sequences: List[paddle.Tensor],
# assuming trailing dimensions and type of all the Tensors
# in sequences are same and fetching those from sequences[0]
- max_size = sequences[0].size()
+ max_size = paddle.shape(sequences[0])
# (TODO Hui Zhang): slice not supprot `end==start`
# trailing_dims = max_size[1:]
- trailing_dims = max_size[1:] if max_size.ndim >= 2 else ()
+ trailing_dims = tuple(max_size[1:].numpy().tolist()) if sequences[0].ndim >= 2 else ()
max_len = max([s.shape[0] for s in sequences])
if batch_first:
out_dims = (len(sequences), max_len) + trailing_dims
@@ -99,7 +99,7 @@ def pad_sequence(sequences: List[paddle.Tensor],
if batch_first:
# TODO (Hui Zhang): set_value op not supprot `end==start`
# TODO (Hui Zhang): set_value op not support int16
- # TODO (Hui Zhang): set_varbase 2 rank not support [0,0,...]
+ # TODO (Hui Zhang): set_varbase 2 rank not support [0,0,...]
# out_tensor[i, :length, ...] = tensor
if length != 0:
out_tensor[i, :length] = tensor
@@ -145,7 +145,7 @@ def add_sos_eos(ys_pad: paddle.Tensor, sos: int, eos: int,
[ 4, 5, 6, 11, -1, -1],
[ 7, 8, 9, 11, -1, -1]])
"""
- # TODO(Hui Zhang): using comment code,
+ # TODO(Hui Zhang): using comment code,
#_sos = paddle.to_tensor(
# [sos], dtype=paddle.long, stop_gradient=True, place=ys_pad.place)
#_eos = paddle.to_tensor(
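The `trailing_dims` change a few hunks up exists because `paddle.shape(...)` returns a 1-D int Tensor, so `max_size[1:]` is a Tensor slice rather than a tuple, and concatenating it with `(len(sequences), max_len)` would fail. A minimal sketch of the conversion, assuming a stock paddle install:

```python
# Why trailing_dims must become a Python tuple: paddle.shape returns
# a 1-D int Tensor, and a Tensor slice cannot be tuple-concatenated.
import paddle

seq = paddle.zeros([25, 300])
max_size = paddle.shape(seq)                          # Tensor([25, 300])

trailing_dims = tuple(max_size[1:].numpy().tolist())  # -> (300,)
out_dims = (3, 25) + trailing_dims                    # tuple concat works
assert out_dims == (3, 25, 300)
```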
diff --git a/paddlespeech/s2t/utils/text_grid.py b/paddlespeech/s2t/utils/text_grid.py
index cbd9856e40d72897cd08d3618178e60f7a34ea0f..e696f43d5fbb99fd39ff91af388d86a568e76abe 100644
--- a/paddlespeech/s2t/utils/text_grid.py
+++ b/paddlespeech/s2t/utils/text_grid.py
@@ -1,3 +1,4 @@
+# Copyright 2021 Mobvoi Inc. All Rights Reserved.
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py b/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py
index c70821e78fe6e4063d74e8c5608ede225ed1b230..4c733dc9b05ba8880d415482d7c194c910039f92 100644
--- a/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py
+++ b/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py
@@ -243,8 +243,7 @@ def main():
# parse args and config and redirect to train_sp
parser = argparse.ArgumentParser(description="Train a HiFiGAN model.")
- parser.add_argument(
- "--config", type=str, help="config file to overwrite default config.")
+ parser.add_argument("--config", type=str, help="HiFiGAN config file.")
parser.add_argument("--train-metadata", type=str, help="training data.")
parser.add_argument("--dev-metadata", type=str, help="dev data.")
parser.add_argument("--output-dir", type=str, help="output dir.")
diff --git a/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py b/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py
index 27ffded63b3621c0f2110815b27fd4420ef0bc5a..3b3ebb4788e2cd4b58119996f74c8dc0d1bfc46b 100644
--- a/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py
+++ b/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py
@@ -233,7 +233,7 @@ def main():
parser = argparse.ArgumentParser(
description="Train a Multi-Band MelGAN model.")
parser.add_argument(
- "--config", type=str, help="config file to overwrite default config.")
+ "--config", type=str, help="Multi-Band MelGAN config file.")
parser.add_argument("--train-metadata", type=str, help="training data.")
parser.add_argument("--dev-metadata", type=str, help="dev data.")
parser.add_argument("--output-dir", type=str, help="output dir.")
diff --git a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py
index 92de7a2c4e7a04ed28b7b30dfa47be4796acc93f..b26407028928cec639d47f576e2bb1f6766e1990 100644
--- a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py
+++ b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py
@@ -208,7 +208,7 @@ def main():
parser = argparse.ArgumentParser(
description="Train a ParallelWaveGAN model.")
parser.add_argument(
- "--config", type=str, help="config file to overwrite default config.")
+ "--config", type=str, help="ParallelWaveGAN config file.")
parser.add_argument("--train-metadata", type=str, help="training data.")
parser.add_argument("--dev-metadata", type=str, help="dev data.")
parser.add_argument("--output-dir", type=str, help="output dir.")
diff --git a/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py b/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py
index be3ba74251d92cf90be713651837205fa8dc582a..a87cc7a182fdc88d9770021969b4d4248d87e83a 100644
--- a/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py
+++ b/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py
@@ -224,8 +224,7 @@ def main():
# parse args and config and redirect to train_sp
parser = argparse.ArgumentParser(description="Train a Style MelGAN model.")
- parser.add_argument(
- "--config", type=str, help="config file to overwrite default config.")
+ parser.add_argument("--config", type=str, help="Style MelGAN config file.")
parser.add_argument("--train-metadata", type=str, help="training data.")
parser.add_argument("--dev-metadata", type=str, help="dev data.")
parser.add_argument("--output-dir", type=str, help="output dir.")
diff --git a/paddlespeech/t2s/exps/transformer_tts/train.py b/paddlespeech/t2s/exps/transformer_tts/train.py
index 45ecb269bac033fed4287e5083ced6ce92b89f35..da48b6b99700ed49e5c815bf6b6f14c8eecfae95 100644
--- a/paddlespeech/t2s/exps/transformer_tts/train.py
+++ b/paddlespeech/t2s/exps/transformer_tts/train.py
@@ -160,7 +160,7 @@ def main():
parser = argparse.ArgumentParser(description="Train a TransformerTTS "
"model with LJSpeech TTS dataset.")
parser.add_argument(
- "--config", type=str, help="config file to overwrite default config.")
+ "--config", type=str, help="TransformerTTS config file.")
parser.add_argument("--train-metadata", type=str, help="training data.")
parser.add_argument("--dev-metadata", type=str, help="dev data.")
parser.add_argument("--output-dir", type=str, help="output dir.")
diff --git a/paddlespeech/t2s/exps/vits/train.py b/paddlespeech/t2s/exps/vits/train.py
index b921f92af75ba367d702be94050e563538e7c755..dbda8b7177bca068ecaeabe41679a93e153aba35 100644
--- a/paddlespeech/t2s/exps/vits/train.py
+++ b/paddlespeech/t2s/exps/vits/train.py
@@ -226,9 +226,8 @@ def train_sp(args, config):
def main():
# parse args and config and redirect to train_sp
- parser = argparse.ArgumentParser(description="Train a HiFiGAN model.")
- parser.add_argument(
- "--config", type=str, help="config file to overwrite default config.")
+ parser = argparse.ArgumentParser(description="Train a VITS model.")
+ parser.add_argument("--config", type=str, help="VITS config file")
parser.add_argument("--train-metadata", type=str, help="training data.")
parser.add_argument("--dev-metadata", type=str, help="dev data.")
parser.add_argument("--output-dir", type=str, help="output dir.")
diff --git a/paddlespeech/t2s/exps/wavernn/train.py b/paddlespeech/t2s/exps/wavernn/train.py
index 8661d311d218bda58142a846f3dedce5a07ffabf..cf24ea26888002afcfba19449ec8ac5db6efe517 100644
--- a/paddlespeech/t2s/exps/wavernn/train.py
+++ b/paddlespeech/t2s/exps/wavernn/train.py
@@ -180,8 +180,7 @@ def main():
# parse args and config and redirect to train_sp
parser = argparse.ArgumentParser(description="Train a WaveRNN model.")
- parser.add_argument(
- "--config", type=str, help="config file to overwrite default config.")
+ parser.add_argument("--config", type=str, help="WaveRNN config file.")
parser.add_argument("--train-metadata", type=str, help="training data.")
parser.add_argument("--dev-metadata", type=str, help="dev data.")
parser.add_argument("--output-dir", type=str, help="output dir.")
diff --git a/speechx/README.md b/speechx/README.md
index f75d8ac4eb42c4ca257279763006bd3fc61dee42..cd1cd62c154c28e007d5b20c8db77fa712f1e071 100644
--- a/speechx/README.md
+++ b/speechx/README.md
@@ -44,13 +44,13 @@ More details please see `README.md` under `examples`.
> If using docker please check `--privileged` is set when `docker run`.
* Fatal error at startup: `a function redirection which is mandatory for this platform-tool combination cannot be set up`
-```
+```bash
apt-get install libc6-dbg
```
* Install
-```
+```bash
pushd tools
./setup_valgrind.sh
popd
@@ -59,4 +59,4 @@ popd
## TODO
### Deepspeech2 with linear feature
-* DecibelNormalizer: there is a little bit difference between offline and online db norm. The computation of online db norm read feature chunk by chunk, which causes the feature size is different with offline db norm. In normalizer.cc:73, the samples.size() is different, which causes the difference of result.
+* DecibelNormalizer: there is a small difference between the offline and online db norm. The computation of online db norm reads features chunk by chunk, which causes the feature size to differ from the offline db norm. In `normalizer.cc:73`, `samples.size()` differs, which causes the difference in results.
diff --git a/tests/unit/cli/test_cli.sh b/tests/unit/cli/test_cli.sh
index e1f1853f6f0672486641fc6d1a7f1c8cc0c15eba..e0ebd1412ac9e2734f78948208fe1ed3b429d976 100755
--- a/tests/unit/cli/test_cli.sh
+++ b/tests/unit/cli/test_cli.sh
@@ -25,7 +25,7 @@ paddlespeech asr --model deepspeech2offline_librispeech --lang en --input ./en.w
# long audio restriction
{
wget -c https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/test_long_audio_01.wav
-paddlespeech asr --input test_long_audio_01.wav
+paddlespeech asr --model deepspeech2online_wenetspeech --input test_long_audio_01.wav -y
if [ $? -ne 255 ]; then
echo -e "\e[1;31mTime restriction not passed\e[0m"
exit 1
@@ -54,7 +54,7 @@ paddlespeech tts --am tacotron2_ljspeech --voc pwgan_ljspeech --lang en --input
# Speech Translation (only support linux)
paddlespeech st --input ./en.wav
-# Speaker Verification
+# Speaker Verification
wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav
paddlespeech vector --task spk --input 85236145389.wav
@@ -65,7 +65,7 @@ echo -e "demo1 85236145389.wav \n demo2 85236145389.wav" > vec.job
paddlespeech vector --task spk --input vec.job
echo -e "demo3 85236145389.wav \n demo4 85236145389.wav" | paddlespeech vector --task spk
-rm 85236145389.wav
+rm 85236145389.wav
rm vec.job
# shell pipeline
diff --git a/third_party/README.md b/third_party/README.md
index c73df5427b08fdfd12bbb7e779afd41b058f7780..843d0d3b2e69a573195c669ac0bbee8f52558960 100644
--- a/third_party/README.md
+++ b/third_party/README.md
@@ -1,27 +1,27 @@
* [python_kaldi_features](https://github.com/ZitengWang/python_kaldi_features)
commit: fc1bd6240c2008412ab64dc25045cd872f5e126c
ref: https://zhuanlan.zhihu.com/p/55371926
-licence: MIT
+license: MIT
* [python-pinyin](https://github.com/mozillazg/python-pinyin.git)
commit: 55e524aa1b7b8eec3d15c5306043c6cdd5938b03
-licence: MIT
+license: MIT
* [zhon](https://github.com/tsroten/zhon)
commit: 09bf543696277f71de502506984661a60d24494c
-licence: MIT
+license: MIT
* [pymmseg-cpp](https://github.com/pluskid/pymmseg-cpp.git)
commit: b76465045717fbb4f118c4fbdd24ce93bab10a6d
-licence: MIT
+license: MIT
* [chinese_text_normalization](https://github.com/speechio/chinese_text_normalization.git)
commit: 9e92c7bf2d6b5a7974305406d8e240045beac51c
-licence: MIT
+license: MIT
* [phkit](https://github.com/KuangDD/phkit.git)
commit: b2100293c1e36da531d7f30bd52c9b955a649522
-licence: None
+license: None
* [nnAudio](https://github.com/KinWaiCheuk/nnAudio.git)
-licence: MIT
+license: MIT
diff --git a/third_party/ctc_decoders/LICENSE b/third_party/ctc_decoders/LICENSE
index eeef74b3099776f6afa02dd3fe0ca7f304138248..ad947f8d75624fa3862c37db49a6f249bc11dafb 100644
--- a/third_party/ctc_decoders/LICENSE
+++ b/third_party/ctc_decoders/LICENSE
@@ -5,4 +5,4 @@ score.h and score.cpp is under the LGPL license.
The two files include the header files from KenLM project.
For the rest:
-The default licence of paddlespeech-ctcdecoders is Apache License 2.0.
+The default license of paddlespeech-ctcdecoders is Apache License 2.0.
diff --git a/utils/compute-wer.py b/utils/compute-wer.py
index 978a80c9f1d845bc49ffed7a4350a4705bdd50b6..98bb24a7ebd5dff7c7193288caade9e50c03e670 100755
--- a/utils/compute-wer.py
+++ b/utils/compute-wer.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
-# CopyRight WeNet Apache-2.0 License
+# Copyright 2021 Mobvoi Inc. All Rights Reserved.
import codecs
import re
import sys