diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3178434c4cc9de48f71265ea34797a1dc7639d61..6e8315e76c71c80ed9c053f7f5a1fe7739af5019 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,14 +1,29 @@
# Changelog
-Date: 2022-1-19, Author: yt605155624.
-Add features to: T2S:
- - Add csmsc Tacotron2.
+Date: 2022-1-29, Author: yt605155624.
+Add features to: T2S:
+ - Update aishell3 vc0 with new Tacotron2.
+ - PRLink: https://github.com/PaddlePaddle/PaddleSpeech/pull/1419
+
+Date: 2022-1-29, Author: yt605155624.
+Add features to: T2S:
+ - Add ljspeech Tacotron2.
+ - PRLink: https://github.com/PaddlePaddle/PaddleSpeech/pull/1416
+
+Date: 2022-1-24, Author: yt605155624.
+Add features to: T2S:
+ - Add csmsc WaveRNN.
+ - PRLink: https://github.com/PaddlePaddle/PaddleSpeech/pull/1379
+
+Date: 2022-1-19, Author: yt605155624.
+Add features to: T2S:
+ - Add csmsc Tacotron2.
- PRLink: https://github.com/PaddlePaddle/PaddleSpeech/pull/1314
Date: 2022-1-10, Author: Jackwaterveg.
-Add features to: CLI:
- - Support English (librispeech/asr1/transformer).
+Add features to: CLI:
+ - Support English (librispeech/asr1/transformer).
- Support choosing `decode_method` for conformer and transformer models.
- Refactor the config, using the unified config.
- PRLink: https://github.com/PaddlePaddle/PaddleSpeech/pull/1297
@@ -16,8 +31,8 @@ Add features to: CLI:
***
Date: 2022-1-17, Author: Jackwaterveg.
-Add features to: CLI:
- - Support deepspeech2 online/offline model(aishell).
+Add features to: CLI:
+ - Support deepspeech2 online/offline model(aishell).
- PRLink: https://github.com/PaddlePaddle/PaddleSpeech/pull/1356
***
diff --git a/README.md b/README.md
index 23124231d601c1ce1b6d36c36bcbd6079d3ae198..7dd568b0f6d686835febb2250747cdc69e1daed4 100644
--- a/README.md
+++ b/README.md
@@ -317,14 +317,15 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r
Acoustic Model |
- Tacotron2 |
- LJSpeech |
+ Tacotron2 |
+ LJSpeech / CSMSC |
- tacotron2-ljspeech
+ tacotron2-ljspeech / tacotron2-csmsc
|
Transformer TTS |
+ LJSpeech |
transformer-ljspeech
|
@@ -344,7 +345,7 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r
- Vocoder |
+ Vocoder |
WaveFlow |
LJSpeech |
@@ -378,7 +379,14 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r
|
HiFiGAN-csmsc
|
-
+
+
+ WaveRNN |
+ CSMSC |
+
+ WaveRNN-csmsc
+ |
+
Voice Cloning |
GE2E |
@@ -416,7 +424,6 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r
-
Audio Classification |
ESC-50 |
@@ -440,7 +447,6 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r
-
Punctuation Restoration |
IWLST2012_zh |
diff --git a/README_cn.md b/README_cn.md
index 4ce4ade9bef2b5972f3c283fd57bfeb9527934ca..e7cbec7cd017a49e97743989c48ab7cca9811305 100644
--- a/README_cn.md
+++ b/README_cn.md
@@ -315,14 +315,15 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声
声学模型 |
- Tacotron2 |
- LJSpeech |
+ Tacotron2 |
+ LJSpeech / CSMSC |
- tacotron2-ljspeech
+ tacotron2-ljspeech / tacotron2-csmsc
|
Transformer TTS |
+ LJSpeech |
transformer-ljspeech
|
@@ -342,7 +343,7 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声
- 声码器 |
+ 声码器 |
WaveFlow |
LJSpeech |
@@ -376,7 +377,14 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声
|
HiFiGAN-csmsc
|
-
+
+
+ WaveRNN |
+ CSMSC |
+
+ WaveRNN-csmsc
+ |
+
声音克隆 |
GE2E |
@@ -415,8 +423,6 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声
-
-
声音分类 |
ESC-50 |
@@ -440,7 +446,6 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声
-
标点恢复 |
IWLST2012_zh |
diff --git a/docs/source/released_model.md b/docs/source/released_model.md
index 23309d8eb4b1bd904478d427025fb8fbe2692188..9227a7c52844e846543348a7c2285b293e9272d7 100644
--- a/docs/source/released_model.md
+++ b/docs/source/released_model.md
@@ -1,3 +1,4 @@
+
# Released Models
## Speech-to-Text Models
@@ -32,7 +33,8 @@ Language Model | Training Data | Token-based | Size | Descriptions
### Acoustic Models
Model Type | Dataset| Example Link | Pretrained Models|Static Models|Size (static)
:-------------:| :------------:| :-----: | :-----:| :-----:| :-----:
-Tacotron2|LJSpeech|[tacotron2-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts0)|[tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.3.zip)|||
+Tacotron2|LJSpeech|[tacotron2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts0)|[tacotron2_ljspeech_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.2.0.zip)|||
+Tacotron2|CSMSC|[tacotron2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts0)|[tacotron2_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_ckpt_0.2.0.zip)|[tacotron2_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_static_0.2.0.zip)|94.95MB|
TransformerTTS| LJSpeech| [transformer-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts1)|[transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/transformer_tts/transformer_tts_ljspeech_ckpt_0.4.zip)|||
SpeedySpeech| CSMSC | [speedyspeech-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts2) |[speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_ckpt_0.5.zip)|[speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_static_0.5.zip)|12MB|
FastSpeech2| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip)|[fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip)|157MB|
diff --git a/docs/source/tts/quick_start_cn.md b/docs/source/tts/quick_start_cn.md
index 39bf3d0a11b0992b6f4c15e27e1a01d8b5468095..37246e84e9ba0e29741389f0f73320108cb3dc2b 100644
--- a/docs/source/tts/quick_start_cn.md
+++ b/docs/source/tts/quick_start_cn.md
@@ -202,4 +202,4 @@ sf.write(
audio_path,
wav.numpy(),
samplerate=fastspeech2_config.fs)
-```
\ No newline at end of file
+```
diff --git a/examples/aishell3/vc0/README.md b/examples/aishell3/vc0/README.md
index 21cd0aa20fcaeb09ba05a88a092ebb8e3f8bc6c1..29585eb4bc5c5cebcfd106515f5e335bc2c9d055 100644
--- a/examples/aishell3/vc0/README.md
+++ b/examples/aishell3/vc0/README.md
@@ -1,4 +1,3 @@
-
# Tacotron2 + AISHELL-3 Voice Cloning
This example contains code used to train a [Tacotron2](https://arxiv.org/abs/1712.05884) model with [AISHELL-3](http://www.aishelltech.com/aishell_3). The trained model can be used in Voice Cloning Task, We refer to the model structure of [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf). The general steps are as follows:
1. Speaker Encoder: We use Speaker Verification to train a speaker encoder. Datasets used in this task are different from those used in `Tacotron2` because the transcriptions are not needed, we use more datasets, refer to [ge2e](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/ge2e).
@@ -17,7 +16,7 @@ mkdir data_aishell3
tar zxvf data_aishell3.tgz -C data_aishell3
```
### Get MFA Result and Extract
-We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for aishell3_fastspeech2.
+We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get the phonemes for Tacotron2; the durations produced by MFA are not needed here.
You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) (use MFA1.x now) of our repo.
## Pretrained GE2E Model
diff --git a/examples/aishell3/vc0/path.sh b/examples/aishell3/vc0/path.sh
index 9cdbe256e11a068a5471fed1ac392804cf615ac1..a37cd21e3210967415d6842067f9294bfbce7b5e 100755
--- a/examples/aishell3/vc0/path.sh
+++ b/examples/aishell3/vc0/path.sh
@@ -9,5 +9,5 @@ export PYTHONDONTWRITEBYTECODE=1
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
-MODEL=new_tacotron2
+MODEL=tacotron2
export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
diff --git a/examples/aishell3/vc1/README.md b/examples/aishell3/vc1/README.md
index 8a566089171d9ed46b62bf87b8aecfa8aeb1f10e..04b83a5ffae71082da84958e8d20e982ffeb396f 100644
--- a/examples/aishell3/vc1/README.md
+++ b/examples/aishell3/vc1/README.md
@@ -1,4 +1,3 @@
-
# FastSpeech2 + AISHELL-3 Voice Cloning
This example contains code used to train a [FastSpeech2](https://arxiv.org/abs/2006.04558) model with [AISHELL-3](http://www.aishelltech.com/aishell_3). The trained model can be used in Voice Cloning Task, We refer to the model structure of [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf). The general steps are as follows:
1. Speaker Encoder: We use Speaker Verification to train a speaker encoder. Datasets used in this task are different from those used in `FastSpeech2` because the transcriptions are not needed, we use more datasets, refer to [ge2e](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/ge2e).
diff --git a/examples/csmsc/tts0/README.md b/examples/csmsc/tts0/README.md
index b030a51cbc03aaa843d120f9aa68b6f401fccbda..0129329aebd95010fdc6045be2151f0f6af8ea25 100644
--- a/examples/csmsc/tts0/README.md
+++ b/examples/csmsc/tts0/README.md
@@ -212,6 +212,8 @@ optional arguments:
Pretrained Tacotron2 model with no silence in the edge of audios:
- [tacotron2_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_ckpt_0.2.0.zip)
+The static model can be downloaded here [tacotron2_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_static_0.2.0.zip).
+
Model | Step | eval/loss | eval/l1_loss | eval/mse_loss | eval/bce_loss| eval/attn_loss
:-------------:| :------------:| :-----: | :-----: | :--------: |:--------:|:---------:
diff --git a/examples/csmsc/tts0/local/synthesize_e2e.sh b/examples/csmsc/tts0/local/synthesize_e2e.sh
index c957df876e2b8c92ff21d3bcf28bb5b6055e9b58..ea0f11d696095e372b19307807a31a4551a162ab 100755
--- a/examples/csmsc/tts0/local/synthesize_e2e.sh
+++ b/examples/csmsc/tts0/local/synthesize_e2e.sh
@@ -7,6 +7,7 @@ ckpt_name=$3
stage=0
stop_stage=0
+# TODO: the dygraph-to-static result of tacotron2 is not as loud as the static graph's; some function in decode is probably still not aligned between dynamic and static graphs
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
diff --git a/examples/csmsc/tts0/path.sh b/examples/csmsc/tts0/path.sh
index 9cdbe256e11a068a5471fed1ac392804cf615ac1..a37cd21e3210967415d6842067f9294bfbce7b5e 100755
--- a/examples/csmsc/tts0/path.sh
+++ b/examples/csmsc/tts0/path.sh
@@ -9,5 +9,5 @@ export PYTHONDONTWRITEBYTECODE=1
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
-MODEL=new_tacotron2
+MODEL=tacotron2
export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
diff --git a/examples/csmsc/tts0/run.sh b/examples/csmsc/tts0/run.sh
index 86800920d68499f8249b66ac51a5ad8d9876bf5d..8f06e933cccfd77113c4b72956f28ff74aec2037 100755
--- a/examples/csmsc/tts0/run.sh
+++ b/examples/csmsc/tts0/run.sh
@@ -35,3 +35,8 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# synthesize_e2e, vocoder is pwgan
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
+
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+ # inference with static model
+ CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} || exit -1
+fi
diff --git a/examples/csmsc/tts3/run.sh b/examples/csmsc/tts3/run.sh
index 5c394c9f9003bc8f0c1d95624605074f9b807001..e1a149b6524716dbf68c8b898cd8d8e5b22e57f6 100755
--- a/examples/csmsc/tts3/run.sh
+++ b/examples/csmsc/tts3/run.sh
@@ -36,3 +36,8 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+ # inference with static model
+ CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} || exit -1
+fi
+
diff --git a/examples/ljspeech/tts0/README.md b/examples/ljspeech/tts0/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..ba7ad619392c41c67d16790c99dfc8865e219e30
--- /dev/null
+++ b/examples/ljspeech/tts0/README.md
@@ -0,0 +1,247 @@
+# Tacotron2 with LJSpeech-1.1
+This example contains code used to train a [Tacotron2](https://arxiv.org/abs/1712.05884) model with [LJSpeech-1.1](https://keithito.com/LJ-Speech-Dataset/).
+
+## Dataset
+### Download and Extract
+Download LJSpeech-1.1 from the [official website](https://keithito.com/LJ-Speech-Dataset/).
+
+### Get MFA Result and Extract
+We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get the phonemes for Tacotron2; the durations produced by MFA are not needed here.
+You can download the alignment from [ljspeech_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/ljspeech_alignment.tar.gz), or train your own MFA model by following the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) in our repo.
+
+## Get Started
+Assume the path to the dataset is `~/datasets/LJSpeech-1.1`.
+Assume the path to the MFA result of LJSpeech-1.1 is `./ljspeech_alignment`.
+Run the command below to
+1. **source path**.
+2. preprocess the dataset.
+3. train the model.
+4. synthesize wavs.
+ - synthesize waveform from `metadata.jsonl`.
+ - synthesize waveform from a text file.
+
+```bash
+./run.sh
+```
+You can choose a range of stages to run, or set `stage` equal to `stop-stage` to run only one stage. For example, the following command only preprocesses the dataset.
+```bash
+./run.sh --stage 0 --stop-stage 0
+```
+### Data Preprocessing
+```bash
+./local/preprocess.sh ${conf_path}
+```
+When it is done, a `dump` folder is created in the current directory. The structure of the `dump` folder is listed below.
+
+```text
+dump
+├── dev
+│ ├── norm
+│ └── raw
+├── phone_id_map.txt
+├── speaker_id_map.txt
+├── test
+│ ├── norm
+│ └── raw
+└── train
+ ├── norm
+ ├── raw
+ └── speech_stats.npy
+```
+The dataset is split into 3 parts, namely `train`, `dev`, and `test`, each of which contains a `norm` and a `raw` subfolder. The `raw` folder contains the speech features of each utterance, while the `norm` folder contains the normalized ones. The statistics used to normalize the features are computed from the training set and stored in `dump/train/*_stats.npy`.
+
+There is also a `metadata.jsonl` in each subfolder. It is a table-like file that contains the phones, text_lengths, speech_lengths, durations, the path of the speech features, the speaker, and the id of each utterance.
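+
+To take a quick look at the extracted features, the minimal sketch below (assuming the `jsonlines` package is installed; the field names used here are assumptions based on the columns described above) prints the first few rows of the normalized training metadata.
+```python
+import jsonlines
+
+# inspect the first rows of dump/train/norm/metadata.jsonl;
+# "utt_id", "text_lengths" and "speech_lengths" are assumed field names
+with jsonlines.open("dump/train/norm/metadata.jsonl") as reader:
+    for i, row in enumerate(reader):
+        print(row["utt_id"], row["text_lengths"], row["speech_lengths"])
+        if i >= 2:
+            break
+```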
+
+### Model Training
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
+```
+`./local/train.sh` calls `${BIN_DIR}/train.py`.
+Here's the complete help message.
+```text
+usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
+ [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
+ [--ngpu NGPU] [--phones-dict PHONES_DICT]
+
+Train a Tacotron2 model.
+
+optional arguments:
+ -h, --help show this help message and exit
+ --config CONFIG tacotron2 config file.
+ --train-metadata TRAIN_METADATA
+ training data.
+ --dev-metadata DEV_METADATA
+ dev data.
+ --output-dir OUTPUT_DIR
+ output dir.
+ --ngpu NGPU if ngpu == 0, use cpu.
+ --phones-dict PHONES_DICT
+ phone vocabulary file.
+```
+1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
+2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder.
+3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory.
+4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
+5. `--phones-dict` is the path of the phone vocabulary file.
+
+### Synthesizing
+We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc1) as the neural vocoder.
+Download the pretrained parallel wavegan model from [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip) and unzip it.
+```bash
+unzip pwg_ljspeech_ckpt_0.5.zip
+```
+The Parallel WaveGAN checkpoint contains the files listed below.
+```text
+pwg_ljspeech_ckpt_0.5
+├── pwg_default.yaml # default config used to train parallel wavegan
+├── pwg_snapshot_iter_400000.pdz # generator parameters of parallel wavegan
+└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan
+```
+`./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveforms from `metadata.jsonl`.
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
+```
+```text
+usage: synthesize.py [-h]
+ [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}]
+ [--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
+ [--am_stat AM_STAT] [--phones_dict PHONES_DICT]
+ [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT]
+ [--voice-cloning VOICE_CLONING]
+ [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}]
+ [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
+ [--voc_stat VOC_STAT] [--ngpu NGPU]
+ [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR]
+
+Synthesize with acoustic model & vocoder
+
+optional arguments:
+ -h, --help show this help message and exit
+ --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}
+ Choose acoustic model type of tts task.
+ --am_config AM_CONFIG
+ Config of acoustic model. Use deault config when it is
+ None.
+ --am_ckpt AM_CKPT Checkpoint file of acoustic model.
+ --am_stat AM_STAT mean and standard deviation used to normalize
+ spectrogram when training acoustic model.
+ --phones_dict PHONES_DICT
+ phone vocabulary file.
+ --tones_dict TONES_DICT
+ tone vocabulary file.
+ --speaker_dict SPEAKER_DICT
+ speaker id map file.
+ --voice-cloning VOICE_CLONING
+ whether training voice cloning model.
+ --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}
+ Choose vocoder type of tts task.
+ --voc_config VOC_CONFIG
+ Config of voc. Use deault config when it is None.
+ --voc_ckpt VOC_CKPT Checkpoint file of voc.
+ --voc_stat VOC_STAT mean and standard deviation used to normalize
+ spectrogram when training voc.
+ --ngpu NGPU if ngpu == 0, use cpu.
+ --test_metadata TEST_METADATA
+ test metadata.
+ --output_dir OUTPUT_DIR
+ output dir.
+```
+`./local/synthesize_e2e.sh` calls `${BIN_DIR}/../synthesize_e2e.py`, which can synthesize waveforms from a text file.
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name}
+```
+```text
+usage: synthesize_e2e.py [-h]
+ [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}]
+ [--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
+ [--am_stat AM_STAT] [--phones_dict PHONES_DICT]
+ [--tones_dict TONES_DICT]
+ [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID]
+ [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc}]
+ [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
+ [--voc_stat VOC_STAT] [--lang LANG]
+ [--inference_dir INFERENCE_DIR] [--ngpu NGPU]
+ [--text TEXT] [--output_dir OUTPUT_DIR]
+
+Synthesize with acoustic model & vocoder
+
+optional arguments:
+ -h, --help show this help message and exit
+ --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}
+ Choose acoustic model type of tts task.
+ --am_config AM_CONFIG
+ Config of acoustic model. Use deault config when it is
+ None.
+ --am_ckpt AM_CKPT Checkpoint file of acoustic model.
+ --am_stat AM_STAT mean and standard deviation used to normalize
+ spectrogram when training acoustic model.
+ --phones_dict PHONES_DICT
+ phone vocabulary file.
+ --tones_dict TONES_DICT
+ tone vocabulary file.
+ --speaker_dict SPEAKER_DICT
+ speaker id map file.
+ --spk_id SPK_ID spk id for multi speaker acoustic model
+ --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc}
+ Choose vocoder type of tts task.
+ --voc_config VOC_CONFIG
+ Config of voc. Use deault config when it is None.
+ --voc_ckpt VOC_CKPT Checkpoint file of voc.
+ --voc_stat VOC_STAT mean and standard deviation used to normalize
+ spectrogram when training voc.
+ --lang LANG Choose model language. zh or en
+ --inference_dir INFERENCE_DIR
+ dir to save inference models
+ --ngpu NGPU if ngpu == 0, use cpu.
+ --text TEXT text to synthesize, a 'utt_id sentence' pair per line.
+ --output_dir OUTPUT_DIR
+ output dir.
+```
+1. `--am` is the acoustic model type, with the format `{model_name}_{dataset}` (see the sketch after this list).
+2. `--am_config`, `--am_ckpt`, `--am_stat`, and `--phones_dict` are arguments for the acoustic model, which correspond to the 4 files in the Tacotron2 pretrained model.
+3. `--voc` is the vocoder type, with the format `{model_name}_{dataset}`.
+4. `--voc_config`, `--voc_ckpt`, and `--voc_stat` are arguments for the vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
+5. `--lang` is the model language, which can be `zh` or `en`.
+6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder.
+7. `--text` is the text file, which contains sentences to synthesize.
+8. `--output_dir` is the directory to save synthesized audio files.
+9. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
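+
+As noted in item 1, the `--am` and `--voc` values follow the `{model_name}_{dataset}` naming convention. The minimal sketch below (not the repo's exact code) shows how such a value splits into its two parts.
+```python
+# split an "--am" (or "--voc") value into a model name and a dataset name;
+# splitting on the last underscore separates the dataset suffix
+am = "tacotron2_ljspeech"
+am_name, am_dataset = am.rsplit("_", maxsplit=1)
+print(am_name, am_dataset)  # -> tacotron2 ljspeech
+```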
+
+
+## Pretrained Model
+Pretrained Tacotron2 model with no silence at the edges of audio:
+- [tacotron2_ljspeech_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.2.0.zip)
+
+
+Model | Step | eval/loss | eval/l1_loss | eval/mse_loss | eval/bce_loss| eval/attn_loss
+:-------------:| :------------:| :-----: | :-----: | :--------: |:--------:|:---------:
+default| 1(gpu) x 60300|0.554092|0.394260|0.141046|0.018747|3.8e-05|
+
+The Tacotron2 checkpoint contains the files listed below.
+```text
+tacotron2_ljspeech_ckpt_0.2.0
+├── default.yaml # default config used to train Tacotron2
+├── phone_id_map.txt # phone vocabulary file when training Tacotron2
+├── snapshot_iter_60300.pdz # model parameters and optimizer states
+└── speech_stats.npy # statistics used to normalize spectrogram when training Tacotron2
+```
+You can use the following script to synthesize the sentences in `${BIN_DIR}/../sentences_en.txt` using the pretrained Tacotron2 and parallel wavegan models.
+```bash
+source path.sh
+
+FLAGS_allocator_strategy=naive_best_fit \
+FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+python3 ${BIN_DIR}/../synthesize_e2e.py \
+ --am=tacotron2_ljspeech \
+ --am_config=tacotron2_ljspeech_ckpt_0.2.0/default.yaml \
+ --am_ckpt=tacotron2_ljspeech_ckpt_0.2.0/snapshot_iter_60300.pdz \
+ --am_stat=tacotron2_ljspeech_ckpt_0.2.0/speech_stats.npy \
+ --voc=pwgan_ljspeech\
+ --voc_config=pwg_ljspeech_ckpt_0.5/pwg_default.yaml \
+ --voc_ckpt=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \
+ --voc_stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \
+ --lang=en \
+ --text=${BIN_DIR}/../sentences_en.txt \
+ --output_dir=exp/default/test_e2e \
+ --phones_dict=tacotron2_ljspeech_ckpt_0.2.0/phone_id_map.txt
+```
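+
+After the script finishes, the synthesized wavs are written to `exp/default/test_e2e`. The minimal sketch below (assuming `soundfile` is installed; the output file name used here is only a hypothetical example, pick any wav actually present in that directory) checks the sample rate and duration of one output file.
+```python
+import soundfile as sf
+
+# hypothetical file name; replace it with a wav present in exp/default/test_e2e
+wav, samplerate = sf.read("exp/default/test_e2e/001.wav")
+print(f"sample rate: {samplerate} Hz, duration: {len(wav) / samplerate:.2f} s")
+```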
diff --git a/examples/ljspeech/tts0/local/synthesize_e2e.sh b/examples/ljspeech/tts0/local/synthesize_e2e.sh
new file mode 100755
index 0000000000000000000000000000000000000000..73dfff603e279f211a8178182b10369428df2e88
--- /dev/null
+++ b/examples/ljspeech/tts0/local/synthesize_e2e.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+# TODO: dygraph-to-static conversion does not work well for tacotron2_ljspeech yet
+FLAGS_allocator_strategy=naive_best_fit \
+FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+python3 ${BIN_DIR}/../synthesize_e2e.py \
+ --am=tacotron2_ljspeech \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/speech_stats.npy \
+ --voc=pwgan_ljspeech \
+ --voc_config=pwg_ljspeech_ckpt_0.5/pwg_default.yaml \
+ --voc_ckpt=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \
+ --voc_stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \
+ --lang=en \
+ --text=${BIN_DIR}/../sentences_en.txt \
+ --output_dir=${train_output_path}/test_e2e \
+ --phones_dict=dump/phone_id_map.txt \
+ # --inference_dir=${train_output_path}/inference
\ No newline at end of file
diff --git a/examples/ljspeech/tts0/path.sh b/examples/ljspeech/tts0/path.sh
index 9cdbe256e11a068a5471fed1ac392804cf615ac1..a37cd21e3210967415d6842067f9294bfbce7b5e 100755
--- a/examples/ljspeech/tts0/path.sh
+++ b/examples/ljspeech/tts0/path.sh
@@ -9,5 +9,5 @@ export PYTHONDONTWRITEBYTECODE=1
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
-MODEL=new_tacotron2
+MODEL=tacotron2
export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
diff --git a/examples/ljspeech/tts3/README.md b/examples/ljspeech/tts3/README.md
index f3602c3478697f61fcd830a5d4f1894d01e810a9..f5e919c0fe45bff2fe66a599e8bf1c4030a0c91d 100644
--- a/examples/ljspeech/tts3/README.md
+++ b/examples/ljspeech/tts3/README.md
@@ -1,4 +1,4 @@
-# FastSpeech2 with the LJSpeech-1.1
+# FastSpeech2 with LJSpeech-1.1
This example contains code used to train a [Fastspeech2](https://arxiv.org/abs/2006.04558) model with [LJSpeech-1.1](https://keithito.com/LJ-Speech-Dataset/).
## Dataset
diff --git a/paddlespeech/t2s/exps/synthesize.py b/paddlespeech/t2s/exps/synthesize.py
index d6dd7af1051b60539f3c7bc5e8867faf8f37b78c..1c42a87c9ff7574326b25e2c3e6cf0edcb5bef4e 100644
--- a/paddlespeech/t2s/exps/synthesize.py
+++ b/paddlespeech/t2s/exps/synthesize.py
@@ -38,9 +38,9 @@ model_alias = {
"fastspeech2_inference":
"paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference",
"tacotron2":
- "paddlespeech.t2s.models.new_tacotron2:Tacotron2",
+ "paddlespeech.t2s.models.tacotron2:Tacotron2",
"tacotron2_inference":
- "paddlespeech.t2s.models.new_tacotron2:Tacotron2Inference",
+ "paddlespeech.t2s.models.tacotron2:Tacotron2Inference",
# voc
"pwgan":
"paddlespeech.t2s.models.parallel_wavegan:PWGGenerator",
diff --git a/paddlespeech/t2s/exps/synthesize_e2e.py b/paddlespeech/t2s/exps/synthesize_e2e.py
index 0b95a883299cf45c528b54e42be11eaed0d2b5b7..75c631b847a1459ec56fbb32f97c88ce6ee8fce9 100644
--- a/paddlespeech/t2s/exps/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/synthesize_e2e.py
@@ -39,9 +39,9 @@ model_alias = {
"fastspeech2_inference":
"paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference",
"tacotron2":
- "paddlespeech.t2s.models.new_tacotron2:Tacotron2",
+ "paddlespeech.t2s.models.tacotron2:Tacotron2",
"tacotron2_inference":
- "paddlespeech.t2s.models.new_tacotron2:Tacotron2Inference",
+ "paddlespeech.t2s.models.tacotron2:Tacotron2Inference",
# voc
"pwgan":
"paddlespeech.t2s.models.parallel_wavegan:PWGGenerator",
@@ -229,6 +229,11 @@ def evaluate(args):
output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
merge_sentences = False
+    # Merge sentences to avoid tacotron2_ljspeech failing to stop at the end of a
+    # sub-sentence after dygraph-to-static conversion; it may still fail to stop
+    # at the very end (NOTE by yuantian01 Feb 9 2022)
+ if am_name == 'tacotron2':
+ merge_sentences = True
+
for utt_id, sentence in sentences:
get_tone_ids = False
if am_name == 'speedyspeech':
diff --git a/paddlespeech/t2s/exps/new_tacotron2/__init__.py b/paddlespeech/t2s/exps/tacotron2/__init__.py
similarity index 100%
rename from paddlespeech/t2s/exps/new_tacotron2/__init__.py
rename to paddlespeech/t2s/exps/tacotron2/__init__.py
diff --git a/paddlespeech/t2s/exps/new_tacotron2/normalize.py b/paddlespeech/t2s/exps/tacotron2/normalize.py
similarity index 100%
rename from paddlespeech/t2s/exps/new_tacotron2/normalize.py
rename to paddlespeech/t2s/exps/tacotron2/normalize.py
diff --git a/paddlespeech/t2s/exps/new_tacotron2/preprocess.py b/paddlespeech/t2s/exps/tacotron2/preprocess.py
similarity index 100%
rename from paddlespeech/t2s/exps/new_tacotron2/preprocess.py
rename to paddlespeech/t2s/exps/tacotron2/preprocess.py
diff --git a/paddlespeech/t2s/exps/new_tacotron2/train.py b/paddlespeech/t2s/exps/tacotron2/train.py
similarity index 97%
rename from paddlespeech/t2s/exps/new_tacotron2/train.py
rename to paddlespeech/t2s/exps/tacotron2/train.py
index a77331e746f23fc56025e8a07d1f7c22f0304f2a..bf4c4e01df72112f68046dc4bd2c201058b77c85 100644
--- a/paddlespeech/t2s/exps/new_tacotron2/train.py
+++ b/paddlespeech/t2s/exps/tacotron2/train.py
@@ -30,9 +30,9 @@ from yacs.config import CfgNode
from paddlespeech.t2s.datasets.am_batch_fn import tacotron2_multi_spk_batch_fn
from paddlespeech.t2s.datasets.am_batch_fn import tacotron2_single_spk_batch_fn
from paddlespeech.t2s.datasets.data_table import DataTable
-from paddlespeech.t2s.models.new_tacotron2 import Tacotron2
-from paddlespeech.t2s.models.new_tacotron2 import Tacotron2Evaluator
-from paddlespeech.t2s.models.new_tacotron2 import Tacotron2Updater
+from paddlespeech.t2s.models.tacotron2 import Tacotron2
+from paddlespeech.t2s.models.tacotron2 import Tacotron2Evaluator
+from paddlespeech.t2s.models.tacotron2 import Tacotron2Updater
from paddlespeech.t2s.training.extensions.snapshot import Snapshot
from paddlespeech.t2s.training.extensions.visualizer import VisualDL
from paddlespeech.t2s.training.optimizer import build_optimizers
diff --git a/paddlespeech/t2s/exps/voice_cloning.py b/paddlespeech/t2s/exps/voice_cloning.py
index d6733a9487704decb838e6d5f0efb49de4cda272..3de30774f5b7610cfd724b62c57eafbd5e6e667f 100644
--- a/paddlespeech/t2s/exps/voice_cloning.py
+++ b/paddlespeech/t2s/exps/voice_cloning.py
@@ -34,9 +34,9 @@ model_alias = {
"fastspeech2_inference":
"paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference",
"tacotron2":
- "paddlespeech.t2s.models.new_tacotron2:Tacotron2",
+ "paddlespeech.t2s.models.tacotron2:Tacotron2",
"tacotron2_inference":
- "paddlespeech.t2s.models.new_tacotron2:Tacotron2Inference",
+ "paddlespeech.t2s.models.tacotron2:Tacotron2Inference",
# voc
"pwgan":
"paddlespeech.t2s.models.parallel_wavegan:PWGGenerator",
diff --git a/paddlespeech/t2s/frontend/phonectic.py b/paddlespeech/t2s/frontend/phonectic.py
index 254138713d123fef0e60ac7110a5ef9cbfe96dcd..a488a6fc0f469905e8e67c8f034dd97f7edef24f 100644
--- a/paddlespeech/t2s/frontend/phonectic.py
+++ b/paddlespeech/t2s/frontend/phonectic.py
@@ -83,11 +83,6 @@ class English(Phonetics):
return phonemes
def _p2id(self, phonemes: List[str]) -> np.array:
- # replace unk phone with sp
- phonemes = [
- phn if (phn in self.vocab_phones and phn not in self.punc) else "sp"
- for phn in phonemes
- ]
phone_ids = [self.vocab_phones[item] for item in phonemes]
return np.array(phone_ids, np.int64)
@@ -102,6 +97,12 @@ class English(Phonetics):
# remove start_symbol and end_symbol
phones = phones[1:-1]
phones = [phn for phn in phones if not phn.isspace()]
+ # replace unk phone with sp
+ phones = [
+ phn
+ if (phn in self.vocab_phones and phn not in self.punc) else "sp"
+ for phn in phones
+ ]
phones_list.append(phones)
if merge_sentences:
diff --git a/paddlespeech/t2s/models/__init__.py b/paddlespeech/t2s/models/__init__.py
index 3b90a414c493f55de215efe4d72cf107a81fa57b..41be7c1db92508271121047ba997ac95fc064505 100644
--- a/paddlespeech/t2s/models/__init__.py
+++ b/paddlespeech/t2s/models/__init__.py
@@ -14,9 +14,9 @@
from .fastspeech2 import *
from .hifigan import *
from .melgan import *
-from .new_tacotron2 import *
from .parallel_wavegan import *
from .speedyspeech import *
+from .tacotron2 import *
from .transformer_tts import *
from .waveflow import *
from .wavernn import *
diff --git a/paddlespeech/t2s/models/new_tacotron2/__init__.py b/paddlespeech/t2s/models/tacotron2/__init__.py
similarity index 100%
rename from paddlespeech/t2s/models/new_tacotron2/__init__.py
rename to paddlespeech/t2s/models/tacotron2/__init__.py
diff --git a/paddlespeech/t2s/models/new_tacotron2/tacotron2.py b/paddlespeech/t2s/models/tacotron2/tacotron2.py
similarity index 100%
rename from paddlespeech/t2s/models/new_tacotron2/tacotron2.py
rename to paddlespeech/t2s/models/tacotron2/tacotron2.py
diff --git a/paddlespeech/t2s/models/new_tacotron2/tacotron2_updater.py b/paddlespeech/t2s/models/tacotron2/tacotron2_updater.py
similarity index 100%
rename from paddlespeech/t2s/models/new_tacotron2/tacotron2_updater.py
rename to paddlespeech/t2s/models/tacotron2/tacotron2_updater.py
diff --git a/paddlespeech/t2s/modules/tacotron2/decoder.py b/paddlespeech/t2s/modules/tacotron2/decoder.py
index 3622fd7a247c3785a9c85900d97c513f30a13bd8..0cfe0b842735b50fd4d284a7e7f85cd557ba08b4 100644
--- a/paddlespeech/t2s/modules/tacotron2/decoder.py
+++ b/paddlespeech/t2s/modules/tacotron2/decoder.py
@@ -395,9 +395,6 @@ class Decoder(nn.Layer):
iunits, odim * reduction_factor, bias_attr=False)
self.prob_out = nn.Linear(iunits, reduction_factor)
- # initialize
- # self.apply(decoder_init)
-
def _zero_state(self, hs):
init_hs = paddle.zeros([paddle.shape(hs)[0], self.lstm[0].hidden_size])
return init_hs
@@ -558,8 +555,11 @@ class Decoder(nn.Layer):
assert len(paddle.shape(h)) == 2
hs = h.unsqueeze(0)
ilens = paddle.shape(h)[0]
- maxlen = int(paddle.shape(h)[0] * maxlenratio)
- minlen = int(paddle.shape(h)[0] * minlenratio)
+    # maxlen and minlen were originally wrapped in int(); the cast is removed here to avoid dygraph-to-static issues
+ maxlen = paddle.shape(h)[0] * maxlenratio
+ minlen = paddle.shape(h)[0] * minlenratio
+    # threshold was originally used directly; it is converted to a tensor here to avoid dygraph-to-static issues
+ threshold = paddle.ones([1]) * threshold
# initialize hidden states of decoder
c_list = [self._zero_state(hs)]
@@ -645,11 +645,27 @@ class Decoder(nn.Layer):
if use_att_constraint:
last_attended_idx = int(att_w.argmax())
+                # the tacotron2_ljspeech dygraph-to-static issue is probably caused by prob >= threshold not being evaluated correctly here
if prob >= threshold or idx >= maxlen:
# check mininum length
if idx < minlen:
continue
break
+                """
+                If only the code block at lines 665~667 is uncommented, dygraph-to-static conversion hangs,
+                but the dynamic graph generates audio correctly, which proves the model itself is fine.
+                If the blocks at lines 665~667 and 668~670 are both uncommented, dygraph-to-static conversion
+                no longer hangs, but the generated audio has extra noise at the end.
+                This proves the dygraph-to-static run never enters the prob >= threshold check, while the
+                static graph can enter prob >= threshold and exit the loop.
+                The dygraph-to-static run exits the loop via idx >= maxlen (so without this logic it loops
+                forever, i.e. it hangs); it stops when the maximum length is exceeded instead of when the
+                model decides to stop, so the synthesized audio ends with a long stretch of extra predicted noise.
+                With prob <= threshold as the condition the dygraph-to-static run can exit the loop (although
+                the result is incorrect), which proves the types of the condition's operands are fine; prob
+                itself may be the problem.
+                """
+ # if prob >= threshold:
+ # print("prob >= threshold")
+ # break
+ # elif idx >= maxlen:
+ # print("idx >= maxlen")
+ # break
+
# (1, odim, L)
outs = paddle.concat(outs, axis=2)
if self.postnet is not None: