Unverified commit e3954624 authored by Hui Zhang, committed by GitHub

Merge pull request #932 from yt605155624/merge_parakeet

[tts] refactor parakeet example
......@@ -355,7 +355,6 @@ if not hasattr(paddle.Tensor, 'tolist'):
"register user tolist to paddle.Tensor, remove this when fixed!")
setattr(paddle.Tensor, 'tolist', tolist)
########### hack paddle.nn #############
from paddle.nn import Layer
from typing import Optional
......@@ -506,5 +505,3 @@ if not hasattr(paddle.nn, 'LayerDict'):
logger.debug(
"register user LayerDict to paddle.nn, remove this when fixed!")
setattr(paddle.nn, 'LayerDict', LayerDict)
......@@ -12,12 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""V2 backend for `asr_recog.py` using py:class:`decoders.beam_search.BeamSearch`."""
import json
from pathlib import Path
import jsonlines
import paddle
import yaml
from yacs.config import CfgNode
from .beam_search import BatchBeamSearch
......@@ -79,8 +75,7 @@ def recog_v2(args):
sort_in_input_length=False,
preprocess_conf=confs.collator.augmentation_config
if args.preprocess_conf is None else args.preprocess_conf,
preprocess_args={"train": False},
)
preprocess_args={"train": False}, )
if args.rnnlm:
lm_args = get_model_conf(args.rnnlm, args.rnnlm_conf)
......@@ -113,8 +108,7 @@ def recog_v2(args):
ctc=args.ctc_weight,
lm=args.lm_weight,
ngram=args.ngram_weight,
length_bonus=args.penalty,
)
length_bonus=args.penalty, )
beam_search = BeamSearch(
beam_size=args.beam_size,
vocab_size=len(char_list),
......@@ -123,8 +117,7 @@ def recog_v2(args):
sos=model.sos,
eos=model.eos,
token_list=char_list,
pre_beam_score_key=None if args.ctc_weight == 1.0 else "full",
)
pre_beam_score_key=None if args.ctc_weight == 1.0 else "full", )
# TODO(karita): make all scorers batchfied
if args.batchsize == 1:
......@@ -171,9 +164,10 @@ def recog_v2(args):
logger.info(f'feat: {feat.shape}')
enc = model.encode(paddle.to_tensor(feat).to(dtype))
logger.info(f'eout: {enc.shape}')
nbest_hyps = beam_search(x=enc,
maxlenratio=args.maxlenratio,
minlenratio=args.minlenratio)
nbest_hyps = beam_search(
x=enc,
maxlenratio=args.maxlenratio,
minlenratio=args.minlenratio)
nbest_hyps = [
h.asdict()
for h in nbest_hyps[:min(len(nbest_hyps), args.nbest)]
......@@ -183,9 +177,8 @@ def recog_v2(args):
item = new_js[name]['output'][0] # 1-best
ref = item['text']
rec_text = item['rec_text'].replace('▁',
' ').replace('<eos>',
'').strip()
rec_text = item['rec_text'].replace('▁', ' ').replace(
'<eos>', '').strip()
rec_tokenid = list(map(int, item['rec_tokenid'].split()))
f.write({
"utt": name,
......
......@@ -21,7 +21,7 @@ from distutils.util import strtobool
import configargparse
import numpy as np
from .recog import recog_v2
from deepspeech.decoders.recog import recog_v2
def get_parser():
......@@ -359,7 +359,7 @@ def main(args):
if args.num_encs == 1:
# Experimental API that supports custom LMs
if args.api == "v2":
from deepspeech.decoders.recog import recog_v2
recog_v2(args)
else:
raise ValueError("Only support --api v2")
......
......@@ -11,9 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Union
import paddle
from paddle import nn
from typing import Union
from paddle.nn import functional as F
from typeguard import check_argument_types
......
# Aishell3
* tts0 - fastspeech2
* vc0 - tacotron2 voice clone
* tts0 - Tacotron2
* tts1 - TransformerTTS
* tts2 - SpeedySpeech
* tts3 - FastSpeech2
* voc0 - WaveFlow
* voc1 - Parallel WaveGAN
* voc2 - MelGAN
* voc3 - MultiBand MelGAN
* vc0 - Tacotron2 Voice Clone with GE2E
......@@ -18,12 +18,23 @@ tar zxvf data_aishell3.tgz -C data_aishell3
### Get MFA result of AISHELL-3 and Extract it
We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for aishell3_fastspeech2.
You can download it from here: [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/use_mfa) (which uses MFA1.x for now) in our repo.
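To make the conversion from MFA output to durations concrete, here is a minimal sketch (not the actual `gen_duration_from_textgrid.py`) of turning aligner intervals into per-phone frame counts; the sample rate, hop size, and intervals below are made-up illustrative values.
```python
# A minimal sketch: convert the (start, end, phone) intervals produced by a forced
# aligner into per-phone frame durations. Sample rate, hop size and intervals are
# made-up illustrative values; the real conversion is done by the utility script.
import numpy as np

sample_rate = 24000                                   # assumed audio sample rate
hop_size = 300                                        # assumed hop size in samples

intervals = [(0.00, 0.10, "sil"), (0.10, 0.25, "zh"), (0.25, 0.48, "ong1")]

# rounding at interval boundaries keeps the summed durations consistent with
# the number of frames produced by feature extraction
boundaries = [0] + [int(round(end * sample_rate / hop_size)) for _, end, _ in intervals]
durations = np.diff(boundaries)

print(list(zip([phone for _, _, phone in intervals], durations.tolist())))
# [('sil', 8), ('zh', 12), ('ong1', 18)]
```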
### Preprocess the dataset
## Get Started
Assume the path to the dataset is `~/datasets/data_aishell3`.
Assume the path to the MFA result of AISHELL-3 is `./aishell3_alignment_tone`.
Run the command below to preprocess the dataset.
Run the command below to
1. **source path**,
2. preprocess the dataset,
3. train the model,
4. synthesize wavs:
    - synthesize waveform from `metadata.jsonl`,
    - synthesize waveform from a text file.
```bash
./preprocess.sh
./run.sh
```
### Preprocess the dataset
```bash
./local/preprocess.sh ${conf_path}
```
When it is done, a `dump` folder is created in the current directory. The structure of the dump folder is listed below.
```text
......@@ -47,10 +58,10 @@ The dataset is split into 3 parts, namely `train`, `dev` and `test`, each of whi
Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains phones, text_lengths, speech_lengths, durations, path of speech features, path of pitch features, path of energy features, speaker and id of each utterance.
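If you want to inspect these metadata files, the sketch below may help. It only assumes that each line of `metadata.jsonl` is one JSON object per utterance; the exact field names may differ slightly from the description above, so print the keys to check.
```python
# A minimal sketch for inspecting a metadata.jsonl file produced by preprocessing.
# It only assumes one JSON object per line; field names are best checked by printing.
import json

with open("dump/train/norm/metadata.jsonl", encoding="utf-8") as f:
    records = [json.loads(line) for line in f if line.strip()]

print(f"{len(records)} utterances")
print(sorted(records[0].keys()))   # e.g. phones, durations, feature paths, speaker, ...
```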
## Train the model
`./run.sh` calls `../train.py`.
### Train the model
`./local/train.sh` calls `${BIN_DIR}/train.py`.
```bash
./run.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
```
Here's the complete help message.
```text
......@@ -85,20 +96,8 @@ optional arguments:
5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'.
6. `--phones-dict` is the path of the phone vocabulary file.
7. `--speaker-dict` is the path of the speaker id map file when training a multi-speaker FastSpeech2 (see the sketch below).
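A hedged sketch of how such a speaker id map can be consumed during multi-speaker training is shown below; the one-`name id`-pair-per-line format and the embedding size are assumptions, and the real handling lives in `${BIN_DIR}/train.py`.
```python
# A hedged sketch of consuming a speaker id map for multi-speaker FastSpeech2.
# The "name id" per-line format and the 256-dim speaker embedding are assumptions.
import numpy as np

speaker_id_map = "SSB0005 0\nSSB0009 1\n"      # stand-in for dump/speaker_id_map.txt
spk2id = {name: int(idx) for name, idx in
          (line.split() for line in speaker_id_map.splitlines())}

speaker_table = np.random.randn(len(spk2id), 256).astype("float32")  # learned in practice
spk_embedding = speaker_table[spk2id["SSB0009"]]                     # one lookup per utterance
print(spk_embedding.shape)                                           # (256,)
```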
## Pretrained Model
Pretrained FastSpeech2 model with no silence at the edges of audios. [fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_ckpt_0.4.zip)
FastSpeech2 checkpoint contains files listed below.
```text
fastspeech2_nosil_aishell3_ckpt_0.4
├── default.yaml # default config used to train fastspeech2
├── phone_id_map.txt # phone vocabulary file when training fastspeech2
├── snapshot_iter_96400.pdz # model parameters and optimizer states
├── speaker_id_map.txt # speaker id map file when training a multi-speaker fastspeech2
└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2
```
## Synthesize
### Synthesize
We use [parallel wavegan](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/parallelwave_gan/baker) as the neural vocoder.
Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip) and unzip it.
```bash
......@@ -111,9 +110,9 @@ pwg_baker_ckpt_0.4
├── pwg_snapshot_iter_400000.pdz # model parameters of parallel wavegan
└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan
```
`synthesize.sh` calls `synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
```bash
./synthesize.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
```
```text
usage: synthesize.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG]
......@@ -153,22 +152,22 @@ optional arguments:
--device DEVICE device type to use.
--verbose VERBOSE verbose.
```
`synthesize_e2e.sh` calls `synthesize_e2e.py`, which can synthesize waveform from text file.
`./local/synthesize_e2e.sh` calls `${BIN_DIR}/multi_spk_synthesize_e2e.py`, which can synthesize waveform from text file.
```bash
./synthesize_e2e.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name}
```
```text
usage: synthesize_e2e.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG]
[--fastspeech2-checkpoint FASTSPEECH2_CHECKPOINT]
[--fastspeech2-stat FASTSPEECH2_STAT]
[--pwg-config PWG_CONFIG]
[--pwg-checkpoint PWG_CHECKPOINT]
[--pwg-stat PWG_STAT] [--phones-dict PHONES_DICT]
[--speaker-dict SPEAKER_DICT] [--text TEXT]
[--output-dir OUTPUT_DIR] [--device DEVICE]
[--verbose VERBOSE]
usage: multi_spk_synthesize_e2e.py [-h]
[--fastspeech2-config FASTSPEECH2_CONFIG]
[--fastspeech2-checkpoint FASTSPEECH2_CHECKPOINT]
[--fastspeech2-stat FASTSPEECH2_STAT]
[--pwg-config PWG_CONFIG]
[--pwg-checkpoint PWG_CHECKPOINT]
[--pwg-stat PWG_STAT]
[--phones-dict PHONES_DICT]
[--speaker-dict SPEAKER_DICT] [--text TEXT]
[--output-dir OUTPUT_DIR] [--device DEVICE]
[--verbose VERBOSE]
Synthesize with fastspeech2 & parallel wavegan.
......@@ -204,24 +203,38 @@ optional arguments:
5. `--output-dir` is the directory to save synthesized audio files.
6. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis.
You can use the following scripts to synthesize for `../sentences.txt` using pretrained fastspeech2 and parallel wavegan models.
## Pretrained Model
Pretrained FastSpeech2 model with no silence at the edges of audios. [fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_ckpt_0.4.zip)
FastSpeech2 checkpoint contains files listed below.
```text
fastspeech2_nosil_aishell3_ckpt_0.4
├── default.yaml # default config used to train fastspeech2
├── phone_id_map.txt # phone vocabulary file when training fastspeech2
├── snapshot_iter_96400.pdz # model parameters and optimizer states
├── speaker_id_map.txt # speaker id map file when training a multi-speaker fastspeech2
└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2
```
You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained fastspeech2 and parallel wavegan models.
```bash
source path.sh
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 synthesize_e2e.py \
python3 ${BIN_DIR}/multi_spk_synthesize_e2e.py \
--fastspeech2-config=fastspeech2_nosil_aishell3_ckpt_0.4/default.yaml \
--fastspeech2-checkpoint=fastspeech2_nosil_aishell3_ckpt_0.4/snapshot_iter_96400.pdz \
--fastspeech2-stat=fastspeech2_nosil_aishell3_ckpt_0.4/speech_stats.npy \
--pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
--pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--text=../sentences.txt \
--text=${BIN_DIR}/../sentences.txt \
--output-dir=exp/default/test_e2e \
--device="gpu" \
--phones-dict=fastspeech2_nosil_aishell3_ckpt_0.4/phone_id_map.txt \
--speaker-dict=fastspeech2_nosil_aishell3_ckpt_0.4/speaker_id_map.txt
```
## Future work
A multi-speaker vocoder is needed.
......@@ -3,7 +3,7 @@
stage=0
stop_stage=100
export MAIN_ROOT=`realpath ${PWD}/../../../`
config_path=$1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# get durations from MFA's result
......@@ -11,18 +11,18 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
--inputdir=./aishell3_alignment_tone \
--output durations.txt \
--config=conf/default.yaml
--config=${config_path}
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# extract features
echo "Extract features ..."
python3 ../preprocess.py \
python3 ${BIN_DIR}/preprocess.py \
--dataset=aishell3 \
--rootdir=~/datasets/data_aishell3/ \
--dumpdir=dump \
--dur-file=durations.txt \
--config=conf/default.yaml \
--config=${config_path} \
--num-cpu=20 \
--cut-sil=True
fi
......@@ -46,7 +46,7 @@ fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# normalize and convert phone/speaker to id, dev and test should use train's stats
echo "Normalize ..."
python3 ../normalize.py \
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/train/raw/metadata.jsonl \
--dumpdir=dump/train/norm \
--speech-stats=dump/train/speech_stats.npy \
......@@ -55,7 +55,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt
python3 ../normalize.py \
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/dev/raw/metadata.jsonl \
--dumpdir=dump/dev/norm \
--speech-stats=dump/train/speech_stats.npy \
......@@ -64,7 +64,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt
python3 ../normalize.py \
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/test/raw/metadata.jsonl \
--dumpdir=dump/test/norm \
--speech-stats=dump/train/speech_stats.npy \
......
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ../synthesize.py \
--fastspeech2-config=conf/default.yaml \
--fastspeech2-checkpoint=exp/default/checkpoints/snapshot_iter_96400.pdz \
python3 ${BIN_DIR}/synthesize.py \
--fastspeech2-config=${config_path} \
--fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
--fastspeech2-stat=dump/train/speech_stats.npy \
--pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
--pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--test-metadata=dump/test/norm/metadata.jsonl \
--output-dir=exp/default/test \
--output-dir=${train_output_path}/test \
--device="gpu" \
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 synthesize_e2e.py \
--fastspeech2-config=conf/default.yaml \
--fastspeech2-checkpoint=exp/default/checkpoints/snapshot_iter_96400.pdz \
python3 ${BIN_DIR}/multi_spk_synthesize_e2e.py \
--fastspeech2-config=${config_path} \
--fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
--fastspeech2-stat=dump/train/speech_stats.npy \
--pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
--pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--text=../sentences.txt \
--output-dir=exp/default/test_e2e \
--text=${BIN_DIR}/../sentences.txt \
--output-dir=${train_output_path}/test_e2e \
--device="gpu" \
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt
#!/bin/bash
python3 ../train.py \
config_path=$1
train_output_path=$2
python3 ${BIN_DIR}/train.py \
--train-metadata=dump/train/norm/metadata.jsonl \
--dev-metadata=dump/dev/norm/metadata.jsonl \
--config=conf/default.yaml \
--output-dir=exp/default \
--config=${config_path} \
--output-dir=${train_output_path} \
--nprocs=2 \
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt
#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=fastspeech2
export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL}
#!/bin/bash
set -e
source path.sh
gpus=0,1
stage=0
stop_stage=100
conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_482.pdz
# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this cannot be mixed with `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
./local/preprocess.sh ${conf_path} || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # train model, all `ckpt` files are saved under the `train_output_path/checkpoints/` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize, vocoder is pwgan
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# synthesize_e2e, vocoder is pwgan
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
# Tacotron2 + AISHELL-3 Voice Cloning
This example contains code used to train a [Tacotron2](https://arxiv.org/abs/1712.05884) model with [AISHELL-3](http://www.aishelltech.com/aishell_3). The trained model can be used for the voice cloning task. We refer to the model structure of [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf). The general steps are as follows:
1. Speaker Encoder: We train a speaker encoder with a speaker verification task. The datasets used for this task are different from those used for Tacotron2; since transcriptions are not needed, we use more datasets. Refer to [ge2e](../../other/ge2e).
2. Synthesizer: We then use the trained speaker encoder to generate an utterance embedding for each sentence in AISHELL-3. This embedding is an extra input to Tacotron2 and is concatenated with the encoder outputs (see the numpy sketch after this list).
3. Vocoder: We use WaveFlow as the neural vocoder; refer to [waveflow](../../ljspeech/voc0).
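The speaker conditioning in step 2 can be illustrated with a small numpy sketch; the shapes are illustrative assumptions, and the real implementation is the Tacotron2 model in this repo.
```python
# A minimal numpy sketch of step 2: the utterance embedding from the speaker encoder
# is broadcast over time and concatenated with the Tacotron2 encoder outputs.
# All shapes are illustrative assumptions.
import numpy as np

T, enc_dim, spk_dim = 50, 512, 256                            # assumed text length / feature sizes
encoder_outputs = np.random.randn(T, enc_dim).astype("float32")
utt_embedding = np.random.randn(spk_dim).astype("float32")    # produced by GE2E inference

# repeat the embedding for every encoder step, then concatenate along the feature axis
conditioned = np.concatenate([encoder_outputs, np.tile(utt_embedding, (T, 1))], axis=-1)
print(conditioned.shape)                                      # (50, 768), consumed by the decoder
```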
## Get Started
Assume the path to the dataset is `~/datasets/data_aishell3`.
Assume the path to the MFA result of AISHELL-3 is `./alignment`.
Assume the path to the pretrained ge2e model is `ge2e_ckpt_path=./ge2e_ckpt_0.3/step-3000000`
Run the command below to
1. **source path**,
2. preprocess the dataset,
3. train the model,
4. start a voice cloning inference.
```bash
./run.sh
```
### Preprocess the dataset
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/preprocess.sh ${input} ${preprocess_path} ${alignment} ${ge2e_ckpt_path}
```
#### generate utterance embedding
Use the pretrained GE2E (speaker encoder) to generate an utterance embedding for each sentence in AISHELL-3. The embeddings have the same file structure as the wav files and are stored in `.npy` format.
```bash
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${BIN_DIR}/../ge2e/inference.py \
--input=${input} \
--output=${preprocess_path}/embed \
--device="gpu" \
--checkpoint_path=${ge2e_ckpt_path}
fi
```
Computing the utterance embeddings can take x hours.
#### process wav
There is silence at the edges of AISHELL-3's wavs, and the audio amplitude is very small, so we need to remove the silence and normalize the audio. You could remove the silence with a volume- or energy-based method, but the effect is not very good. Instead, we use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get the alignment between text and speech, then use the alignment results to remove the silence.
We use Montreal Forced Aligner 1.0. The labels in AISHELL-3 include pinyin, so the lexicon we provide to MFA uses pinyin rather than Chinese characters, and the prosody marks (`$` and `%`) need to be removed. You should preprocess the dataset into the format MFA needs: the text files have the same names as the wavs and have the suffix `.lab`.
We use [lexicon.txt](./lexicon.txt) as the lexicon.
You can download the alignment results from here: [alignment_aishell3.tar.gz](https://paddlespeech.bj.bcebos.com/Parakeet/alignment_aishell3.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/use_mfa) (which uses MFA1.x for now) in our repo.
```bash
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "Process wav ..."
python3 ${BIN_DIR}/process_wav.py \
--input=${input}/wav \
--output=${preprocess_path}/normalized_wav \
--alignment=${alignment}
fi
```
#### preprocess transcription
We convert the transcriptions into `phones` and `tones`. It is worth noting that our processing here is different from that used for MFA: we separate out the tones. This is just one processing method; of course, you could also split only initials and finals.
```bash
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
python3 ${BIN_DIR}/preprocess_transcription.py \
--input=${input} \
--output=${preprocess_path}
fi
```
The default input is `~/datasets/data_aishell3/train`, which contains `label_train-set.txt`. The processed results are `metadata.yaml` and `metadata.pickle`: the former is a text format for easy viewing, and the latter is a binary format for direct reading.
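To make the phone/tone separation described above concrete, here is a hedged sketch; the abbreviated initial list and the neutral-tone handling are assumptions, and the real rules live in `${BIN_DIR}/preprocess_transcription.py`.
```python
# A hedged sketch of splitting a toned pinyin syllable into (initial, final, tone).
# The initial list is abbreviated and the neutral-tone value "5" is an assumption.
_INITIALS = ["zh", "ch", "sh", "b", "p", "m", "f", "d", "t", "n", "l",
             "g", "k", "h", "j", "q", "x", "r", "z", "c", "s"]

def split_syllable(syllable: str):
    tone = syllable[-1] if syllable[-1].isdigit() else "5"   # "5" for neutral tone (assumed)
    base = syllable.rstrip("0123456789")
    for initial in _INITIALS:              # two-letter initials are listed first
        if base.startswith(initial):
            return initial, base[len(initial):], tone
    return "", base, tone                  # zero-initial syllable, e.g. "ai4"

print(split_syllable("zhong1"))            # ('zh', 'ong', '1')
print(split_syllable("ai4"))               # ('', 'ai', '4')
```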
#### extract mel
```bash
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
python3 ${BIN_DIR}/extract_mel.py \
--input=${preprocess_path}/normalized_wav \
--output=${preprocess_path}/mel
fi
```
### Train the model
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_path}
```
Our model removes the stop token prediction in Tacotron2, because stop token prediction suffers from an extremely unbalanced ratio of positive to negative samples and is very sensitive to how audio silence is clipped. Instead, we use the attention peak reaching the last symbol on the encoder side as the termination condition.
In addition, to accelerate the convergence of the model, we add a `guided attention loss` to induce the alignment between encoder and decoder to become diagonal faster.
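The guided attention loss mentioned above can be sketched as follows (in the spirit of Tachibana et al., 2017); the sigma value is only an illustrative choice, not necessarily what this example's config uses.
```python
# A minimal numpy sketch of a guided attention loss: attention mass far from the
# diagonal is penalised, which pushes the encoder-decoder alignment towards a
# diagonal faster. sigma is an illustrative value.
import numpy as np

def guided_attention_weight(T_dec, T_enc, sigma=0.2):
    # W[t, n] grows as the attended position n/T_enc drifts away from t/T_dec
    t = np.arange(T_dec)[:, None] / T_dec
    n = np.arange(T_enc)[None, :] / T_enc
    return 1.0 - np.exp(-((n - t) ** 2) / (2.0 * sigma ** 2))

attn = np.random.dirichlet(np.ones(40), size=120)      # fake (T_dec, T_enc) attention
loss = float(np.mean(guided_attention_weight(*attn.shape) * attn))
print(loss)
```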
### Inference
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${ge2e_params_path} ${tacotron2_params_path} ${waveflow_params_path} ${vc_input} ${vc_output}
```
## Pretrained Model
[tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_aishell3_ckpt_0.3.zip).
#!/bin/bash
stage=0
stop_stage=100
input=$1
preprocess_path=$2
alignment=$3
ge2e_ckpt_path=$4
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${BIN_DIR}/../../ge2e/inference.py \
--input=${input} \
--output=${preprocess_path}/embed \
--device="gpu" \
--checkpoint_path=${ge2e_ckpt_path}
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "Process wav ..."
python3 ${BIN_DIR}/process_wav.py \
--input=${input}/wav \
--output=${preprocess_path}/normalized_wav \
--alignment=${alignment}
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
python3 ${BIN_DIR}/preprocess_transcription.py \
--input=${input} \
--output=${preprocess_path}
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
python3 ${BIN_DIR}/extract_mel.py \
--input=${preprocess_path}/normalized_wav \
--output=${preprocess_path}/mel
fi
## Voice Cloning with Tacotron2 Trained on the AISHELL-3 Dataset
This experiment uses the AISHELL-3 dataset and a Tacotron 2 model for the voice cloning task. The overall model structure follows the paper [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf). The general steps are as follows:
1. Speaker Encoder: We train a speaker encoder with a speaker verification task. The datasets used here are different from the one used to train Tacotron 2; since transcriptions are not needed, we use more training data. See the implementation in [ge2e](../ge2e).
2. Synthesizer: We then use the trained speaker encoder to generate an utterance embedding for each sentence in the AISHELL-3 dataset. This embedding is an extra input to the Tacotron model and is concatenated with the encoder outputs.
3. Vocoder: We use WaveFlow as the vocoder; see the [waveflow](../waveflow) experiment.
## Data Processing
### Generating utterance embeddings
Use the trained speaker encoder to generate an utterance embedding for each sentence in the AISHELL-3 dataset. The embeddings are stored with the same directory structure as the audio folder, in `.npy` format.
First, cd into the [ge2e](../ge2e) folder, download the pretrained [model](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip), and run the script to generate the utterance embedding for each sentence.
```bash
python inference.py --input=<intput> --output=<output> --device="gpu" --checkpoint_path=<pretrained checkpoint>
```
Here, input is a directory that contains only the audio folders; `~/datasets/aishell3/train/wav` can be used. output is the folder used to store the utterance embeddings; `~/datasets/aishell3/train/embed` can be used. The utterance embeddings are stored with the same file structure as the audio folder, in `.npy` format.
Computing the utterance embeddings may take several hours, so please be patient.
### Audio Processing
The AISHELL-3 recordings have some leading and trailing silence and their amplitude is very small, so we need to remove the silence and normalize the volume. Silence removal could simply use a volume- or energy-based method, but the result is not very good, and it is hard to find a threshold that works consistently across sentences. Instead, we first use a forced aligner to align text and speech, and then cut the silence according to the alignment results.
The tool we use is Montreal Forced Aligner 1.0. Because the AISHELL-3 labels include pinyin annotations, we provide pinyin transcriptions rather than Chinese-character transcriptions to Montreal Forced Aligner, and the prosody marks (`$` and `%`) need to be removed. The data also has to be converted into the file format Montreal Forced Aligner requires: text files with the same names as the audio files and the `.lab` extension.
In addition, a lexicon file is needed, containing the mapping from pinyin sequences to phone sequences. Here we only split initials and finals, and the tone is treated as part of the final. The [lexicon file](./lexicon.txt) we use can be downloaded.
Once the data is ready, run training and alignment. First download [Montreal Forced Aligner 1.0](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/tag/v1.0.1), extract it, and it is ready to run. cd into its bin folder and run the command below to train and align. The first three command-line arguments are the path to the audio folder, the lexicon path, and the output path for the alignment files. The save path of the trained model can be passed with `-o`.
```bash
./mfa_train_and_align \
~/datasets/aishell3/train/wav \
lexicon.txt \
~/datasets/aishell3/train/alignment \
-o aishell3_model \
-v
```
Because training and alignment take a long time, we provide the resulting [alignment files](https://paddlespeech.bj.bcebos.com/Parakeet/alignment_aishell3.tar.gz); each sentence corresponds to a text file in `.TextGrid` format.
Once you have the alignment files, you can run the `process_wav.py` script to process the audio.
```bash
python process_wav.py --input=<input> --output=<output> --alignment=<alignment>
```
By default, input, output, and alignment are `~/datasets/aishell3/train/wav`, `~/datasets/aishell3/train/normalized_wav`, and `~/datasets/aishell3/train/alignment`, respectively.
After processing, the processed audio is saved in the `<output>` folder.
### Transcription Processing
Convert the transcriptions into phones and tones and store them. Note that the processing here is different from that used for Montreal Forced Aligner: we separate out the tones. This is just one possible processing method; of course, you could also split only initials and finals.
Run the script to process the transcriptions.
```bash
python preprocess_transcription.py --input=<input> --output=<output>
```
The default input is `~/datasets/aishell3/train`, which contains the `label_train-set.txt` file. The processed results are `metadata.yaml` and `metadata.pickle`: the former is a text format for easy viewing, and the latter is a binary format for direct reading.
### Mel Spectrogram Extraction
Extract mel spectrograms from the processed audio and store them with the same directory structure as the audio folder, in `.npy` format.
```bash
python extract_mel.py --input=<input> --output=<output>
```
input is the folder containing the processed audio, and output is the folder for the output spectrograms.
## Training
Run the training script.
```bash
python train.py --data=<data> --output=<output> --device="gpu"
```
Our model removes the stop token prediction from the Tacotron2 model. In practice, stop token prediction is an extremely imbalanced problem: a sentence may have hundreds of frames as negative samples and only one frame as a positive sample, and the prediction is very sensitive to how audio silence is clipped. Instead, we use the attention peak reaching the last symbol on the encoder side as the termination condition.
In addition, to accelerate model convergence, we add a guided attention loss to induce the alignment between encoder and decoder to become diagonal faster.
You can use visualdl to view the logs of the training process.
```bash
visualdl --logdir=<output> --host=$HOSTNAME
```
Example training loss / validation loss curves are shown below.
![train](./images/train.png)
![valid](./images/valid.png)
<img src="images/alignment-step2000.png" alt="alignment-step2000" style="zoom:50%;" />
From about step 2000 of training, a fuzzy diagonal can be observed in the alignments produced during validation. As training goes on, the diagonal becomes clearer. However, because validation is also run with teacher forcing, it takes longer to observe a diagonal in the alignments produced by true autoregressive synthesis.
## Pretrained Model
Pretrained model download link: [tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_aishell3_ckpt_0.3.zip).
## Usage
This experiment includes a simple usage example: you can replace the reference audio and the text and synthesize speech with the trained model. See the instructions in the [notebook](./voice_cloning.ipynb) for how to use it.
The source diff is too large to display; you can view the blob instead.
#!/bin/bash
preprocess_path=$1
train_output_path=$2
python3 ${BIN_DIR}/train.py \
--data=${preprocess_path} \
--output=${train_output_path} \
--device="gpu"
\ No newline at end of file
#!/bin/bash
ge2e_params_path=$1
tacotron2_params_path=$2
waveflow_params_path=$3
vc_input=$4
vc_output=$5
python3 ${BIN_DIR}/voice_cloning.py \
--ge2e_params_path=${ge2e_params_path} \
--tacotron2_params_path=${tacotron2_params_path} \
--waveflow_params_path=${waveflow_params_path} \
--input-dir=${vc_input} \
--output-dir=${vc_output}
\ No newline at end of file
#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=voice_cloning/tacotron2_ge2e
export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL}
#!/bin/bash
set -e
source path.sh
gpus=0
stage=0
stop_stage=100
input=~/datasets/data_aishell3/train
preprocess_path=dump
alignment=./alignment
# do not include ".pdparams" here
ge2e_ckpt_path=./ge2e_ckpt_0.3/step-3000000
train_output_path=output
# include ".pdparams" here
ge2e_params_path=${ge2e_ckpt_path}.pdparams
tacotron2_params_path=${train_output_path}/checkpoints/step-1000.pdparams
# pretrained model
# tacotron2_params_path=./tacotron2_aishell3_ckpt_0.3/step-450000.pdparams
waveflow_params_path=./waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams
vc_input=ref_audio
vc_output=syn_audio
# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this cannot be mixed with `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
CUDA_VISIBLE_DEVICES=${gpus} ./local/preprocess.sh ${input} ${preprocess_path} ${alignment} ${ge2e_ckpt_path} || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_path} || exit -1
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${ge2e_params_path} ${tacotron2_params_path} ${waveflow_params_path} ${vc_input} ${vc_output} || exit -1
fi
# CSMSC
* tts0 - Tacotron2
* tts1 - TransformerTTS
* tts2 - SpeedySpeech
* tts3 - FastSpeech2
* voc0 - WaveFlow
* voc1 - Parallel WaveGAN
* voc2 - MelGAN
* voc3 - MultiBand MelGAN
# Speedyspeech with CSMSC
This example contains code used to train a [Speedyspeech](http://arxiv.org/abs/2008.03802) model with [Chinese Standard Mandarin Speech Corpus](https://www.data-baker.com/open_source.html). NOTE that we only implement the student part of the Speedyspeech model. The ground truth alignment used to train the model is extracted from the dataset using [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner).
## Dataset
......@@ -10,12 +9,23 @@ Download CSMSC from its [Official Website](https://test.data-baker.com/data/ind
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for SpeedySpeech.
You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/use_mfa) in our repo.
## Preprocess the dataset
## Get Started
Assume the path to the dataset is `~/datasets/BZNSYP`.
Assume the path to the MFA result of CSMSC is `./baker_alignment_tone`.
Run the command below to preprocess the dataset.
Run the command below to
1. **source path**,
2. preprocess the dataset,
3. train the model,
4. synthesize wavs:
    - synthesize waveform from `metadata.jsonl`,
    - synthesize waveform from a text file,
5. run inference using the static model.
```bash
./preprocess.sh
./run.sh
```
### Preprocess the dataset
```bash
./local/preprocess.sh ${conf_path}
```
When it is done, a `dump` folder is created in the current directory. The structure of the dump folder is listed below.
......@@ -37,13 +47,12 @@ The dataset is split into 3 parts, namely `train`, `dev` and `test`, each of whi
Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains phones, tones, durations, path of spectrogram, and id of each utterance.
## Train the model
`./run.sh` calls `../train.py`.
### Train the model
`./local/train.sh` calls `${BIN_DIR}/train.py`.
```bash
./run.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
```
Here's the complete help message.
```text
usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
[--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
......@@ -81,20 +90,7 @@ optional arguments:
6. `--phones-dict` is the path of the phone vocabulary file.
7. `--tones-dict` is the path of the tone vocabulary file.
## Pretrained Model
Pretrained SpeedySpeech model with no silence at the edges of audios. [speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_ckpt_0.5.zip)
SpeedySpeech checkpoint contains files listed below.
```text
speedyspeech_nosil_baker_ckpt_0.5
├── default.yaml # default config used to train speedyspeech
├── feats_stats.npy # statistics used to normalize spectrogram when training speedyspeech
├── phone_id_map.txt # phone vocabulary file when training speedyspeech
├── snapshot_iter_11400.pdz # model parameters and optimizer states
└── tone_id_map.txt # tone vocabulary file when training speedyspeech
```
## Synthesize
### Synthesize
We use [parallel wavegan](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/parallelwave_gan/baker) as the neural vocoder.
Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip) and unzip it.
```bash
......@@ -107,9 +103,9 @@ pwg_baker_ckpt_0.4
├── pwg_snapshot_iter_400000.pdz # model parameters of parallel wavegan
└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan
```
`synthesize.sh` calls `../synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
```bash
./synthesize.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
```
```text
usage: synthesize.py [-h] [--speedyspeech-config SPEEDYSPEECH_CONFIG]
......@@ -152,9 +148,9 @@ optional arguments:
--device DEVICE device type to use
--verbose VERBOSE verbose
```
`synthesize_e2e.sh` calls `synthesize_e2e.py`, which can synthesize waveform from text file.
`./local/synthesize_e2e.sh` calls `${BIN_DIR}/synthesize_e2e.py`, which can synthesize waveform from text file.
```bash
./synthesize_e2e.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name}
```
```text
usage: synthesize_e2e.py [-h] [--speedyspeech-config SPEEDYSPEECH_CONFIG]
......@@ -203,21 +199,42 @@ optional arguments:
4. `--output-dir` is the directory to save synthesized audio files.
5. `--inference-dir` is the directory to save the exported model, which can be used with paddle inference.
6. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis.
6. `--phones-dict` is the path of the phone vocabulary file.
7. `--tones-dict` is the path of the tone vocabulary file.
7. `--phones-dict` is the path of the phone vocabulary file.
8. `--tones-dict` is the path of the tone vocabulary file.
### Inference
After synthesis, we will get the static models of speedyspeech and pwgan in `${train_output_path}/inference`.
`./local/inference.sh` calls `${BIN_DIR}/inference.py`, which provides a Paddle static-model inference example for speedyspeech + pwgan synthesis.
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path}
```
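For reference, the sketch below shows the general Paddle Inference API flow for one of the exported static models; the file names under `${train_output_path}/inference` and the single int64 phone-id input are assumptions, and the real pipeline, including the tone input and the pwgan stage, is in `${BIN_DIR}/inference.py`.
```python
# A hedged sketch of running an exported static model with the Paddle Inference API.
# File names and the single phone-id input are assumptions; the real speedyspeech
# model may also expect tone ids, and its output must still be fed to the pwgan model.
import numpy as np
from paddle.inference import Config, create_predictor

config = Config("exp/default/inference/speedyspeech.pdmodel",
                "exp/default/inference/speedyspeech.pdiparams")   # assumed file names
predictor = create_predictor(config)

phone_ids = np.array([[1, 2, 3, 4]], dtype="int64")               # fake phone ids
name = predictor.get_input_names()[0]
handle = predictor.get_input_handle(name)
handle.reshape(phone_ids.shape)
handle.copy_from_cpu(phone_ids)
predictor.run()

feats = predictor.get_output_handle(predictor.get_output_names()[0]).copy_to_cpu()
print(feats.shape)   # predicted acoustic features, to be passed to the vocoder model
```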
You can use the following scripts to synthesize for `../sentences.txt` using pretrained speedyspeech and parallel wavegan models.
## Pretrained Model
Pretrained SpeedySpeech model with no silence at the edges of audios. [speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_ckpt_0.5.zip)
SpeedySpeech checkpoint contains files listed below.
```text
speedyspeech_nosil_baker_ckpt_0.5
├── default.yaml # default config used to train speedyspeech
├── feats_stats.npy # statistics used to normalize spectrogram when training speedyspeech
├── phone_id_map.txt # phone vocabulary file when training speedyspeech
├── snapshot_iter_11400.pdz # model parameters and optimizer states
└── tone_id_map.txt # tone vocabulary file when training speedyspeech
```
You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained speedyspeech and parallel wavegan models.
```bash
source path.sh
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 synthesize_e2e.py \
python3 ${BIN_DIR}/synthesize_e2e.py \
--speedyspeech-config=speedyspeech_nosil_baker_ckpt_0.5/default.yaml \
--speedyspeech-checkpoint=speedyspeech_nosil_baker_ckpt_0.5/snapshot_iter_11400.pdz \
--speedyspeech-stat=speedyspeech_nosil_baker_ckpt_0.5/feats_stats.npy \
--pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
--pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--text=../sentences.txt \
--text=${BIN_DIR}/../sentences.txt \
--output-dir=exp/default/test_e2e \
--inference-dir=exp/default/inference \
--device="gpu" \
......
#!/bin/bash
python3 inference.py \
--inference-dir=exp/default/inference \
--text=../sentences.txt \
--output-dir=exp/default/pd_infer_out \
train_output_path=$1
python3 ${BIN_DIR}/inference.py \
--inference-dir=${train_output_path}/inference \
--text=${BIN_DIR}/../sentences.txt \
--output-dir=${train_output_path}/pd_infer_out \
--phones-dict=dump/phone_id_map.txt \
--tones-dict=dump/tone_id_map.txt
#!/bin/bash
stage=0
stop_stage=100
export MAIN_ROOT=`realpath ${PWD}/../../../`
config_path=$1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# get durations from MFA's result
......@@ -11,17 +12,17 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
--inputdir=./baker_alignment_tone \
--output=durations.txt \
--config=conf/default.yaml
--config=${config_path}
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "Extract features ..."
python3 ../preprocess.py \
python3 ${BIN_DIR}/preprocess.py \
--dataset=baker \
--rootdir=~/datasets/BZNSYP/ \
--dumpdir=dump \
--dur-file=durations.txt \
--config=conf/default.yaml \
--config=${config_path} \
--num-cpu=20 \
--cut-sil=True \
--use-relative-path=True
......@@ -38,7 +39,7 @@ fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# normalize and convert phone/tone to id, dev and test should use train's stats
echo "Normalize ..."
python3 ../normalize.py \
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/train/raw/metadata.jsonl \
--dumpdir=dump/train/norm \
--stats=dump/train/feats_stats.npy \
......@@ -46,7 +47,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--tones-dict=dump/tone_id_map.txt \
--use-relative-path=True
python3 ../normalize.py \
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/dev/raw/metadata.jsonl \
--dumpdir=dump/dev/norm \
--stats=dump/train/feats_stats.npy \
......@@ -54,7 +55,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--tones-dict=dump/tone_id_map.txt \
--use-relative-path=True
python3 ../normalize.py \
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/test/raw/metadata.jsonl \
--dumpdir=dump/test/norm \
--stats=dump/train/feats_stats.npy \
......
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ../synthesize.py \
--speedyspeech-config=conf/default.yaml \
--speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_11400.pdz \
python3 ${BIN_DIR}/synthesize.py \
--speedyspeech-config=${config_path} \
--speedyspeech-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
--speedyspeech-stat=dump/train/feats_stats.npy \
--pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
--pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--test-metadata=dump/test/norm/metadata.jsonl \
--output-dir=exp/default/test \
--inference-dir=exp/default/inference \
--output-dir=${train_output_path}/test \
--inference-dir=${train_output_path}/inference \
--phones-dict=dump/phone_id_map.txt \
--tones-dict=dump/tone_id_map.txt \
--device="gpu"
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python synthesize_e2e.py \
--speedyspeech-config=conf/default.yaml \
--speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_11400.pdz \
python3 ${BIN_DIR}/synthesize_e2e.py \
--speedyspeech-config=${config_path} \
--speedyspeech-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
--speedyspeech-stat=dump/train/feats_stats.npy \
--pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
--pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--text=../sentences.txt \
--output-dir=exp/default/test_e2e \
--inference-dir=exp/default/inference \
--text=${BIN_DIR}/../sentences.txt \
--output-dir=${train_output_path}/test_e2e \
--inference-dir=${train_output_path}/inference \
--device="gpu" \
--phones-dict=dump/phone_id_map.txt \
--tones-dict=dump/tone_id_map.txt
#!/bin/bash
python ../train.py \
config_path=$1
train_output_path=$2
python ${BIN_DIR}/train.py \
--train-metadata=dump/train/norm/metadata.jsonl \
--dev-metadata=dump/dev/norm/metadata.jsonl \
--config=conf/default.yaml \
--output-dir=exp/default \
--config=${config_path} \
--output-dir=${train_output_path} \
--nprocs=2 \
--phones-dict=dump/phone_id_map.txt \
--tones-dict=dump/tone_id_map.txt \
......
#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=speedyspeech
export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL}
\ No newline at end of file
#!/bin/bash
set -e
source path.sh
gpus=0,1
stage=0
stop_stage=100
conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_76.pdz
# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this cannot be mixed with `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
./local/preprocess.sh ${conf_path} || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # train model, all `ckpt` files are saved under the `train_output_path/checkpoints/` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize, vocoder is pwgan
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# synthesize_e2e, vocoder is pwgan
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# inference with static model
CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} || exit -1
fi
......@@ -9,13 +9,22 @@ Download CSMSC from its [Official Website](https://test.data-baker.com/data/ind
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for fastspeech2.
You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/use_mfa) in our repo.
### Preprocess the dataset
## Get Started
Assume the path to the dataset is `~/datasets/BZNSYP`.
Assume the path to the MFA result of CSMSC is `./baker_alignment_tone`.
Run the command below to preprocess the dataset.
Run the command below to
1. **source path**,
2. preprocess the dataset,
3. train the model,
4. synthesize wavs:
    - synthesize waveform from `metadata.jsonl`,
    - synthesize waveform from a text file.
```bash
./run.sh
```
### Preprocess the dataset
```bash
./preprocess.sh
./local/preprocess.sh ${conf_path}
```
When it is done, a `dump` folder is created in the current directory. The structure of the dump folder is listed below.
......@@ -40,11 +49,11 @@ The dataset is split into 3 parts, namely `train`, `dev` and `test`, each of whi
Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains phones, text_lengths, speech_lengths, durations, path of speech features, path of pitch features, path of energy features, speaker and id of each utterance.
## Train the model
`./run.sh` calls `../train.py`.
### Train the model
```bash
./run.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
```
`./local/train.sh` calls `${BIN_DIR}/train.py`.
Here's the complete help message.
```text
usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
......@@ -78,18 +87,7 @@ optional arguments:
5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'.
6. `--phones-dict` is the path of the phone vocabulary file.
## Pretrained Model
Pretrained FastSpeech2 model with no silence at the edges of audios. [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip)
FastSpeech2 checkpoint contains files listed below.
```text
fastspeech2_nosil_baker_ckpt_0.4
├── default.yaml # default config used to train fastspeech2
├── phone_id_map.txt # phone vocabulary file when training fastspeech2
├── snapshot_iter_76000.pdz # model parameters and optimizer states
└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2
```
## Synthesize
### Synthesize
We use [parallel wavegan](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/parallelwave_gan/baker) as the neural vocoder.
Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip) and unzip it.
```bash
......@@ -102,9 +100,9 @@ pwg_baker_ckpt_0.4
├── pwg_snapshot_iter_400000.pdz # model parameters of parallel wavegan
└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan
```
`synthesize.sh` calls `../synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
```bash
./synthesize.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
```
```text
usage: synthesize.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG]
......@@ -144,9 +142,9 @@ optional arguments:
--device DEVICE device type to use.
--verbose VERBOSE verbose.
```
`synthesize_e2e.sh` calls `synthesize_e2e.py`, which can synthesize waveform from text file.
`./local/synthesize_e2e.sh` calls `${BIN_DIR}/synthesize_e2e.py`, which can synthesize waveform from text file.
```bash
./synthesize_e2e.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name}
```
```text
usage: synthesize_e2e.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG]
......@@ -191,18 +189,31 @@ optional arguments:
5. `--output-dir` is the directory to save synthesized audio files.
6. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis.
You can use the following scripts to synthesize for `../sentences.txt` using pretrained fastspeech2 and parallel wavegan models.
## Pretrained Model
Pretrained FastSpeech2 model with no silence at the edges of audios. [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip)
FastSpeech2 checkpoint contains files listed below.
```text
fastspeech2_nosil_baker_ckpt_0.4
├── default.yaml # default config used to train fastspeech2
├── phone_id_map.txt # phone vocabulary file when training fastspeech2
├── snapshot_iter_76000.pdz # model parameters and optimizer states
└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2
```
You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained fastspeech2 and parallel wavegan models.
```bash
source path.sh
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 synthesize_e2e.py \
python3 ${BIN_DIR}/synthesize_e2e.py \
--fastspeech2-config=fastspeech2_nosil_baker_ckpt_0.4/default.yaml \
--fastspeech2-checkpoint=fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \
--fastspeech2-stat=fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \
--pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
--pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--text=../sentences.txt \
--text=${BIN_DIR}/../sentences.txt \
--output-dir=exp/default/test_e2e \
--device="gpu" \
--phones-dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
......
......@@ -3,7 +3,7 @@
stage=0
stop_stage=100
export MAIN_ROOT=`realpath ${PWD}/../../../`
config_path=$1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# get durations from MFA's result
......@@ -11,18 +11,18 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
--inputdir=./baker_alignment_tone \
--output=durations.txt \
--config=conf/default.yaml
--config=${config_path}
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# extract features
echo "Extract features ..."
python3 ../preprocess.py \
python3 ${BIN_DIR}/preprocess.py \
--dataset=baker \
--rootdir=~/datasets/BZNSYP/ \
--dumpdir=dump \
--dur-file=durations.txt \
--config=conf/default.yaml \
--config=${config_path} \
--num-cpu=20 \
--cut-sil=True
fi
......@@ -46,7 +46,7 @@ fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# normalize and convert phone/speaker to id, dev and test should use train's stats
echo "Normalize ..."
python3 ../normalize.py \
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/train/raw/metadata.jsonl \
--dumpdir=dump/train/norm \
--speech-stats=dump/train/speech_stats.npy \
......@@ -55,7 +55,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt
python3 ../normalize.py \
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/dev/raw/metadata.jsonl \
--dumpdir=dump/dev/norm \
--speech-stats=dump/train/speech_stats.npy \
......@@ -64,7 +64,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt
python3 ../normalize.py \
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/test/raw/metadata.jsonl \
--dumpdir=dump/test/norm \
--speech-stats=dump/train/speech_stats.npy \
......
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ../synthesize.py \
--fastspeech2-config=conf/default.yaml \
--fastspeech2-checkpoint=exp/default/checkpoints/snapshot_iter_76000.pdz \
python3 ${BIN_DIR}/synthesize.py \
--fastspeech2-config=${config_path} \
--fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
--fastspeech2-stat=dump/train/speech_stats.npy \
--pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
--pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--test-metadata=dump/test/norm/metadata.jsonl \
--output-dir=exp/default/test \
--output-dir=${train_output_path}/test \
--device="gpu" \
--phones-dict=dump/phone_id_map.txt
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 synthesize_e2e.py \
--fastspeech2-config=conf/default.yaml \
--fastspeech2-checkpoint=exp/default/checkpoints/snapshot_iter_153.pdz \
python3 ${BIN_DIR}/synthesize_e2e.py \
--fastspeech2-config=${config_path} \
--fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
--fastspeech2-stat=dump/train/speech_stats.npy \
--pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
--pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--text=../sentences.txt \
--output-dir=exp/default/test_e2e \
--text=${BIN_DIR}/../sentences.txt \
--output-dir=${train_output_path}/test_e2e \
--device="gpu" \
--phones-dict=dump/phone_id_map.txt
#!/bin/bash
python3 ../train.py \
config_path=$1
train_output_path=$2
python3 ${BIN_DIR}/train.py \
--train-metadata=dump/train/norm/metadata.jsonl \
--dev-metadata=dump/dev/norm/metadata.jsonl \
--config=conf/default.yaml \
--output-dir=exp/default \
--config=${config_path} \
--output-dir=${train_output_path} \
--nprocs=1 \
--phones-dict=dump/phone_id_map.txt
\ No newline at end of file
#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=fastspeech2
export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL}
#!/bin/bash
set -e
source path.sh
gpus=0,1
stage=0
stop_stage=100
conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_153.pdz
# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this cannot be mixed with `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
bash ./local/preprocess.sh ${conf_path} || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # train model, all `ckpt` files are saved under the `train_output_path/checkpoints/` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize, vocoder is pwgan
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# synthesize_e2e, vocoder is pwgan
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
# Parallel WaveGAN with CSMSC
This example contains code used to train a [parallel wavegan](http://arxiv.org/abs/1910.11480) model with [Chinese Standard Mandarin Speech Corpus](https://www.data-baker.com/open_source.html).
## Preprocess the dataset
## Dataset
### Download and Extract the dataset
Download CSMSC from the [official website](https://www.data-baker.com/data/index/source) and extract it to `~/datasets`. Then the dataset is in directory `~/datasets/BZNSYP`.
......@@ -8,12 +8,21 @@ Download CSMSC from the [official website](https://www.data-baker.com/data/index
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut the silence at the edges of audio; a rough sketch of this step follows below.
You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/use_mfa) in our repo.
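As a rough illustration of how an alignment is used to cut edge silence, here is a minimal sketch; the sample rate and boundary times are made up, and the actual trimming is done by the preprocessing script when `--cut-sil=True` is passed.
```python
# A minimal sketch of cutting leading/trailing silence with boundaries taken from an
# MFA alignment. The sample rate and boundary times are made-up illustrative values.
import numpy as np

sample_rate = 24000                                       # assumed processing sample rate
wav = np.random.randn(sample_rate * 3).astype("float32")  # stand-in for a CSMSC wav

speech_start, speech_end = 0.42, 2.71                     # first/last non-silence times (s)
trimmed = wav[int(speech_start * sample_rate): int(speech_end * sample_rate)]
print(len(wav), len(trimmed))
```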
### Preprocess the dataset
## Get Started
Assume the path to the dataset is `~/datasets/BZNSYP`.
Assume the path to the MFA result of CSMSC is `./baker_alignment_tone`.
Run the command below to preprocess the dataset.
Run the command below to
1. **source path**,
2. preprocess the dataset,
3. train the model,
4. synthesize wavs:
    - synthesize waveform from `metadata.jsonl`.
```bash
./run.sh
```
### Preprocess the dataset
```bash
./preprocess.sh
./local/preprocess.sh ${conf_path}
```
When it is done, a `dump` folder is created in the current directory. The structure of the dump folder is listed below.
......@@ -30,17 +39,15 @@ dump
├── raw
└── feats_stats.npy
```
The dataset is split into 3 parts, namely `train`, `dev` and `test`, each of which contains a `norm` and a `raw` subfolder. The `raw` folder contains the log magnitude of the mel spectrogram of each utterance, while the `norm` folder contains the normalized spectrograms. The statistics used to normalize the spectrograms are computed from the training set and are located in `dump/train/feats_stats.npy`.
Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains the id and the path to the spectrogram of each utterance.
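To illustrate what the `norm` folders contain, here is a hedged sketch of the normalization step; it assumes `feats_stats.npy` provides a per-dimension mean and standard deviation computed on the training set (the exact file layout may differ).
```python
# A hedged sketch of spectrogram normalization with training-set statistics.
# It assumes the stats are a per-dimension mean and standard deviation; the exact
# layout of dump/train/feats_stats.npy may differ.
import numpy as np

mel = np.random.randn(200, 80).astype("float32")   # a raw log-mel spectrogram (frames, bins)
mean = mel.mean(axis=0)                            # stand-ins for the values loaded from
std = mel.std(axis=0)                              # dump/train/feats_stats.npy
normalized = (mel - mean) / std                    # what the `norm` folders store
print(normalized.shape)
```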
## Train the model
`./run.sh` calls `../train.py`.
### Train the model
```bash
./run.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
```
`./local/train.sh` calls `${BIN_DIR}/train.py`.
Here's the complete help message.
```text
......@@ -86,25 +93,10 @@ benchmark:
4. `--device` is the type of the device to run the experiment, 'cpu' or 'gpu' are supported.
5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'.
## Pretrained Models
Pretrained models can be downloaded here:
1. Parallel WaveGAN checkpoint. [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip), which is used as a vocoder in the end-to-end inference script.
Parallel WaveGAN checkpoint contains files listed below.
```text
pwg_baker_ckpt_0.4
├── pwg_default.yaml # default config used to train parallel wavegan
├── pwg_snapshot_iter_400000.pdz # generator parameters of parallel wavegan
└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan
```
## Synthesize
`synthesize.sh` calls `../synthesize.py `, which can synthesize waveform from `metadata.jsonl`.
### Synthesize
`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
```bash
./synthesize.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
```
```text
usage: synthesize.py [-h] [--config CONFIG] [--checkpoint CHECKPOINT]
......@@ -127,10 +119,21 @@ optional arguments:
```
1. `--config` is the parallel wavegan config file. You should use the same config with which the model was trained.
2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory. If you use the pretrained model, use the `pwg_snapshot_iter_400000.pdz`.
2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory.
3. `--test-metadata` is the metadata of the test dataset. Use the `metadata.jsonl` in the `dev/norm` subfolder from the processed directory.
4. `--output-dir` is the directory to save the synthesized audio files.
5. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported.
## Pretrained Models
Pretrained models can be downloaded here [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip).
Parallel WaveGAN checkpoint contains files listed below.
```text
pwg_baker_ckpt_0.4
├── pwg_default.yaml # default config used to train parallel wavegan
├── pwg_snapshot_iter_400000.pdz # generator parameters of parallel wavegan
└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan
```
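As a sketch, you can also run the synthesis script directly against the extracted pretrained checkpoint instead of your own training output (paths below assume the zip was extracted in the current directory and preprocessing has already produced `dump/`):
```bash
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/synthesize.py \
    --config=pwg_baker_ckpt_0.4/pwg_default.yaml \
    --checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
    --test-metadata=dump/test/norm/metadata.jsonl \
    --output-dir=exp/pretrained_test
```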
## Acknowledgement
We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN.
......@@ -3,7 +3,7 @@
stage=0
stop_stage=100
export MAIN_ROOT=`realpath ${PWD}/../../../../`
config_path=$1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# get durations from MFA's result
......@@ -11,17 +11,18 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
--inputdir=./baker_alignment_tone \
--output=durations.txt \
--config=conf/default.yaml
--config=${config_path}
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# extract features
echo "Extract features ..."
python3 ../../preprocess.py \
python3 ${BIN_DIR}/../preprocess.py \
--rootdir=~/datasets/BZNSYP/ \
--dataset=baker \
--dumpdir=dump \
--dur-file=durations.txt \
--config=conf/default.yaml \
--config=${config_path} \
--cut-sil=True \
--num-cpu=20
fi
......@@ -38,16 +39,16 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# normalize, dev and test should use train's stats
echo "Normalize ..."
python3 ../../normalize.py \
python3 ${BIN_DIR}/../normalize.py \
--metadata=dump/train/raw/metadata.jsonl \
--dumpdir=dump/train/norm \
--stats=dump/train/feats_stats.npy
python3 ../../normalize.py \
python3 ${BIN_DIR}/../normalize.py \
--metadata=dump/dev/raw/metadata.jsonl \
--dumpdir=dump/dev/norm \
--stats=dump/train/feats_stats.npy
python3 ../../normalize.py \
python3 ${BIN_DIR}/../normalize.py \
--metadata=dump/test/raw/metadata.jsonl \
--dumpdir=dump/test/norm \
--stats=dump/train/feats_stats.npy
......
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ../synthesize.py \
--config=conf/default.yaml \
--checkpoint=exp/default/checkpoints/snapshot_iter_400000.pdz\
python3 ${BIN_DIR}/synthesize.py \
--config=${config_path} \
--checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
--test-metadata=dump/test/norm/metadata.jsonl \
--output-dir=exp/default/test
--output-dir=${train_output_path}/test
#!/bin/bash
config_path=$1
train_output_path=$2
FLAGS_cudnn_exhaustive_search=true \
FLAGS_conv_workspace_size_limit=4000 \
python ../train.py \
python ${BIN_DIR}/train.py \
--train-metadata=dump/train/norm/metadata.jsonl \
--dev-metadata=dump/dev/norm/metadata.jsonl \
--config=conf/default.yaml \
--output-dir=exp/default \
--config=${config_path} \
--output-dir=${train_output_path} \
--nprocs=1
#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=parallelwave_gan
export BIN_DIR=${MAIN_ROOT}/parakeet/exps/gan_vocoder/${MODEL}
\ No newline at end of file
#!/bin/bash
set -e
source path.sh
gpus=0,1
stage=0
stop_stage=100
conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_5000.pdz
# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this cannot be mixed with `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
./local/preprocess.sh ${conf_path} || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `train_output_path/checkpoints/` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
......@@ -2,5 +2,10 @@
# LJSpeech
* tts0 - Tacotron2
* tts1 - TransformerTTS
* voc0 - WaveFlow
* tts1 - TransformerTTS
* tts2 - SpeedySpeech
* tts3 - FastSpeech2
* voc0 - WaveFlow
* voc1 - Parallel WaveGAN
* voc2 - MelGAN
* voc3 - MultiBand MelGAN
# Tacotron2
# Tacotron2 with LJSpeech
PaddlePaddle dynamic graph implementation of Tacotron2, a neural network architecture for speech synthesis directly from text. The implementation is based on [Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions](https://arxiv.org/abs/1712.05884).
## Project Structure
```text
├── config.py # default configuration file
├── ljspeech.py # dataset and dataloader settings for LJSpeech
├── preprocess.py # script to preprocess LJSpeech dataset
├── synthesize.py # script to synthesize spectrogram from text
├── train.py # script for tacotron2 model training
├── synthesize.ipynb # notebook example for end-to-end TTS
```
## Dataset
We experiment with the LJSpeech dataset. Download and unzip [LJSpeech](https://keithito.com/LJ-Speech-Dataset/).
```bash
wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
tar xjvf LJSpeech-1.1.tar.bz2
```
Then you need to preprocess the data by running ``preprocess.py``; the preprocessed data will be placed in the ``--output`` directory.
## Get Started
Assume the path to the dataset is `~/datasets/LJSpeech-1.1`.
Run the command below to
1. **source path**.
2. preprocess the dataset,
3. train the model.
4. synthesize mels.
```bash
python preprocess.py \
--input=${DATAPATH} \
--output=${PREPROCESSEDDATAPATH} \
-v \
./run.sh
```
For more help on arguments
``python preprocess.py --help``.
## Train the model
Tacotron2 model can be trained by running ``train.py``.
### Preprocess the dataset
```bash
python train.py \
--data=${PREPROCESSEDDATAPATH} \
--output=${OUTPUTPATH} \
--device=gpu \
./local/preprocess.sh ${conf_path}
```
### Train the model
`./local/train.sh` calls `${BIN_DIR}/train.py`.
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
```
Here's the complete help message.
```text
usage: train.py [-h] [--config FILE] [--data DATA_DIR] [--output OUTPUT_DIR]
[--checkpoint_path CHECKPOINT_PATH] [--device {cpu,gpu}]
[--nprocs NPROCS] [--opts ...]
optional arguments:
-h, --help show this help message and exit
--config FILE path of the config file to overwrite to default config
with.
--data DATA_DIR path to the datatset.
--output OUTPUT_DIR path to save checkpoint and logs.
--checkpoint_path CHECKPOINT_PATH
path of the checkpoint to load
--device {cpu,gpu} device type to use, cpu and gpu are supported.
--nprocs NPROCS number of parallel processes to use.
--opts ... options to overwrite --config file and the default
config, passing in KEY VALUE pairs
```
If you want to train on CPU, just set ``--device=cpu``.
If you want to train on multiple GPUs, just set ``--nprocs`` as num of GPU.
By default, training will be resumed from the latest checkpoint in ``--output``, if you want to start a new training, please use a new ``${OUTPUTPATH}`` with no checkpoint. And if you want to resume from an other existing model, you should set ``checkpoint_path`` to be the checkpoint path you want to load.
By default, training will be resumed from the latest checkpoint in ``--output``. If you want to start a new training, please use a new ``${OUTPUTPATH}`` with no checkpoint.
And if you want to resume from another existing model, you should set ``checkpoint_path`` to the checkpoint path you want to load.
**Note: The checkpoint path cannot contain the file extension.**
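For example, a minimal sketch of resuming from a specific checkpoint (`${preprocess_path}` and `${train_output_path}` as in `run.sh`; the checkpoint name is illustrative and, as noted above, has no file extension):
```bash
python3 ${BIN_DIR}/train.py \
    --data=${preprocess_path} \
    --output=${train_output_path} \
    --checkpoint_path=${train_output_path}/checkpoints/step-35000 \
    --device=gpu
```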
For more help on arguments
``python train_transformer.py --help``.
## Synthesize
After training the Tacotron2, spectrograms can be synthesized by running ``synthesize.py``.
### Synthesize
`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which synthesizes **mels** from the text list here.
```bash
python synthesize.py \
--config=${CONFIGPATH} \
--checkpoint_path=${CHECKPOINTPATH} \
--input=${TEXTPATH} \
--output=${OUTPUTPATH}
--device=gpu
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${train_output_path} ${ckpt_name}
```
The ``${CONFIGPATH}`` needs to be matched with ``${CHECKPOINTPATH}``.
For more help on arguments
``python synthesize.py --help``.
Then you can find the spectrogram files in ``${OUTPUTPATH}``, and they can be used as input to a vocoder like [waveflow](../waveflow/README.md#Synthesis) to get audio files.
```text
usage: synthesize.py [-h] [--config FILE] [--checkpoint_path CHECKPOINT_PATH]
[--input INPUT] [--output OUTPUT] [--device DEVICE]
[--opts ...] [-v]
generate mel spectrogram with TransformerTTS.
optional arguments:
-h, --help show this help message and exit
--config FILE extra config to overwrite the default config
--checkpoint_path CHECKPOINT_PATH
path of the checkpoint to load.
--input INPUT path of the text sentences
--output OUTPUT path to save outputs
--device DEVICE device type to use.
--opts ... options to overwrite --config file and the default
config, passing in KEY VALUE pairs
-v, --verbose print msg
```
**P.S.** You can use [waveflow](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/waveflow) as the neural vocoder to synthesize mels into wavs. (Please refer to `synthesize.sh` in our LJSpeech waveflow example.)
## Pretrained Models
Pretrained Models can be downloaded from links below. We provide 2 models with different configurations.
1. This model use a binary classifier to predict the stop token. [tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3.zip)
2. This model does not have a stop token predictor. It uses the attention peak position to decide whether all the contents have been uttered. Also, guided attention loss is used to speed up training. This model is trained with `configs/alternative.yaml`. [tacotron2_ljspeech_ckpt_0.3_alternative.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3_alternative.zip)
## Notebook: End-to-end TTS
See [synthesize.ipynb](./synthesize.ipynb) for details about end-to-end TTS with tacotron2 and waveflow.
#!/bin/bash
preprocess_path=$1
python3 ${BIN_DIR}/preprocess.py \
--input=~/datasets/LJSpeech-1.1 \
--output=${preprocess_path} \
-v \
\ No newline at end of file
#!/bin/bash
train_output_path=$1
ckpt_name=$2
python3 ${BIN_DIR}/synthesize.py \
--config=${train_output_path}/config.yaml \
--checkpoint_path=${train_output_path}/checkpoints/${ckpt_name} \
--input=${BIN_DIR}/../sentences_en.txt \
--output=${train_output_path}/test \
--device=gpu
\ No newline at end of file
#!/bin/bash
preprocess_path=$1
train_output_path=$2
python3 ${BIN_DIR}/train.py \
--data=${preprocess_path} \
--output=${train_output_path} \
--device=gpu \
\ No newline at end of file
#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=tacotron2
export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL}
#!/bin/bash
set -e
source path.sh
gpus=0
stage=0
stop_stage=100
preprocess_path=preprocessed_ljspeech
train_output_path=output
ckpt_name=step-35000
# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this cannot be mixed with `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
./local/preprocess.sh ${preprocess_path} || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `train_output_path/checkpoints/` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_path} || exit -1
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# train model, all `ckpt` under `train_output_path/checkpoints/` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${train_output_path} ${ckpt_name} || exit -1
fi
......@@ -8,12 +8,21 @@ wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
```bash
tar xjvf LJSpeech-1.1.tar.bz2
```
### Preprocess the dataset
## Get Started
Assume the path to the dataset is `~/datasets/LJSpeech-1.1`.
Run the command below to preprocess the dataset.
Run the command below to
1. **source path**.
2. preprocess the dataset,
3. train the model.
4. synthesize wavs.
- synthesize waveform from `metadata.jsonl`.
- synthesize waveform from text file.
```bash
./run.sh
```
### Preprocess the dataset
```bash
./preprocess.sh.
./local/preprocess.sh ${conf_path}
```
When it is done, a `dump` folder is created in the current directory. The structure of the dump folder is listed below.
```text
......@@ -35,10 +44,10 @@ The dataset is split into 3 parts, namely `train`, `dev` and` test`, each of whi
There is also a `metadata.jsonl` in each subfolder. It is a table-like file which contains the phones, text_lengths, speech_lengths, path of the speech features, speaker, and id of each utterance.
## Train the model
`./run.sh` calls `../train.py`.
### Train the model
`./local/train.sh` calls `${BIN_DIR}/train.py`.
```bash
./run.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
```
Here's the complete help message.
```text
......@@ -71,17 +80,6 @@ optional arguments:
5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'.
6. `--phones-dict` is the path of the phone vocabulary file.
## Pretrained Model
Pretrained Model can be downloaded here. [transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_ckpt_0.4.zip)
TransformerTTS checkpoint contains files listed below.
```text
transformer_tts_ljspeech_ckpt_0.4
├── default.yaml # default config used to train transformer_tts
├── phone_id_map.txt # phone vocabulary file when training transformer_tts
├── snapshot_iter_201500.pdz # model parameters and optimizer states
└── speech_stats.npy # statistics used to normalize spectrogram when training transformer_tts
```
## Synthesize
We use [waveflow](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/waveflow) as the neural vocoder.
Download the pretrained WaveFlow model with a residual channel size of 128 from [waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip) and unzip it.
......@@ -94,9 +92,9 @@ waveflow_ljspeech_ckpt_0.3
├── config.yaml # default config used to train waveflow
└── step-2000000.pdparams # model parameters of waveflow
```
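A minimal sketch of fetching and unpacking the pretrained vocoder (assuming `wget` and `unzip` are available):
```bash
wget https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip
unzip waveflow_ljspeech_ckpt_0.3.zip
```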
`synthesize.sh` calls `../synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
```bash
./synthesize.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
```
```text
usage: synthesize.py [-h] [--transformer-tts-config TRANSFORMER_TTS_CONFIG]
......@@ -132,9 +130,9 @@ optional arguments:
--device DEVICE device type to use.
--verbose VERBOSE verbose.
```
`synthesize_e2e.sh` calls `synthesize_e2e.py`, which can synthesize waveform from text file.
`./local/synthesize_e2e.sh` calls `${BIN_DIR}/synthesize_e2e.py`, which can synthesize waveform from text file.
```bash
./synthesize_e2e.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name}
```
```text
usage: synthesize_e2e.py [-h]
......@@ -177,17 +175,30 @@ optional arguments:
5. `--output-dir` is the directory to save synthesized audio files.
6. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis.
You can use the following scripts to synthesize for `../sentences.txt` using pretrained transformer_tts and waveflow models.
## Pretrained Model
Pretrained Model can be downloaded here. [transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_ckpt_0.4.zip)
TransformerTTS checkpoint contains files listed below.
```text
transformer_tts_ljspeech_ckpt_0.4
├── default.yaml # default config used to train transformer_tts
├── phone_id_map.txt # phone vocabulary file when training transformer_tts
├── snapshot_iter_201500.pdz # model parameters and optimizer states
└── speech_stats.npy # statistics used to normalize spectrogram when training transformer_tts
```
You can use the following scripts to synthesize for `${BIN_DIR}/../sentences_en.txt` using pretrained transformer_tts and waveflow models.
```bash
source path.sh
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 synthesize_e2e.py \
python3 ${BIN_DIR}/synthesize_e2e.py \
--transformer-tts-config=transformer_tts_ljspeech_ckpt_0.4/default.yaml \
--transformer-tts-checkpoint=transformer_tts_ljspeech_ckpt_0.4/snapshot_iter_201500.pdz \
--transformer-tts-stat=transformer_tts_ljspeech_ckpt_0.4/speech_stats.npy \
--waveflow-config=waveflow_ljspeech_ckpt_0.3/config.yaml \
--waveflow-checkpoint=waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams \
--text=../sentences.txt \
--text=${BIN_DIR}/../sentences_en.txt \
--output-dir=exp/default/test_e2e \
--device="gpu" \
--phones-dict=transformer_tts_ljspeech_ckpt_0.4/phone_id_map.txt
......
......@@ -3,12 +3,12 @@
stage=1
stop_stage=100
export MAIN_ROOT=`realpath ${PWD}/../../../`
config_path=$1
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# extract features
echo "Extract features ..."
python3 ../preprocess.py \
python3 ${BIN_DIR}/preprocess.py \
--dataset=ljspeech \
--rootdir=~/datasets/LJSpeech-1.1/ \
--dumpdir=dump \
......@@ -27,21 +27,21 @@ fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# normalize and convert phone to id, dev and test should use train's stats
echo "Normalize ..."
python3 ../normalize.py \
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/train/raw/metadata.jsonl \
--dumpdir=dump/train/norm \
--speech-stats=dump/train/speech_stats.npy \
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt
python3 ../normalize.py \
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/dev/raw/metadata.jsonl \
--dumpdir=dump/dev/norm \
--speech-stats=dump/train/speech_stats.npy \
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt
python3 ../normalize.py \
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/test/raw/metadata.jsonl \
--dumpdir=dump/test/norm \
--speech-stats=dump/train/speech_stats.npy \
......
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ../synthesize.py \
--transformer-tts-config=conf/default.yaml \
--transformer-tts-checkpoint=exp/default/checkpoints/snapshot_iter_201500.pdz \
python3 ${BIN_DIR}/synthesize.py \
--transformer-tts-config=${config_path} \
--transformer-tts-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
--transformer-tts-stat=dump/train/speech_stats.npy \
--waveflow-config=waveflow_ljspeech_ckpt_0.3/config.yaml \
--waveflow-checkpoint=waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams \
--test-metadata=dump/test/norm/metadata.jsonl \
--output-dir=exp/default/test \
--output-dir=${train_output_path}/test \
--device="gpu" \
--phones-dict=dump/phone_id_map.txt
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 synthesize_e2e.py \
--transformer-tts-config=conf/default.yaml \
--transformer-tts-checkpoint=exp/default/checkpoints/snapshot_iter_201500.pdz \
python3 ${BIN_DIR}/synthesize_e2e.py \
--transformer-tts-config=${config_path} \
--transformer-tts-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
--transformer-tts-stat=dump/train/speech_stats.npy \
--waveflow-config=waveflow_ljspeech_ckpt_0.3/config.yaml \
--waveflow-checkpoint=waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams \
--text=../sentences.txt \
--output-dir=exp/default/test_e2e \
--text=${BIN_DIR}/../sentences_en.txt \
--output-dir=${train_output_path}/test_e2e \
--device="gpu" \
--phones-dict=dump/phone_id_map.txt
#!/bin/bash
python3 ../train.py \
config_path=$1
train_output_path=$2
python3 ${BIN_DIR}/train.py \
--train-metadata=dump/train/norm/metadata.jsonl \
--dev-metadata=dump/dev/norm/metadata.jsonl \
--config=conf/default.yaml \
--output-dir=exp/default \
--config=${config_path} \
--output-dir=${train_output_path} \
--nprocs=2 \
--phones-dict=dump/phone_id_map.txt
001 Life was like a box of chocolates, you never know what you're gonna get.
002 With great power there must come great responsibility.
003 To be or not to be, that’s a question.
004 A man can be destroyed but not defeated
005 Do not, for one repulse, give up the purpose that you resolved to effort.
006 Death is just a part of life, something we're all destined to do.
007 I think it's hard winning a war with words.
008 Don’t argue with the people of strong determination, because they may change the fact!
009 Love you three thousand times.
\ No newline at end of file
#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=transformer_tts
export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL}
#!/bin/bash
set -e
source path.sh
gpus=0,1
stage=0
stop_stage=100
conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_403.pdz
# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this cannot be mixed with `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
./local/preprocess.sh ${conf_path} || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `train_output_path/checkpoints/` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize, vocoder is pwgan
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# synthesize_e2e, vocoder is pwgan
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
......@@ -9,13 +9,22 @@ Download LJSpeech-1.1 from the [official website](https://keithito.com/LJ-Speech
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for fastspeech2.
You can download them from here: [ljspeech_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/ljspeech_alignment.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/use_mfa) in our repo.
### Preprocess the dataset
## Get Started
Assume the path to the dataset is `~/datasets/LJSpeech-1.1`.
Assume the path to the MFA result of LJSpeech-1.1 is `./ljspeech_alignment`.
Run the command below to preprocess the dataset.
Run the command below to
1. **source path**.
2. preprocess the dataset,
3. train the model.
4. synthesize wavs.
- synthesize waveform from `metadata.jsonl`.
- synthesize waveform from text file.
```bash
./run.sh
```
### Preprocess the dataset
```bash
./preprocess.sh
./local/preprocess.sh ${conf_path}
```
When it is done, a `dump` folder is created in the current directory. The structure of the dump folder is listed below.
......@@ -40,10 +49,10 @@ The dataset is split into 3 parts, namely `train`, `dev` and` test`, each of whi
There is also a `metadata.jsonl` in each subfolder. It is a table-like file which contains the phones, text_lengths, speech_lengths, durations, path of the speech features, path of the pitch features, path of the energy features, speaker, and id of each utterance.
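A quick sanity check of the three splits after preprocessing (a sketch; each line in `metadata.jsonl` corresponds to one utterance):
```bash
wc -l dump/train/norm/metadata.jsonl dump/dev/norm/metadata.jsonl dump/test/norm/metadata.jsonl
```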
## Train the model
`./run.sh` calls `../train.py`.
### Train the model
`./local/train.sh` calls `${BIN_DIR}/train.py`.
```bash
./run.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
```
Here's the complete help message.
```text
......@@ -78,18 +87,7 @@ optional arguments:
5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'.
6. `--phones-dict` is the path of the phone vocabulary file.
## Pretrained Model
Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_ljspeech_ckpt_0.5.zip)
FastSpeech2 checkpoint contains files listed below.
```text
fastspeech2_nosil_ljspeech_ckpt_0.5
├── default.yaml # default config used to train fastspeech2
├── phone_id_map.txt # phone vocabulary file when training fastspeech2
├── snapshot_iter_100000.pdz # model parameters and optimizer states
└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2
```
## Synthesize
### Synthesize
We use [parallel wavegan](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/parallelwave_gan/ljspeech/) as the neural vocoder.
Download pretrained parallel wavegan model from [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip) and unzip it.
```bash
......@@ -102,9 +100,9 @@ pwg_ljspeech_ckpt_0.5
├── pwg_snapshot_iter_400000.pdz # generator parameters of parallel wavegan
└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan
```
`synthesize.sh` calls `../synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
```bash
./synthesize.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
```
```text
usage: synthesize.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG]
......@@ -144,19 +142,19 @@ optional arguments:
--device DEVICE device type to use.
--verbose VERBOSE verbose.
```
`synthesize_e2e.sh` calls `synthesize_e2e.py`, which can synthesize waveform from text file.
`./local/synthesize_e2e.sh` calls `${BIN_DIR}/synthesize_e2e_en.py`, which can synthesize waveform from text file.
```bash
./synthesize_e2e.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name}
```
```text
usage: synthesize_e2e.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG]
[--fastspeech2-checkpoint FASTSPEECH2_CHECKPOINT]
[--fastspeech2-stat FASTSPEECH2_STAT]
[--pwg-config PWG_CONFIG]
[--pwg-checkpoint PWG_CHECKPOINT]
[--pwg-stat PWG_STAT] [--phones-dict PHONES_DICT]
[--text TEXT] [--output-dir OUTPUT_DIR]
[--device DEVICE] [--verbose VERBOSE]
usage: synthesize_e2e_en.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG]
[--fastspeech2-checkpoint FASTSPEECH2_CHECKPOINT]
[--fastspeech2-stat FASTSPEECH2_STAT]
[--pwg-config PWG_CONFIG]
[--pwg-checkpoint PWG_CHECKPOINT]
[--pwg-stat PWG_STAT] [--phones-dict PHONES_DICT]
[--text TEXT] [--output-dir OUTPUT_DIR]
[--device DEVICE] [--verbose VERBOSE]
Synthesize with fastspeech2 & parallel wavegan.
......@@ -191,18 +189,31 @@ optional arguments:
5. `--output-dir` is the directory to save synthesized audio files.
6. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis.
You can use the following scripts to synthesize for `../sentences_en.txt` using pretrained fastspeech2 and parallel wavegan models.
## Pretrained Model
Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_ljspeech_ckpt_0.5.zip)
FastSpeech2 checkpoint contains files listed below.
```text
fastspeech2_nosil_ljspeech_ckpt_0.5
├── default.yaml # default config used to train fastspeech2
├── phone_id_map.txt # phone vocabulary file when training fastspeech2
├── snapshot_iter_100000.pdz # model parameters and optimizer states
└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2
```
You can use the following scripts to synthesize for `${BIN_DIR}/../sentences_en.txt` using pretrained fastspeech2 and parallel wavegan models.
```bash
source path.sh
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 synthesize_e2e.py \
python3 ${BIN_DIR}/synthesize_e2e_en.py \
--fastspeech2-config=fastspeech2_nosil_ljspeech_ckpt_0.5/default.yaml \
--fastspeech2-checkpoint=fastspeech2_nosil_ljspeech_ckpt_0.5/snapshot_iter_100000.pdz \
--fastspeech2-stat=fastspeech2_nosil_ljspeech_ckpt_0.5/speech_stats.npy \
--pwg-config=pwg_ljspeech_ckpt_0.5/pwg_default.yaml \
--pwg-checkpoint=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \
--pwg-stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \
--text=../sentences_en.txt \
--text=${BIN_DIR}/../sentences_en.txt \
--output-dir=exp/default/test_e2e \
--device="gpu" \
--phones-dict=fastspeech2_nosil_ljspeech_ckpt_0.5/phone_id_map.txt
......
......@@ -3,7 +3,7 @@
stage=0
stop_stage=100
export MAIN_ROOT=`realpath ${PWD}/../../../`
config_path=$1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# get durations from MFA's result
......@@ -11,18 +11,18 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
--inputdir=./ljspeech_alignment \
--output=durations.txt \
--config=conf/default.yaml
--config=${config_path}
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# extract features
echo "Extract features ..."
python3 ../preprocess.py \
python3 ${BIN_DIR}/preprocess.py \
--dataset=ljspeech \
--rootdir=~/datasets/LJSpeech-1.1/ \
--dumpdir=dump \
--dur-file=durations.txt \
--config=conf/default.yaml \
--config=${config_path} \
--num-cpu=8 \
--cut-sil=True
fi
......@@ -46,7 +46,7 @@ fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# normalize and covert phone/speaker to id, dev and test should use train's stats
echo "Normalize ..."
python3 ../normalize.py \
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/train/raw/metadata.jsonl \
--dumpdir=dump/train/norm \
--speech-stats=dump/train/speech_stats.npy \
......@@ -55,7 +55,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt
python3 ../normalize.py \
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/dev/raw/metadata.jsonl \
--dumpdir=dump/dev/norm \
--speech-stats=dump/train/speech_stats.npy \
......@@ -64,7 +64,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt
python3 ../normalize.py \
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/test/raw/metadata.jsonl \
--dumpdir=dump/test/norm \
--speech-stats=dump/train/speech_stats.npy \
......
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ../synthesize.py \
--fastspeech2-config=conf/default.yaml \
--fastspeech2-checkpoint=exp/default/checkpoints/snapshot_iter_100000.pdz \
python3 ${BIN_DIR}/synthesize.py \
--fastspeech2-config=${config_path} \
--fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
--fastspeech2-stat=dump/train/speech_stats.npy \
--pwg-config=pwg_ljspeech_ckpt_0.5/pwg_default.yaml \
--pwg-checkpoint=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \
--pwg-stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \
--test-metadata=dump/test/norm/metadata.jsonl \
--output-dir=exp/default/test \
--output-dir=${train_output_path}/test \
--device="gpu" \
--phones-dict=dump/phone_id_map.txt
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 synthesize_e2e.py \
--fastspeech2-config=conf/default.yaml \
--fastspeech2-checkpoint=exp/default/checkpoints/snapshot_iter_100000.pdz \
python3 ${BIN_DIR}/synthesize_e2e_en.py \
--fastspeech2-config=${config_path} \
--fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
--fastspeech2-stat=dump/train/speech_stats.npy \
--pwg-config=pwg_ljspeech_ckpt_0.5/pwg_default.yaml \
--pwg-checkpoint=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \
--pwg-stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \
--text=../sentences_en.txt \
--output-dir=exp/default/test_e2e \
--text=${BIN_DIR}/../sentences_en.txt \
--output-dir=${train_output_path}/test_e2e \
--device="gpu" \
--phones-dict=dump/phone_id_map.txt
#!/bin/bash
python3 ../train.py \
config_path=$1
train_output_path=$2
python3 ${BIN_DIR}/train.py \
--train-metadata=dump/train/norm/metadata.jsonl \
--dev-metadata=dump/dev/norm/metadata.jsonl \
--config=conf/default.yaml \
--output-dir=exp/default \
--config=${config_path} \
--output-dir=${train_output_path} \
--nprocs=1 \
--phones-dict=dump/phone_id_map.txt
#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=fastspeech2
export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL}
#!/bin/bash
set -e
source path.sh
gpus=0,1
stage=0
stop_stage=100
conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_201.pdz
# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this cannot be mixed with `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
./local/preprocess.sh ${conf_path} || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `train_output_path/checkpoints/` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize, vocoder is pwgan
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# synthesize_e2e, vocoder is pwgan
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
# WaveFlow with LJSpeech
## Dataset
### Download the dataset.
```bash
wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
```
### Extract the dataset.
```bash
tar xjvf LJSpeech-1.1.tar.bz2
```
## Get Started
Assume the path to the dataset is `~/datasets/LJSpeech-1.1`.
Assume the path to the Tacotron2 generated mels is `../tts0/output/test`.
Run the command below to
1. **source path**.
2. preprocess the dataset,
3. train the model.
4. synthesize wavs from mels.
```bash
./run.sh
```
### Preprocess the dataset.
Assume the path to save the preprocessed dataset is `ljspeech_waveflow`. Run the command below to preprocess the dataset.
```bash
python preprocess.py --input=LJSpeech-1.1/ --output=ljspeech_waveflow
./local/preprocess.sh ${preprocess_path}
```
## Train the model
The training script requires 4 command line arguments.
`--data` is the path of the training dataset, `--output` is the path of the output directory (we recommend to use a subdirectory in `runs` to manage different experiments.)
`--device` should be "cpu" or "gpu", `--nprocs` is the number of processes to train the model in parallel.
### Train the model
`./local/train.sh` calls `${BIN_DIR}/train.py`.
```bash
python train.py --data=ljspeech_waveflow/ --output=runs/test --device="gpu" --nprocs=1
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_path}
```
The training script requires 4 command line arguments.
1. `--data` is the path of the training dataset.
2. `--output` is the path of the output directory.
3. `--device` should be "cpu" or "gpu"
4. `--nprocs` is the number of processes to train the model in parallel.
If you want distributed training, set a larger `--nprocs` (e.g. 4). Note that distributed training with cpu is not supported yet.
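A sketch of distributed training on four GPUs (GPU ids are illustrative; `${preprocess_path}` and `${train_output_path}` as in `run.sh`):
```bash
CUDA_VISIBLE_DEVICES=0,1,2,3 python3 ${BIN_DIR}/train.py \
    --data=${preprocess_path} \
    --output=${train_output_path} \
    --device="gpu" \
    --nprocs=4
```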
## Synthesize
Synthesize waveform. We assume the `--input` is a directory containing several mel spectrograms(log magnitude) in `.npy` format. The output would be saved in `--output` directory, containing several `.wav` files, each with the same name as the mel spectrogram does.
`--checkpoint_path` should be the path of the parameter file (`.pdparams`) to load. Note that the extention name `.pdparmas` is not included here.
`--device` specifies to device to run synthesis on.
### Synthesize
`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from mels.
```bash
python synthesize.py --input=mels/ --output=wavs/ --checkpoint_path='step-2000000' --device="gpu" --verbose
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${input_mel_path} ${train_output_path} ${ckpt_name}
```
## Pretrained Model
Synthesize waveform.
1. We assume the `--input` is a directory containing several mel spectrograms (log magnitude) in `.npy` format.
2. The output would be saved in the `--output` directory, containing several `.wav` files, each with the same name as the corresponding mel spectrogram.
3. `--checkpoint_path` should be the path of the parameter file (`.pdparams`) to load. Note that the extension `.pdparams` is not included here.
4. `--device` specifies the device to run synthesis on.
## Pretrained Model
A pretrained model with a residual channel size of 128 can be downloaded here: [waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip).
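A sketch of synthesizing with the pretrained checkpoint after unzipping it (the mel directory is illustrative; note that `--checkpoint_path` omits the `.pdparams` extension):
```bash
python3 ${BIN_DIR}/synthesize.py \
    --input=../tts0/output/test \
    --output=wavs/ \
    --checkpoint_path=waveflow_ljspeech_ckpt_0.3/step-2000000 \
    --device="gpu" \
    --verbose
```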
#!/bin/bash
preprocess_path=$1
python3 ${BIN_DIR}/preprocess.py \
--input=~/datasets/LJSpeech-1.1 \
--output=${preprocess_path}
\ No newline at end of file
#!/bin/bash
input_mel_path=$1
train_output_path=$2
ckpt_name=$3
python ${BIN_DIR}/synthesize.py \
--input=${input_mel_path} \
--output=${train_output_path}/wavs/ \
--checkpoint_path=${train_output_path}/checkpoints/${ckpt_name} \
--device="gpu" \
--verbose
\ No newline at end of file
#!/bin/bash
preprocess_path=$1
train_output_path=$2
python3 ${BIN_DIR}/train.py \
--data=${preprocess_path} \
--output=${train_output_path} \
--device="gpu" \
--nprocs=1
\ No newline at end of file
#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=waveflow
export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL}
\ No newline at end of file
#!/bin/bash
set -e
source path.sh
gpus=0,1
stage=0
stop_stage=100
preprocess_path=preprocessed_ljspeech
train_output_path=output
# mel generated by Tacotron2
input_mel_path=../tts0/output/test
ckpt_name=step-10000
# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this cannot be mixed with `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
./local/preprocess.sh ${preprocess_path} || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_path} || exit -1
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${input_mel_path} ${train_output_path} ${ckpt_name} || exit -1
fi
# Parallel WaveGAN with the LJSpeech-1.1 dataset
# Parallel WaveGAN with the LJSpeech-1.1
This example contains code used to train a [parallel wavegan](http://arxiv.org/abs/1910.11480) model with [LJSpeech-1.1](https://keithito.com/LJ-Speech-Dataset/).
## Preprocess the dataset
## Dataset
### Download and Extract the dataset
Download LJSpeech-1.1 from the [official website](https://keithito.com/LJ-Speech-Dataset/).
### Get MFA results for silence trim
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut the silence at the edges of audio.
You can download them from here: [ljspeech_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/ljspeech_alignment.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/use_mfa) in our repo.
### Preprocess the dataset
## Get Started
Assume the path to the dataset is `~/datasets/LJSpeech-1.1`.
Assume the path to the MFA result of LJSpeech-1.1 is `./ljspeech_alignment`.
Run the command below to preprocess the dataset.
Run the command below to
1. **source path**.
2. preprocess the dataset,
3. train the model.
4. synthesize wavs.
- synthesize waveform from `metadata.jsonl`.
```bash
./run.sh
```
### Preprocess the dataset
```bash
./preprocess.sh
./local/preprocess.sh ${conf_path}
```
When it is done, a `dump` folder is created in the current directory. The structure of the dump folder is listed below.
......@@ -38,10 +44,10 @@ The dataset is split into 3 parts, namely `train`, `dev` and `test`, each of whi
There is also a `metadata.jsonl` in each subfolder. It is a table-like file which contains the id and the path to the spectrogram of each utterance.
## Train the model
`./run.sh` calls `../train.py`.
### Train the model
`./local/train.sh` calls `${BIN_DIR}/train.py`.
```bash
./run.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
```
Here's the complete help message.
......@@ -88,23 +94,10 @@ benchmark:
4. `--device` is the type of the device to run the experiment, 'cpu' or 'gpu' are supported.
5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'.
## Pretrained Models
Pretrained models can be downloaded here:
1. Parallel WaveGAN checkpoint. [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip), which is used as a vocoder in the end-to-end inference script.
Parallel WaveGAN checkpoint contains files listed below.
```text
pwg_ljspeech_ckpt_0.5
├── pwg_default.yaml # default config used to train parallel wavegan
├── pwg_snapshot_iter_400000.pdz # generator parameters of parallel wavegan
└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan
```
## Synthesize
`synthesize.sh` calls `../synthesize.py `, which can synthesize waveform from `metadata.jsonl`.
### Synthesize
`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
```bash
./synthesize.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
```
```text
usage: synthesize.py [-h] [--config CONFIG] [--checkpoint CHECKPOINT]
......@@ -127,10 +120,21 @@ optional arguments:
```
1. `--config` is the parallel wavegan config file. You should use the same config with which the model was trained.
2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory. If you use the pretrained model, use the `pwg_snapshot_iter_400000.pdz`.
2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory.
3. `--test-metadata` is the metadata of the test dataset. Use the `metadata.jsonl` in the `dev/norm` subfolder from the processed directory.
4. `--output-dir` is the directory to save the synthesized audio files.
5. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported.
## Pretrained Models
Pretrained models can be downloaded here. [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip)
Parallel WaveGAN checkpoint contains files listed below.
```text
pwg_ljspeech_ckpt_0.5
├── pwg_default.yaml # default config used to train parallel wavegan
├── pwg_snapshot_iter_400000.pdz # generator parameters of parallel wavegan
└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan
```
## Acknowledgement
We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN.
......@@ -3,7 +3,7 @@
stage=0
stop_stage=100
export MAIN_ROOT=`realpath ${PWD}/../../../../`
config_path=$1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# get durations from MFA's result
......@@ -11,18 +11,18 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
--inputdir=./ljspeech_alignment \
--output=durations.txt \
--config=conf/default.yaml
--config=${config_path}
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# extract features
echo "Extract features ..."
python3 ../../preprocess.py \
python3 ${BIN_DIR}/../preprocess.py \
--rootdir=~/datasets/LJSpeech-1.1/ \
--dataset=ljspeech \
--dumpdir=dump \
--dur-file=durations.txt \
--config=conf/default.yaml \
--config=${config_path} \
--cut-sil=True \
--num-cpu=20
fi
......@@ -39,16 +39,16 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# normalize, dev and test should use train's stats
echo "Normalize ..."
python3 ../../normalize.py \
python3 ${BIN_DIR}/../normalize.py \
--metadata=dump/train/raw/metadata.jsonl \
--dumpdir=dump/train/norm \
--stats=dump/train/feats_stats.npy
python3 ../../normalize.py \
python3 ${BIN_DIR}/../normalize.py \
--metadata=dump/dev/raw/metadata.jsonl \
--dumpdir=dump/dev/norm \
--stats=dump/train/feats_stats.npy
python3 ../../normalize.py \
python3 ${BIN_DIR}/../normalize.py \
--metadata=dump/test/raw/metadata.jsonl \
--dumpdir=dump/test/norm \
--stats=dump/train/feats_stats.npy
......
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ../synthesize.py \
--config=conf/default.yaml \
--checkpoint=exp/default/checkpoints/snapshot_iter_400000.pdz\
python3 ${BIN_DIR}/synthesize.py \
--config=${config_path} \
--checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
--test-metadata=dump/test/norm/metadata.jsonl \
--output-dir=exp/default/test
--output-dir=${train_output_path}/test
#!/bin/bash
config_path=$1
train_output_path=$2
FLAGS_cudnn_exhaustive_search=true \
FLAGS_conv_workspace_size_limit=4000 \
python ../train.py \
python ${BIN_DIR}/train.py \
--train-metadata=dump/train/norm/metadata.jsonl \
--dev-metadata=dump/dev/norm/metadata.jsonl \
--config=conf/default.yaml \
--output-dir=exp/default \
--config=${config_path} \
--output-dir=${train_output_path} \
--nprocs=1
#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=parallelwave_gan
export BIN_DIR=${MAIN_ROOT}/parakeet/exps/gan_vocoder/${MODEL}
\ No newline at end of file
#!/bin/bash
set -e
source path.sh
gpus=0,1
stage=0
stop_stage=100
conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_5000.pdz
# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this cannot be mixed with `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
./local/preprocess.sh ${conf_path} || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `train_output_path/checkpoints/` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
# Speaker Encoder
This experiment trains a speaker encoder with speaker verification as its task. It is done as part of the transfer-learning experiment from speaker verification to multispeaker text-to-speech synthesis, which can be found at [tacotron2_aishell3](../tacotron2_shell3). The trained speaker encoder is used to extract utterance embeddings from utterances.
## Model
The model used in this experiment is the speaker encoder with text independent speaker verification task in [GENERALIZED END-TO-END LOSS FOR SPEAKER VERIFICATION](https://arxiv.org/pdf/1710.10467.pdf). GE2E-softmax loss is used.
## File Structure
```text
ge2e
├── README.md
├── README_cn.md
├── audio_processor.py
├── config.py
├── dataset_processors.py
├── inference.py
├── preprocess.py
├── random_cycle.py
├── speaker_verification_dataset.py
└── train.py
```
## Download Datasets
Currently supported datasets are Librispeech-other-500, VoxCeleb, VoxCeleb2, aidatatang-200zh and magicdata, which can be downloaded from their corresponding webpages.
1. Librispeech/train-other-500
An English multispeaker dataset, [URL](https://www.openslr.org/resources/12/train-other-500.tar.gz); only the `train-other-500` subset is used.
2. VoxCeleb1
An English multispeaker dataset, [URL](https://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1.html); Audio Files from Dev A to Dev D should be downloaded, combined and extracted.
3. VoxCeleb2
An English multispeaker dataset, [URL](https://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1.html); Audio Files from Dev A to Dev H should be downloaded, combined and extracted.
4. Aidatatang-200zh
A Mandarin Chinese multispeaker dataset, [URL](https://www.openslr.org/62/).
5. magicdata
A Mandarin Chinese multispeaker dataset, [URL](https://www.openslr.org/68/).
If you want to use other datasets, you can also download and preprocess them, as long as they meet the requirements described below.
## Preprocess Datasets
## Get Started
```bash
./run.sh
```
### Preprocess Datasets
`./local/preprocess.sh` calls `${BIN_DIR}/preprocess.py`.
```bash
./local/preprocess.sh ${datasets_root} ${preprocess_path} ${dataset_names}
```
Assume datasets_root is `~/datasets/GE2E`, and it has the following structure (we only use `train-other-500` for simplicity):
```Text
GE2E
├── LibriSpeech
└── (other datasets)
```
Multispeaker datasets are used as training data, though the transcriptions are not used. To enlarge the amount of data used for training, several multispeaker datasets are combined. The preprocessed datasets are organized in the file structure described below. The mel spectrogram of each utterance is saved in `.npy` format. The dataset is 2-stratified (speaker-utterance). Since multiple datasets are combined, to avoid conflicts in speaker ids, the dataset name is prepended to the speaker ids.
```text
dataset_root
├── dataset01_speaker01/
   ├── utterance01.npy
   ├── utterance02.npy
   └── utterance03.npy
├── utterance01.npy
├── utterance02.npy
└── utterance03.npy
├── dataset01_speaker02/
   ├── utterance01.npy
   ├── utterance02.npy
   └── utterance03.npy
├── utterance01.npy
├── utterance02.npy
└── utterance03.npy
├── dataset02_speaker01/
   ├── utterance01.npy
   ├── utterance02.npy
   └── utterance03.npy
├── utterance01.npy
├── utterance02.npy
└── utterance03.npy
└── dataset02_speaker02/
   ├── utterance01.npy
   ├── utterance02.npy
   └── utterance03.npy
├── utterance01.npy
├── utterance02.npy
└── utterance03.npy
```
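A small sketch to check the speaker/utterance layout after preprocessing (`${preprocess_path}` as in `run.sh`):
```bash
# number of speaker directories
ls -d ${preprocess_path}/*/ | wc -l
# total number of utterance spectrograms
find ${preprocess_path} -name "*.npy" | wc -l
```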
In `${BIN_DIR}/preprocess.py`:
1. `--datasets_root` is the directory that contains several extracted dataset
2. `--output_dir` is the directory to save the preprocessed dataset
3. `--dataset_names` is the dataset to preprocess. If there are multiple datasets in `--datasets_root` to preprocess, the names can be joined with comma. Currently supported dataset names are librispeech_other, voxceleb1, voxceleb2, aidatatang_200zh and magicdata.
Run the command to preprocess datasets.
### Train the model
`./local/train.sh` calls `${BIN_DIR}/train.py`.
```bash
python preprocess.py --datasets_root=<datasets_root> --output_dir=<output_dir> --dataset_names=<dataset_names>
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_path}
```
Here `--datasets_root` is the directory that contains several extracted dataset; `--output_dir` is the directory to save the preprocessed dataset; `--dataset_names` is the dataset to preprocess. If there are multiple datasets in `--datasets_root` to preprocess, the names can be joined with comma. Currently supported dataset names are librispeech_other, voxceleb1, voxceleb2, aidatatang_200zh and magicdata.
## Training
When preprocessing is done, run the command below to train the model.
```bash
python train.py --data=<data_path> --output=<output> --device="gpu" --nprocs=1
```
- `--data` is the path to the preprocessed dataset.
- `--output` is the directory to save results,usually a subdirectory of `runs`.It contains visualdl log files, text log files, config file and a `checkpoints` directory, which contains parameter file and optimizer state file. If `--output` already has some training results in it, the most recent parameter file and optimizer state file is loaded before training.
- `--device` is the device type to run the training, 'cpu' and 'gpu' are supported.
- `--nprocs` is the number of replicas to run in multiprocessing based parallel training。Currently multiprocessing based parallel training is only enabled when using 'gpu' as the devicde. `CUDA_VISIBLE_DEVICES` can be used to specify visible devices with cuda.
In `${BIN_DIR}/train.py`:
1. `--data` is the path to the preprocessed dataset.
2. `--output` is the directory to save results, usually a subdirectory of `runs`. It contains visualdl log files, text log files, a config file and a `checkpoints` directory, which contains the parameter file and optimizer state file. If `--output` already has some training results in it, the most recent parameter file and optimizer state file are loaded before training.
3. `--device` is the device type to run the training, 'cpu' and 'gpu' are supported.
4. `--nprocs` is the number of replicas to run in multiprocessing based parallel training. Currently multiprocessing based parallel training is only enabled when using 'gpu' as the device.
5. `CUDA_VISIBLE_DEVICES` can be used to specify visible devices with cuda.
Other options are described below.
......@@ -99,29 +80,23 @@ Other options are described below.
- `--opts` is command line options to further override config files. It should be the last command line option, passed with multiple key-value pairs separated by spaces.
- `--checkpoint_path` specifies the checkpoint to load before training, extension not included. A parameter file (`.pdparams`) and an optimizer state file (`.pdopt`) with the same name are used. This option has a higher priority than auto-resuming from the `--output` directory.
### Inference
When training is done, run the command below to generate an utterance embedding for each utterance in a dataset.
`./local/inference.sh` calls `${BIN_DIR}/inference.py`.
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${infer_input} ${infer_output} ${train_output_path} ${ckpt_name}
```
In `${BIN_DIR}/inference.py`:
1. `--input` is the path of the dataset used for inference.
2. `--output` is the directory to save the processed results. It has the same file structure as the input dataset. Each utterance in the dataset has a corresponding utterance embedding file in `*.npy` format.
3. `--checkpoint_path` is the path of the checkpoint to use, extension not included.
4. `--pattern` is the wildcard pattern to filter audio files for inference, defaults to `*.wav`.
5. `--device` and `--opts` have the same meaning as in the training script.
## Pretrained Model
The pretrained model is first trained to 1560k steps on Librispeech-other-500 and voxceleb1, then trained on aidatatang_200h and magic_data to 3000k steps.
Download URL [ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip).
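Each `*.npy` file produced by the inference step holds one fixed-size utterance embedding, so a quick way to sanity-check a trained (or the pretrained) model is to compare embeddings with cosine similarity: same-speaker pairs should score clearly higher than different-speaker pairs. A minimal sketch, assuming numpy and hypothetical file names under your `--output` directory:
```python
import numpy as np

# Load two embeddings produced by inference (paths are hypothetical).
emb_a = np.load("infer_output/speaker01/utterance01.npy")
emb_b = np.load("infer_output/speaker01/utterance02.npy")

# Normalize before taking the dot product so the result is a cosine similarity.
emb_a = emb_a / np.linalg.norm(emb_a)
emb_b = emb_b / np.linalg.norm(emb_b)
print(f"cosine similarity: {float(np.dot(emb_a, emb_b)):.3f}")
```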
## References
......
#!/bin/bash
#generate utterance embedding for each utterance in a dataset.
infer_input=$1
infer_output=$2
train_output_path=$3
ckpt_name=$4
python3 ${BIN_DIR}/inference.py \
--input=${infer_input} \
--output=${infer_output} \
--checkpoint_path=${train_output_path}/checkpoints/${ckpt_name} \
--device="gpu"
#!/bin/bash
datasets_root=$1
preprocess_path=$2
dataset_names=$3
python3 ${BIN_DIR}/preprocess.py \
--datasets_root=${datasets_root} \
--output_dir=${preprocess_path} \
--dataset_names=${dataset_names}
\ No newline at end of file
#!/bin/bash
preprocess_path=$1
train_output_path=$2
python3 ${BIN_DIR}/train.py \
--data=${preprocess_path} \
--output=${train_output_path} \
--device="gpu" \
--nprocs=1
\ No newline at end of file
#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=ge2e
export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL}
#!/bin/bash
set -e
source path.sh
gpus=0
stage=0
stop_stage=100
datasets_root=~/datasets/GE2E
preprocess_path=dump
dataset_names=librispeech_other
train_output_path=output
infer_input=infer_input
infer_output=infer_output
ckpt_name=step-10000
# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this cannot be mixed with `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
./local/preprocess.sh ${datasets_root} ${preprocess_path} ${dataset_names} || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_path} || exit -1
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${infer_input} ${infer_output} ${train_output_path} ${ckpt_name} || exit -1
fi
# Punctuation Restoration
Please use [PaddleSpeechTask](https://github.com/745165806/PaddleSpeechTask) for this task.
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from collections import defaultdict
from pathlib import Path
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from pathlib import Path
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import re
from pathlib import Path
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import re
from pathlib import Path
......
......@@ -11,11 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
from collections import OrderedDict
from pathlib import Path
import logging
def detect_oov(corpus_dir, lexicon_path, transcription_pattern="*.lab"):
......
......@@ -20,9 +20,8 @@ than words are used in transcriptions produced by `reorganize_baker.py`.
We make this choice to better leverage other software for chinese text to
pinyin tools like pypinyin. This is the convention for G2P in Chinese.
"""
import re
import argparse
import re
from collections import OrderedDict
INITIALS = [
......
#!/bin/bash
python3 ../synthesize.py \
--config=conf/default.yaml \
--checkpoint=exp/default/checkpoints/snapshot_iter_35000.pdz_bak \
--test-metadata=dump/test/norm/metadata.jsonl \
--output-dir=exp/default/test
# VCTK
* tts0 - Tacotron2
* tts1 - TransformerTTS
* tts2 - SpeedySpeech
* tts3 - FastSpeech2
* voc0 - WaveFlow
* voc1 - Parallel WaveGAN
* voc2 - MelGAN
* voc3 - MultiBand MelGAN
001 凯莫瑞安联合体的经济崩溃,迫在眉睫。
002 对于所有想要离开那片废土,去寻找更美好生活的人来说。
003 克哈,是你们所有人安全的港湾。
004 为了保护尤摩扬人民不受异虫的残害,我所做的,比他们自己的领导委员会都多。
005 无论他们如何诽谤我,我将继续为所有泰伦人的最大利益,而努力奋斗。
006 身为你们的元首,我带领泰伦人实现了人类统治领地和经济的扩张。
007 我们将继续成长,用行动回击那些只会说风凉话,不愿意和我们相向而行的害群之马。
008 帝国武装力量,无数的优秀儿女,正时刻守卫着我们的家园大门,但是他们孤木难支。
009 凡是今天应征入伍者,所获的所有刑罚罪责,减半。
010 激进分子和异见者希望你们一听见枪声,就背弃多年的和平与繁荣。
011 他们没有勇气和能力,带领人类穿越一个充满危险的星系。
012 法治是我们的命脉,然而它却受到前所未有的挑战。
013 我将恢复我们帝国的荣光,绝不会向任何外星势力低头。
014 我已经驯服了异虫,荡平了星灵。如今它们的创造者,想要夺走我们拥有的一切。
015 永远记住,谁才是最能保护你们的人。
016 不要听信别人的谗言,我不是什么克隆人。
\ No newline at end of file
......@@ -12,13 +12,22 @@ ps: we remove three speakers in VCTK-0.92 (see [reorganize_vctk.py](https://gith
1. `p315`, because there is no txt for it.
2. `p280` and `p362`, because there are no `*_mic2.flac` files (which are better than `*_mic1.flac`) for them.
## Get Started
Assume the path to the dataset is `~/datasets/VCTK-Corpus-0.92`.
Assume the path to the MFA result of VCTK is `./vctk_alignment`.
Run the command below to
1. **source path**.
2. preprocess the dataset,
3. train the model.
4. synthesize wavs.
    - synthesize waveform from `metadata.jsonl`.
    - synthesize waveform from text file.
```bash
./run.sh
```
### Preprocess the dataset
```bash
./local/preprocess.sh ${conf_path}
```
When it is done, a `dump` folder is created in the current directory. The structure of the dump folder is listed below.
......@@ -43,11 +52,11 @@ The dataset is split into 3 parts, namely `train`, `dev` and` test`, each of whi
Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains phones, text_lengths, speech_lengths, durations, path of speech features, path of pitch features, path of energy features, speaker and id of each utterance.
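If you want a quick look at what preprocessing produced, `metadata.jsonl` can be read with the same `jsonlines` package the training scripts use. A minimal sketch (the path follows the layout above; pick whichever split you care about):
```python
import jsonlines

# Print the fields and the first record of the normalized training metadata.
with jsonlines.open("dump/train/norm/metadata.jsonl") as reader:
    first = next(iter(reader))
print(sorted(first.keys()))
print(first)
```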
### Train the model
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
```
`./local/train.sh` calls `${BIN_DIR}/train.py`.
Here's the complete help message.
```text
usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
......@@ -81,14 +90,23 @@ optional arguments:
5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'.
6. `--phones-dict` is the path of the phone vocabulary file.
### Synthesize
We use [parallel wavegan](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/parallelwave_gan/baker) as the neural vocoder.
Download pretrained parallel wavegan model from [pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_vctk_ckpt_0.5.zip) and unzip it.
```bash
unzip pwg_vctk_ckpt_0.5.zip
```
Parallel WaveGAN checkpoint contains files listed below.
```text
pwg_vctk_ckpt_0.5
├── pwg_default.yaml # default config used to train parallel wavegan
├── pwg_snapshot_iter_1000000.pdz # generator parameters of parallel wavegan
└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan
```
`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
```
```text
usage: synthesize.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG]
......@@ -128,19 +146,22 @@ optional arguments:
--device DEVICE device type to use.
--verbose VERBOSE verbose.
```
`./local/synthesize_e2e.sh` calls `${BIN_DIR}/multi_spk_synthesize_e2e_en.py`, which can synthesize waveform from text file.
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name}
```
```text
usage: multi_spk_synthesize_e2e_en.py [-h]
[--fastspeech2-config FASTSPEECH2_CONFIG]
[--fastspeech2-checkpoint FASTSPEECH2_CHECKPOINT]
[--fastspeech2-stat FASTSPEECH2_STAT]
[--pwg-config PWG_CONFIG]
[--pwg-checkpoint PWG_CHECKPOINT]
[--pwg-stat PWG_STAT]
[--phones-dict PHONES_DICT]
[--speaker-dict SPEAKER_DICT]
[--text TEXT] [--output-dir OUTPUT_DIR]
[--device DEVICE] [--verbose VERBOSE]
Synthesize with fastspeech2 & parallel wavegan.
......@@ -161,6 +182,8 @@ optional arguments:
spectrogram when training parallel wavegan.
--phones-dict PHONES_DICT
phone vocabulary file.
--speaker-dict SPEAKER_DICT
speaker id map file.
--text TEXT text to synthesize, a 'utt_id sentence' pair per line.
--output-dir OUTPUT_DIR
output dir.
......@@ -175,7 +198,34 @@ optional arguments:
5. `--output-dir` is the directory to save synthesized audio files.
6. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis.
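`--text` expects one `utt_id sentence` pair per line. If you want to synthesize your own sentences, a file in that format is easy to generate; the sketch below writes a hypothetical `my_sentences_en.txt` that you could then pass to `--text`.
```python
# Write a custom input file for synthesis: one "utt_id sentence" pair per line.
sentences = [
    ("001", "The quick brown fox jumps over the lazy dog."),
    ("002", "Parallel WaveGAN turns mel spectrograms into waveforms."),
]
with open("my_sentences_en.txt", "w", encoding="utf-8") as f:
    for utt_id, sentence in sentences:
        f.write(f"{utt_id} {sentence}\n")
```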
## Pretrained Model
Pretrained FastSpeech2 model with no silence at the edge of audios: [fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_vctk_ckpt_0.5.zip)
FastSpeech2 checkpoint contains files listed below.
```text
fastspeech2_nosil_vctk_ckpt_0.5
├── default.yaml # default config used to train fastspeech2
├── phone_id_map.txt # phone vocabulary file when training fastspeech2
├── snapshot_iter_66200.pdz # model parameters and optimizer states
├── speaker_id_map.txt # speaker id map file when training a multi-speaker fastspeech2
└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2
```
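If you need to check which phones or speakers the pretrained model knows about, the id map files can be loaded with plain Python. This is only a sketch and assumes each line is a whitespace-separated `token id` pair, which is how the preprocessing step writes these files.
```python
# Load "token id" pairs from the id map files shipped with the checkpoint.
def load_id_map(path):
    with open(path, encoding="utf-8") as f:
        pairs = [line.strip().split() for line in f if line.strip()]
    return {token: int(idx) for token, idx in pairs}

phones = load_id_map("fastspeech2_nosil_vctk_ckpt_0.5/phone_id_map.txt")
speakers = load_id_map("fastspeech2_nosil_vctk_ckpt_0.5/speaker_id_map.txt")
print(len(phones), "phones,", len(speakers), "speakers")
```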
You can use the following scripts to synthesize for `${BIN_DIR}/../sentences_en.txt` using pretrained fastspeech2 and parallel wavegan models.
```bash
source path.sh
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/multi_spk_synthesize_e2e_en.py \
--fastspeech2-config=fastspeech2_nosil_vctk_ckpt_0.5/default.yaml \
--fastspeech2-checkpoint=fastspeech2_nosil_vctk_ckpt_0.5/snapshot_iter_96400.pdz \
--fastspeech2-stat=fastspeech2_nosil_vctk_ckpt_0.5/speech_stats.npy \
--pwg-config=pwg_vctk_ckpt_0.5/pwg_default.yaml \
--pwg-checkpoint=pwg_vctk_ckpt_0.5/pwg_snapshot_iter_1000000.pdz \
--pwg-stat=pwg_vctk_ckpt_0.5/pwg_stats.npy \
--text=${BIN_DIR}/../sentences_en.txt \
--output-dir=exp/default/test_e2e \
--device="gpu" \
--phones-dict=fastspeech2_nosil_vctk_ckpt_0.5/phone_id_map.txt \
--speaker-dict=fastspeech2_nosil_vctk_ckpt_0.5/speaker_id_map.txt
```
#!/bin/bash
stage=1
stage=0
stop_stage=100
export MAIN_ROOT=`realpath ${PWD}/../../../`
config_path=$1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# get durations from MFA's result
......@@ -11,18 +11,18 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
--inputdir=./vctk_alignment \
--output durations.txt \
--config=conf/default.yaml
--config=${config_path}
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# extract features
echo "Extract features ..."
python3 ../preprocess.py \
python3 ${BIN_DIR}/preprocess.py \
--dataset=vctk \
--rootdir=~/datasets/VCTK-Corpus-0.92/ \
--dumpdir=dump \
--dur-file=durations.txt \
--config=conf/default.yaml \
--config=${config_path} \
--num-cpu=20 \
--cut-sil=True
fi
......@@ -46,7 +46,7 @@ fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# normalize and convert phone/speaker to id, dev and test should use train's stats
echo "Normalize ..."
python3 ../normalize.py \
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/train/raw/metadata.jsonl \
--dumpdir=dump/train/norm \
--speech-stats=dump/train/speech_stats.npy \
......@@ -55,7 +55,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt
python3 ../normalize.py \
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/dev/raw/metadata.jsonl \
--dumpdir=dump/dev/norm \
--speech-stats=dump/train/speech_stats.npy \
......@@ -64,7 +64,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt
python3 ../normalize.py \
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/test/raw/metadata.jsonl \
--dumpdir=dump/test/norm \
--speech-stats=dump/train/speech_stats.npy \
......
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ../synthesize.py \
--fastspeech2-config=conf/default.yaml \
--fastspeech2-checkpoint=exp/default/checkpoints/snapshot_iter_32769.pdz_bak\
python3 ${BIN_DIR}/synthesize.py \
--fastspeech2-config=${config_path} \
--fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
--fastspeech2-stat=dump/train/speech_stats.npy \
--pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
--pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--pwg-config=pwg_vctk_ckpt_0.5/pwg_default.yaml \
--pwg-checkpoint=pwg_vctk_ckpt_0.5/pwg_snapshot_iter_1000000.pdz \
--pwg-stat=pwg_vctk_ckpt_0.5/pwg_stats.npy \
--test-metadata=dump/test/norm/metadata.jsonl \
--output-dir=exp/default/test \
--output-dir=${train_output_path}/test \
--device="gpu" \
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 synthesize_e2e.py \
--fastspeech2-config=conf/default.yaml \
--fastspeech2-checkpoint=exp/default/checkpoints/snapshot_iter_32769.pdz_bak \
python3 ${BIN_DIR}/multi_spk_synthesize_e2e_en.py \
--fastspeech2-config=${config_path} \
--fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
--fastspeech2-stat=dump/train/speech_stats.npy \
--pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
--pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--text=../sentences_en.txt \
--output-dir=exp/default/test_e2e \
--pwg-config=pwg_vctk_ckpt_0.5/pwg_default.yaml \
--pwg-checkpoint=pwg_vctk_ckpt_0.5/pwg_snapshot_iter_1000000.pdz \
--pwg-stat=pwg_vctk_ckpt_0.5/pwg_stats.npy \
--text=${BIN_DIR}/../sentences_en.txt \
--output-dir=${train_output_path}/test_e2e \
--device="gpu" \
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt
#!/bin/bash
python3 ../train.py \
config_path=$1
train_output_path=$2
python3 ${BIN_DIR}/train.py \
--train-metadata=dump/train/norm/metadata.jsonl \
--dev-metadata=dump/dev/norm/metadata.jsonl \
--config=conf/default.yaml \
--output-dir=exp/default \
--config=${config_path} \
--output-dir=${train_output_path} \
--nprocs=2 \
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt
#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=fastspeech2
export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL}
#!/bin/bash
set -e
source path.sh
gpus=0,1
stage=0
stop_stage=100
conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_331.pdz
# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this cannot be mixed with `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
./local/preprocess.sh ${conf_path} || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `train_output_path/checkpoints/` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize, vocoder is pwgan
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# synthesize_e2e, vocoder is pwgan
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
# Parallel WaveGAN with VCTK
This example contains code used to train a [parallel wavegan](http://arxiv.org/abs/1910.11480) model with [VCTK](https://datashare.ed.ac.uk/handle/10283/3443).
## Dataset
### Download and Extract the dataset
Download VCTK-0.92 from the [official website](https://datashare.ed.ac.uk/handle/10283/3443) and extract it to `~/datasets`. Then the dataset is in directory `~/datasets/VCTK-Corpus-0.92`.
......@@ -11,12 +12,21 @@ ps: we remove three speakers in VCTK-0.92 (see [reorganize_vctk.py](https://gith
1. `p315`, because there is no txt for it.
2. `p280` and `p362`, because there are no `*_mic2.flac` files (which are better than `*_mic1.flac`) for them.
## Get Started
Assume the path to the dataset is `~/datasets/VCTK-Corpus-0.92`.
Assume the path to the MFA result of VCTK is `./vctk_alignment`.
Run the command below to
1. **source path**.
2. preprocess the dataset,
3. train the model.
4. synthesize wavs.
    - synthesize waveform from `metadata.jsonl`.
```bash
./run.sh
```
### Preprocess the dataset
```bash
./local/preprocess.sh ${conf_path}
```
When it is done, a `dump` folder is created in the current directory. The structure of the dump folder is listed below.
......@@ -38,12 +48,11 @@ The dataset is split into 3 parts, namely `train`, `dev` and `test`, each of whi
Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains the id and paths to the spectrogram of each utterance.
### Train the model
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
```
`./local/train.sh` calls `${BIN_DIR}/train.py`.
Here's the complete help message.
```text
......@@ -88,15 +97,10 @@ benchmark:
3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are save in `checkpoints/` inside this directory.
4. `--device` is the type of the device to run the experiment, 'cpu' or 'gpu' are supported.
5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'.
### Synthesize
`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
```
```text
usage: synthesize.py [-h] [--config CONFIG] [--checkpoint CHECKPOINT]
......@@ -124,5 +128,16 @@ optional arguments:
4. `--output-dir` is the directory to save the synthesized audio files.
5. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported.
## Pretrained Models
Pretrained models can be downloaded here [pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_vctk_ckpt_0.5.zip).
Parallel WaveGAN checkpoint contains files listed below.
```text
pwg_vctk_ckpt_0.5
├── pwg_default.yaml # default config used to train parallel wavegan
├── pwg_snapshot_iter_1000000.pdz # generator parameters of parallel wavegan
└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan
```
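To inspect the feature settings the vocoder was trained with, `pwg_default.yaml` can be loaded the same way `${BIN_DIR}/synthesize.py` loads its `--config` argument. A small sketch; the path assumes the unzipped checkpoint above.
```python
import yaml
from yacs.config import CfgNode

# Parse the vocoder config into a CfgNode, as the synthesis script does.
with open("pwg_vctk_ckpt_0.5/pwg_default.yaml") as f:
    config = CfgNode(yaml.safe_load(f))
print(sorted(config.keys()))
```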
## Acknowledgement
We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN.
......@@ -3,7 +3,7 @@
stage=0
stop_stage=100
export MAIN_ROOT=`realpath ${PWD}/../../../../`
config_path=$1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# get durations from MFA's result
......@@ -11,17 +11,18 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
--inputdir=./vctk_alignment \
--output=durations.txt \
--config=conf/default.yaml
--config=${config_path}
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# extract features
echo "Extract features ..."
python3 ../../preprocess.py \
python3 ${BIN_DIR}/../preprocess.py \
--rootdir=~/datasets/VCTK-Corpus-0.92/ \
--dataset=vctk \
--dumpdir=dump \
--dur-file=durations.txt \
--config=conf/default.yaml \
--config=${config_path} \
--cut-sil=True \
--num-cpu=20
fi
......@@ -38,16 +39,16 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# normalize, dev and test should use train's stats
echo "Normalize ..."
python3 ../../normalize.py \
python3 ${BIN_DIR}/../normalize.py \
--metadata=dump/train/raw/metadata.jsonl \
--dumpdir=dump/train/norm \
--stats=dump/train/feats_stats.npy
python3 ../../normalize.py \
python3 ${BIN_DIR}/../normalize.py \
--metadata=dump/dev/raw/metadata.jsonl \
--dumpdir=dump/dev/norm \
--stats=dump/train/feats_stats.npy
python3 ../../normalize.py \
python3 ${BIN_DIR}/../normalize.py \
--metadata=dump/test/raw/metadata.jsonl \
--dumpdir=dump/test/norm \
--stats=dump/train/feats_stats.npy
......
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/synthesize.py \
--config=${config_path} \
--checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
--test-metadata=dump/test/norm/metadata.jsonl \
--output-dir=${train_output_path}/test
#!/bin/bash
config_path=$1
train_output_path=$2
FLAGS_cudnn_exhaustive_search=true \
FLAGS_conv_workspace_size_limit=4000 \
python ../train.py \
python ${BIN_DIR}/train.py \
--train-metadata=dump/train/norm/metadata.jsonl \
--dev-metadata=dump/dev/norm/metadata.jsonl \
--config=conf/default.yaml \
--output-dir=exp/default \
--config=${config_path} \
--output-dir=${train_output_path} \
--nprocs=1
#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=parallelwave_gan
export BIN_DIR=${MAIN_ROOT}/parakeet/exps/gan_vocoder/${MODEL}
\ No newline at end of file
#!/bin/bash
set -e
source path.sh
gpus=0
stage=0
stop_stage=100
conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_5000.pdz
# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this cannot be mixed with `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
./local/preprocess.sh ${conf_path} || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `train_output_path/checkpoints/` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
# Speaker Encoder
This experiment trains a speaker encoder on multi-speaker datasets, with speaker verification as the training task. It is part of the experiment on transfer learning from speaker verification to multispeaker text-to-speech synthesis, which can be found at [tacotron2_aishell3](../tacotron2_aishell3). The trained model is used to extract utterance embeddings from audio.
## Model
The model used in this experiment is the text-independent speaker encoder from [GENERALIZED END-TO-END LOSS FOR SPEAKER VERIFICATION](https://arxiv.org/pdf/1710.10467.pdf), trained with the GE2E softmax loss.
## Directory Structure
```text
ge2e
├── README_cn.md
├── audio_processor.py
├── config.py
├── dataset_processors.py
├── inference.py
├── preprocess.py
├── random_cycle.py
├── speaker_verification_dataset.py
└── train.py
```
## Download Datasets
This experiment supports the Librispeech-other-500, VoxCeleb, VoxCeleb2, aidatatang_200zh and magicdata datasets, which can be downloaded from the corresponding pages.
1. Librispeech/train-other-500
   An English multi-speaker dataset, [download link](https://www.openslr.org/resources/12/train-other-500.tar.gz). Only the train-other-500 subset is used in our experiment.
2. VoxCeleb1
   An English multi-speaker dataset, [download link](https://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1.html). Download the four parts Dev A to Dev D under Audio Files, then concatenate and extract them.
3. VoxCeleb2
   An English multi-speaker dataset, [download link](https://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox2.html). Download the eight parts Dev A to Dev H under Audio Files, then concatenate and extract them.
4. Aidatatang-200zh
   A Chinese multi-speaker dataset, [download link](https://www.openslr.org/62/).
5. magicdata
   A Chinese multi-speaker dataset, [download link](https://www.openslr.org/68/).
If you want to use other datasets, you can also download and preprocess them yourself, as long as they meet the requirements described below.
## Preprocess Datasets
Multi-speaker datasets are used for training; transcriptions are not used. To enlarge the amount of data, several datasets can be merged into one for training. The processed files are organized as below: the spectrogram of each utterance is saved in `.npy` format, in a two-level speaker-utterance directory structure. Because datasets are merged, the dataset name is prepended to the speaker id to avoid speaker id collisions.
```text
dataset_root
├── dataset01_speaker01/
│   ├── utterance01.npy
│   ├── utterance02.npy
│   └── utterance03.npy
├── dataset01_speaker02/
│   ├── utterance01.npy
│   ├── utterance02.npy
│   └── utterance03.npy
├── dataset02_speaker01/
│   ├── utterance01.npy
│   ├── utterance02.npy
│   └── utterance03.npy
└── dataset02_speaker02/
   ├── utterance01.npy
   ├── utterance02.npy
   └── utterance03.npy
```
Run the preprocessing script:
```bash
python preprocess.py --datasets_root=<datasets_root> --output_dir=<output_dir> --dataset_names=<dataset_names>
```
Here `--datasets_root` is the path that contains several raw datasets, `--output_dir` is the output path for the merged datasets, and `--dataset_names` gives the names of the datasets to process; multiple dataset names can be separated by commas, e.g. 'librispeech_other, voxceleb1'. Currently supported datasets are librispeech_other, voxceleb1, voxceleb2, aidatatang_200zh and magicdata.
## Training
When preprocessing is done, run the command below to train the model.
```bash
python train.py --data=<data_path> --output=<output> --device="gpu" --nprocs=1
```
- `--data` is the path to the preprocessed dataset.
- `--output` is the directory to save results, usually a subdirectory of `runs`. It contains visualdl log files, text log files, a backup of the config, and a `checkpoints` directory with parameter files and optimizer state files. If the specified `--output` already contains previous training results, the most recent parameter file and optimizer state file are loaded automatically before training.
- `--device` is the device to run on; currently 'cpu' and 'gpu' are supported.
- `--nprocs` is the number of processes to run; multi-process training is currently only supported when the device is 'gpu'. `CUDA_VISIBLE_DEVICES` can be used to specify the visible devices.
There are a few more options.
- `--config` is a `.yaml` config file used to override the default config (the default config can be found in `config.py`).
- `--opts` overrides the config further from the command line. It is the last command line option and is passed as multiple space-separated KEY VALUE pairs.
- `--checkpoint_path` specifies the checkpoint to resume from, without the extension. The parameter file (`.pdparams`) and optimizer state file (`.pdopt`) with the same name are loaded to resume training. This option takes precedence over automatically resuming from the `--output` directory.
## Pretrained Model
The pretrained model is first trained to 1560k steps on Librispeech-other-500 and voxceleb1, then trained on aidatatang_200h and magic_data to 3000k steps.
Download link: [ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip)
## Inference
Use the trained model to generate an utterance embedding for every utterance in a dataset.
```bash
python inference.py --input=<input> --output=<output> --checkpoint_path=<checkpoint_path> --device="gpu"
```
- `--input` is the path of the dataset to process.
- `--output` is the output directory. It keeps the same folder structure as `--input`; for every audio file in the input there is a corresponding `*.npy` file holding the utterance embedding extracted from it.
- `--checkpoint_path` is the path of the parameter file used for inference, without the extension.
- `--pattern` is the wildcard pattern used to filter the audio files to process, defaults to `*.wav`.
- `--device` and `--opts` have the same meaning as in the training script.
## References
1. [GENERALIZED END-TO-END LOSS FOR SPEAKER VERIFICATION](https://arxiv.org/pdf/1710.10467.pdf)
2. [Transfer Learning from Speaker Verification toMultispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf)
......@@ -11,10 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from . import data
from . import datasets
from . import exps
from . import frontend
from . import models
from . import modules
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .audio import AudioProcessor
from .spec_normalizer import LogMagnitude
from .spec_normalizer import NormalizerBase
......@@ -13,6 +13,5 @@
# limitations under the License.
"""Parakeet's infrastructure for data processing.
"""
from .dataset import *
from .batch import *
from .dataset import *
......@@ -11,6 +11,5 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .common import *
from .ljspeech import *
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
from pathlib import Path
......@@ -20,13 +19,14 @@ import numpy as np
import paddle
import soundfile as sf
import yaml
from yacs.config import CfgNode
from parakeet.frontend.zh_frontend import Frontend
from parakeet.models.fastspeech2 import FastSpeech2
from parakeet.models.fastspeech2 import FastSpeech2Inference
from parakeet.models.parallel_wavegan import PWGGenerator
from parakeet.models.parallel_wavegan import PWGInference
from parakeet.modules.normalizer import ZScore
from yacs.config import CfgNode
def evaluate(args, fastspeech2_config, pwg_config):
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
from pathlib import Path
......@@ -20,13 +19,14 @@ import numpy as np
import paddle
import soundfile as sf
import yaml
from yacs.config import CfgNode
from parakeet.frontend import English
from parakeet.models.fastspeech2 import FastSpeech2
from parakeet.models.fastspeech2 import FastSpeech2Inference
from parakeet.models.parallel_wavegan import PWGGenerator
from parakeet.models.parallel_wavegan import PWGInference
from parakeet.modules.normalizer import ZScore
from yacs.config import CfgNode
def evaluate(args, fastspeech2_config, pwg_config):
......
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Normalize feature files and dump them."""
import argparse
import logging
from operator import itemgetter
......@@ -20,10 +19,11 @@ from pathlib import Path
import jsonlines
import numpy as np
from parakeet.datasets.data_table import DataTable
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from parakeet.datasets.data_table import DataTable
def main():
"""Run preprocessing process."""
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
from concurrent.futures import ThreadPoolExecutor
......@@ -26,6 +25,8 @@ import librosa
import numpy as np
import tqdm
import yaml
from yacs.config import CfgNode
from parakeet.data.get_feats import Energy
from parakeet.data.get_feats import LogMelFBank
from parakeet.data.get_feats import Pitch
......@@ -34,7 +35,6 @@ from parakeet.datasets.preprocess_utils import get_input_token
from parakeet.datasets.preprocess_utils import get_phn_dur
from parakeet.datasets.preprocess_utils import get_spk_id_map
from parakeet.datasets.preprocess_utils import merge_silence
from yacs.config import CfgNode
def process_sentence(config: Dict[str, Any],
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
from pathlib import Path
......@@ -22,6 +21,7 @@ import paddle
import soundfile as sf
import yaml
from yacs.config import CfgNode
from parakeet.datasets.data_table import DataTable
from parakeet.models.fastspeech2 import FastSpeech2
from parakeet.models.fastspeech2 import FastSpeech2Inference
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
from pathlib import Path
......@@ -20,13 +19,14 @@ import numpy as np
import paddle
import soundfile as sf
import yaml
from yacs.config import CfgNode
from parakeet.frontend.zh_frontend import Frontend
from parakeet.models.fastspeech2 import FastSpeech2
from parakeet.models.fastspeech2 import FastSpeech2Inference
from parakeet.models.parallel_wavegan import PWGGenerator
from parakeet.models.parallel_wavegan import PWGInference
from parakeet.modules.normalizer import ZScore
from yacs.config import CfgNode
def evaluate(args, fastspeech2_config, pwg_config):
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
from pathlib import Path
......@@ -21,6 +20,7 @@ import paddle
import soundfile as sf
import yaml
from yacs.config import CfgNode
from parakeet.frontend import English
from parakeet.models.fastspeech2 import FastSpeech2
from parakeet.models.fastspeech2 import FastSpeech2Inference
......
......@@ -11,11 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
import os
import shutil
from pathlib import Path
import jsonlines
import numpy as np
......@@ -25,9 +25,12 @@ from paddle import DataParallel
from paddle import distributed as dist
from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler
from parakeet.datasets.data_table import DataTable
from parakeet.datasets.am_batch_fn import fastspeech2_single_spk_batch_fn
from visualdl import LogWriter
from yacs.config import CfgNode
from parakeet.datasets.am_batch_fn import fastspeech2_multi_spk_batch_fn
from parakeet.datasets.am_batch_fn import fastspeech2_single_spk_batch_fn
from parakeet.datasets.data_table import DataTable
from parakeet.models.fastspeech2 import FastSpeech2
from parakeet.models.fastspeech2 import FastSpeech2Evaluator
from parakeet.models.fastspeech2 import FastSpeech2Updater
......@@ -36,9 +39,6 @@ from parakeet.training.extensions.visualizer import VisualDL
from parakeet.training.optimizer import build_optimizers
from parakeet.training.seeding import seed_everything
from parakeet.training.trainer import Trainer
from pathlib import Path
from visualdl import LogWriter
from yacs.config import CfgNode
def train_sp(args, config):
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Normalize feature files and dump them."""
import argparse
import logging
from operator import itemgetter
......@@ -20,10 +19,11 @@ from pathlib import Path
import jsonlines
import numpy as np
from parakeet.datasets.data_table import DataTable
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from parakeet.datasets.data_table import DataTable
def main():
"""Run preprocessing process."""
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -11,11 +11,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
from pathlib import Path
from timer import timer
import jsonlines
import numpy as np
......@@ -23,9 +21,11 @@ import paddle
import soundfile as sf
import yaml
from paddle import distributed as dist
from timer import timer
from yacs.config import CfgNode
from parakeet.datasets.data_table import DataTable
from parakeet.models.parallel_wavegan import PWGGenerator
from yacs.config import CfgNode
def main():
......
......@@ -11,10 +11,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import logging
import os
from pathlib import Path
import librosa
......@@ -22,20 +21,12 @@ import numpy as np
import paddle
import soundfile as sf
import yaml
from yacs.config import CfgNode
from parakeet.data.get_feats import LogMelFBank
from parakeet.models.parallel_wavegan import PWGGenerator
from parakeet.models.parallel_wavegan import PWGInference
from parakeet.modules.normalizer import ZScore
from yacs.config import CfgNode as Configuration
def get_cfg_default():
config_path = (Path(__file__).parent / "conf" / "default.yaml").resolve()
with open(config_path, 'rt') as f:
_C = yaml.safe_load(f)
_C = Configuration(_C)
config = _C.clone()
return config
def evaluate(args, config):
......@@ -91,7 +82,7 @@ def main():
description="Synthesize with parallel wavegan.")
parser.add_argument(
"--config", type=str, help="config file to overwrite default config.")
"--config", type=str, help="parallel wavegan config file.")
parser.add_argument("--checkpoint", type=str, help="snapshot to load.")
parser.add_argument(
"--stat",
......@@ -108,9 +99,8 @@ def main():
paddle.set_device(args.device)
config = get_cfg_default()
if args.config:
config.merge_from_file(args.config)
with open(args.config) as f:
config = CfgNode(yaml.safe_load(f))
print("========Args========")
print(yaml.safe_dump(vars(args)))
......
......@@ -11,11 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
import os
import shutil
from pathlib import Path
import jsonlines
import numpy as np
......@@ -28,20 +28,20 @@ from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler
from paddle.optimizer import Adam # No RAdam
from paddle.optimizer.lr import StepDecay
from visualdl import LogWriter
from yacs.config import CfgNode
from parakeet.datasets.data_table import DataTable
from parakeet.datasets.vocoder_batch_fn import Clip
from parakeet.models.parallel_wavegan import PWGGenerator
from parakeet.models.parallel_wavegan import PWGDiscriminator
from parakeet.models.parallel_wavegan import PWGUpdater
from parakeet.models.parallel_wavegan import PWGEvaluator
from parakeet.models.parallel_wavegan import PWGGenerator
from parakeet.models.parallel_wavegan import PWGUpdater
from parakeet.modules.stft_loss import MultiResolutionSTFTLoss
from parakeet.training.extensions.snapshot import Snapshot
from parakeet.training.extensions.visualizer import VisualDL
from parakeet.training.seeding import seed_everything
from parakeet.training.trainer import Trainer
from pathlib import Path
from visualdl import LogWriter
from yacs.config import CfgNode
def train_sp(args, config):
......
......@@ -11,10 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
from concurrent.futures import ThreadPoolExecutor
from operator import itemgetter
from pathlib import Path
from typing import Any
from typing import Dict
from typing import List
......@@ -24,12 +25,11 @@ import librosa
import numpy as np
import tqdm
import yaml
from concurrent.futures import ThreadPoolExecutor
from yacs.config import CfgNode
from parakeet.data.get_feats import LogMelFBank
from parakeet.datasets.preprocess_utils import get_phn_dur
from parakeet.datasets.preprocess_utils import merge_silence
from pathlib import Path
from yacs.config import CfgNode
def process_sentence(config: Dict[str, Any],
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -11,14 +11,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import struct
from pathlib import Path
from warnings import warn
import struct
from scipy.ndimage.morphology import binary_dilation
import numpy as np
import librosa
import numpy as np
from scipy.ndimage.morphology import binary_dilation
try:
import webrtcvad
......@@ -97,7 +96,7 @@ def trim_long_silences(wav,
return ret[width - 1:] / width
audio_mask = moving_average(voice_flags, vad_moving_average_width)
audio_mask = np.round(audio_mask).astype(np.bool)
audio_mask = np.round(audio_mask).astype(bool)
# Dilate the voiced regions
audio_mask = binary_dilation(audio_mask,
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from yacs.config import CfgNode
_C = CfgNode()
......
......@@ -11,16 +11,15 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import multiprocessing as mp
from functools import partial
from typing import List
from pathlib import Path
import multiprocessing as mp
from typing import List
import numpy as np
from tqdm import tqdm
from audio_processor import SpeakerVerificationPreprocessor
from parakeet.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor
def _process_utterance(path_pair, processor: SpeakerVerificationPreprocessor):
......
......@@ -11,19 +11,17 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from pathlib import Path
import tqdm
import paddle
import numpy as np
import paddle
import tqdm
from parakeet.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor
from parakeet.exps.ge2e.config import get_cfg_defaults
from parakeet.models.lstm_speaker_encoder import LSTMSpeakerEncoder
from audio_processor import SpeakerVerificationPreprocessor
from config import get_cfg_defaults
def embed_utterance(processor, model, fpath_or_wav):
# audio processor
......
......@@ -11,14 +11,17 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from pathlib import Path
from config import get_cfg_defaults
from audio_processor import SpeakerVerificationPreprocessor
from dataset_processors import (process_librispeech, process_voxceleb1,
process_voxceleb2, process_aidatatang_200zh,
process_magicdata)
from parakeet.exps.ge2e.config import get_cfg_defaults
from parakeet.exps.ge2e.dataset_processors import process_aidatatang_200zh
from parakeet.exps.ge2e.dataset_processors import process_librispeech
from parakeet.exps.ge2e.dataset_processors import process_magicdata
from parakeet.exps.ge2e.dataset_processors import process_voxceleb1
from parakeet.exps.ge2e.dataset_processors import process_voxceleb2
if __name__ == "__main__":
parser = argparse.ArgumentParser(
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
......
......@@ -11,14 +11,14 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
from pathlib import Path
import numpy as np
from paddle.io import Dataset, BatchSampler
from paddle.io import BatchSampler
from paddle.io import Dataset
from random_cycle import random_cycle
from parakeet.exps.ge2e.random_cycle import random_cycle
class MultiSpeakerMelDataset(Dataset):
......
......@@ -11,23 +11,21 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
from paddle import distributed as dist
from paddle.optimizer import Adam
from paddle import DataParallel
from paddle import distributed as dist
from paddle.io import DataLoader
from paddle.nn.clip import ClipGradByGlobalNorm
from paddle.optimizer import Adam
from parakeet.exps.ge2e.config import get_cfg_defaults
from parakeet.exps.ge2e.speaker_verification_dataset import Collate
from parakeet.exps.ge2e.speaker_verification_dataset import MultiSpeakerMelDataset
from parakeet.exps.ge2e.speaker_verification_dataset import MultiSpeakerSampler
from parakeet.models.lstm_speaker_encoder import LSTMSpeakerEncoder
from parakeet.training import ExperimentBase
from parakeet.training import default_argument_parser
from speaker_verification_dataset import MultiSpeakerMelDataset
from speaker_verification_dataset import MultiSpeakerSampler
from speaker_verification_dataset import Collate
from config import get_cfg_defaults
from parakeet.training import ExperimentBase
class Ge2eExperiment(ExperimentBase):
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -11,13 +11,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
from pathlib import Path
import soundfile as sf
from paddle import inference
from parakeet.frontend.zh_frontend import Frontend
......
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Normalize feature files and dump them."""
import argparse
import logging
from operator import itemgetter
......@@ -20,10 +19,11 @@ from pathlib import Path
import jsonlines
import numpy as np
from parakeet.datasets.data_table import DataTable
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from parakeet.datasets.data_table import DataTable
def main():
"""Run preprocessing process."""
......
......@@ -11,27 +11,27 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import re
from concurrent.futures import ThreadPoolExecutor
from operator import itemgetter
from pathlib import Path
from typing import Any
from typing import Dict
from typing import List
import argparse
import jsonlines
import librosa
import numpy as np
import re
import tqdm
import yaml
from concurrent.futures import ThreadPoolExecutor
from yacs.config import CfgNode
from parakeet.data.get_feats import LogMelFBank
from parakeet.datasets.preprocess_utils import compare_duration_and_mel_length
from parakeet.datasets.preprocess_utils import get_phones_tones
from parakeet.datasets.preprocess_utils import get_phn_dur
from parakeet.datasets.preprocess_utils import get_phones_tones
from parakeet.datasets.preprocess_utils import merge_silence
from pathlib import Path
from yacs.config import CfgNode
def process_sentence(config: Dict[str, Any],
......
......@@ -11,25 +11,25 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import logging
import argparse
import logging
import os
from pathlib import Path
import jsonlines
import numpy as np
import soundfile as sf
import paddle
import soundfile as sf
import yaml
from paddle import jit
from paddle.static import InputSpec
from yacs.config import CfgNode
from parakeet.datasets.data_table import DataTable
from parakeet.models.speedyspeech import SpeedySpeech
from parakeet.models.speedyspeech import SpeedySpeechInference
from parakeet.models.parallel_wavegan import PWGGenerator
from parakeet.models.parallel_wavegan import PWGInference
from parakeet.models.speedyspeech import SpeedySpeech
from parakeet.models.speedyspeech import SpeedySpeechInference
from parakeet.modules.normalizer import ZScore
......
......@@ -11,25 +11,25 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
import os
from pathlib import Path
import numpy as np
import soundfile as sf
import paddle
import soundfile as sf
import yaml
from paddle import jit
from paddle.static import InputSpec
from yacs.config import CfgNode
from parakeet.frontend.zh_frontend import Frontend
from parakeet.models.speedyspeech import SpeedySpeech
from parakeet.models.speedyspeech import SpeedySpeechInference
from parakeet.models.parallel_wavegan import PWGGenerator
from parakeet.models.parallel_wavegan import PWGInference
from parakeet.models.speedyspeech import SpeedySpeech
from parakeet.models.speedyspeech import SpeedySpeechInference
from parakeet.modules.normalizer import ZScore
from yacs.config import CfgNode
def evaluate(args, speedyspeech_config, pwg_config):
......
......@@ -11,22 +11,25 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
import os
import shutil
from pathlib import Path
import jsonlines
import numpy as np
import paddle
import yaml
from paddle import distributed as dist
from paddle import DataParallel
from paddle import distributed as dist
from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler
from parakeet.datasets.data_table import DataTable
from visualdl import LogWriter
from yacs.config import CfgNode
from parakeet.datasets.am_batch_fn import speedyspeech_batch_fn
from parakeet.datasets.data_table import DataTable
from parakeet.models.speedyspeech import SpeedySpeech
from parakeet.models.speedyspeech import SpeedySpeechEvaluator
from parakeet.models.speedyspeech import SpeedySpeechUpdater
......@@ -35,9 +38,6 @@ from parakeet.training.extensions.visualizer import VisualDL
from parakeet.training.optimizer import build_optimizers
from parakeet.training.seeding import seed_everything
from parakeet.training.trainer import Trainer
from pathlib import Path
from visualdl import LogWriter
from yacs.config import CfgNode
def train_sp(args, config):
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from yacs.config import CfgNode as CN
_C = CN()
......
......@@ -11,14 +11,14 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path
import pickle
from pathlib import Path
import numpy as np
from paddle.io import Dataset
from parakeet.data.batch import batch_spec, batch_text_id
from parakeet.data.batch import batch_spec
from parakeet.data.batch import batch_text_id
class LJSpeech(Dataset):
......
......@@ -11,21 +11,20 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import pickle
import argparse
from pathlib import Path
import tqdm
import numpy as np
import tqdm
from parakeet.audio import AudioProcessor
from parakeet.audio import LogMagnitude
from parakeet.datasets import LJSpeechMetaData
from parakeet.audio import AudioProcessor, LogMagnitude
from parakeet.exps.tacotron2.config import get_cfg_defaults
from parakeet.frontend import EnglishCharacter
from config import get_cfg_defaults
def create_dataset(config, source_path, target_path, verbose=False):
# create output dir
......
......@@ -11,20 +11,18 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from pathlib import Path
import paddle
import numpy as np
import paddle
from matplotlib import pyplot as plt
from parakeet.exps.tacotron2.config import get_cfg_defaults
from parakeet.frontend import EnglishCharacter
from parakeet.models.tacotron2 import Tacotron2
from parakeet.utils import display
from config import get_cfg_defaults
def main(config, args):
paddle.set_device(args.device)
......@@ -36,8 +34,13 @@ def main(config, args):
# inputs
input_path = Path(args.input).expanduser()
sentences = []
with open(input_path, "rt") as f:
sentences = f.readlines()
for line in f:
line_list = line.strip().split()
utt_id = line_list[0]
sentence = " ".join(line_list[1:])
sentences.append((utt_id, sentence))
if args.output is None:
output_dir = input_path.parent / "synthesis"
......
......@@ -11,23 +11,25 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
from collections import defaultdict
import numpy as np
import paddle
from paddle import distributed as dist
from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler
from paddle import distributed as dist
from parakeet.data import dataset
from parakeet.exps.tacotron2.config import get_cfg_defaults
from parakeet.exps.tacotron2.ljspeech import LJSpeech
from parakeet.exps.tacotron2.ljspeech import LJSpeechCollector
from parakeet.models.tacotron2 import Tacotron2
from parakeet.models.tacotron2 import Tacotron2Loss
from parakeet.training.cli import default_argument_parser
from parakeet.training.experiment import ExperimentBase
from parakeet.utils import display, mp_tools
from parakeet.models.tacotron2 import Tacotron2, Tacotron2Loss
from config import get_cfg_defaults
from ljspeech import LJSpeech, LJSpeechCollector
from parakeet.utils import display
from parakeet.utils import mp_tools
class Experiment(ExperimentBase):
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Normalize feature files and dump them."""
import argparse
import logging
from operator import itemgetter
......@@ -20,10 +19,11 @@ from pathlib import Path
import jsonlines
import numpy as np
from parakeet.datasets.data_table import DataTable
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from parakeet.datasets.data_table import DataTable
def main():
"""Run preprocessing process."""
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from concurrent.futures import ThreadPoolExecutor
from operator import itemgetter
......@@ -25,9 +24,10 @@ import librosa
import numpy as np
import tqdm
import yaml
from yacs.config import CfgNode as Configuration
from parakeet.data.get_feats import LogMelFBank
from parakeet.frontend import English
from yacs.config import CfgNode as Configuration
def get_lj_sentences(file_name, frontend):
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
from pathlib import Path
......@@ -22,6 +21,7 @@ import paddle
import soundfile as sf
import yaml
from yacs.config import CfgNode
from parakeet.datasets.data_table import DataTable
from parakeet.models.transformer_tts import TransformerTTS
from parakeet.models.transformer_tts import TransformerTTSInference
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
from pathlib import Path
......@@ -21,6 +20,7 @@ import paddle
import soundfile as sf
import yaml
from yacs.config import CfgNode
from parakeet.frontend import English
from parakeet.models.transformer_tts import TransformerTTS
from parakeet.models.transformer_tts import TransformerTTSInference
......
......@@ -11,10 +11,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import logging
import os
import shutil
from pathlib import Path
......@@ -26,18 +25,19 @@ from paddle import DataParallel
from paddle import distributed as dist
from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler
from parakeet.datasets.data_table import DataTable
from visualdl import LogWriter
from yacs.config import CfgNode
from parakeet.datasets.am_batch_fn import transformer_single_spk_batch_fn
from parakeet.datasets.data_table import DataTable
from parakeet.models.transformer_tts import TransformerTTS
from parakeet.models.transformer_tts import TransformerTTSUpdater
from parakeet.models.transformer_tts import TransformerTTSEvaluator
from parakeet.models.transformer_tts import TransformerTTSUpdater
from parakeet.training.extensions.snapshot import Snapshot
from parakeet.training.extensions.visualizer import VisualDL
from parakeet.training.optimizer import build_optimizers
from parakeet.training.seeding import seed_everything
from parakeet.training.trainer import Trainer
from visualdl import LogWriter
from yacs.config import CfgNode
def train_sp(args, config):
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -11,16 +11,17 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pickle
from pathlib import Path
import numpy as np
from paddle.io import Dataset
from parakeet.frontend import Vocab
from parakeet.data import batch_text_id, batch_spec
from preprocess_transcription import _phones, _tones
from parakeet.data import batch_spec
from parakeet.data import batch_text_id
from parakeet.exps.voice_cloning.tacotron2_ge2e.preprocess_transcription import _phones
from parakeet.exps.voice_cloning.tacotron2_ge2e.preprocess_transcription import _tones
from parakeet.frontend import Vocab
voc_phones = Vocab(sorted(list(_phones)))
print("vocab_phones:\n", voc_phones)
......
......@@ -11,10 +11,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from typing import Tuple
from typing import List, Tuple
from pypinyin import lazy_pinyin, Style
from preprocess_transcription import split_syllable
from pypinyin import lazy_pinyin
from pypinyin import Style
from parakeet.exps.voice_cloning.tacotron2_ge2e.preprocess_transcription import split_syllable
def convert_to_pinyin(text: str) -> List[str]:
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from yacs.config import CfgNode as CN
_C = CN()
......
......@@ -11,19 +11,18 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import multiprocessing as mp
from functools import partial
from pathlib import Path
import numpy as np
from parakeet.audio import AudioProcessor
from parakeet.audio.spec_normalizer import NormalizerBase, LogMagnitude
import tqdm
from config import get_cfg_defaults
from parakeet.audio import AudioProcessor
from parakeet.audio.spec_normalizer import LogMagnitude
from parakeet.audio.spec_normalizer import NormalizerBase
from parakeet.exps.voice_cloning.tacotron2_ge2e.config import get_cfg_defaults
def extract_mel(fname: Path,
......@@ -47,7 +46,7 @@ def extract_mel_multispeaker(config, input_dir, output_dir, extension=".wav"):
output_dir.mkdir(parents=True, exist_ok=True)
p = AudioProcessor(config.sample_rate, config.n_fft, config.win_length,
config.hop_length, config.n_mels, config.fmin,
config.hop_length, config.d_mels, config.fmin,
config.fmax)
n = LogMagnitude(1e-5)
......
......@@ -11,14 +11,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from pathlib import Path
import re
import pickle
import re
from pathlib import Path
import yaml
import tqdm
import yaml
zh_pattern = re.compile("[\u4e00-\u9fa5]")
......
......@@ -11,17 +11,16 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from pathlib import Path
from multiprocessing import Pool
from functools import partial
from multiprocessing import Pool
from pathlib import Path
import numpy as np
import librosa
import numpy as np
import soundfile as sf
from tqdm import tqdm
from praatio import tgio
from tqdm import tqdm
def get_valid_part(fpath):
......
......@@ -11,26 +11,27 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
from pathlib import Path
from collections import defaultdict
from pathlib import Path
import numpy as np
from matplotlib import pyplot as plt
import paddle
from matplotlib import pyplot as plt
from paddle import distributed as dist
from paddle.io import DataLoader, DistributedBatchSampler
from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler
from parakeet.data import dataset
from parakeet.exps.voice_cloning.tacotron2_ge2e.aishell3 import AiShell3
from parakeet.exps.voice_cloning.tacotron2_ge2e.aishell3 import collate_aishell3_examples
from parakeet.exps.voice_cloning.tacotron2_ge2e.config import get_cfg_defaults
from parakeet.models.tacotron2 import Tacotron2
from parakeet.models.tacotron2 import Tacotron2Loss
from parakeet.training.cli import default_argument_parser
from parakeet.training.experiment import ExperimentBase
from parakeet.utils import display, mp_tools
from parakeet.models.tacotron2 import Tacotron2, Tacotron2Loss
from config import get_cfg_defaults
from aishell3 import AiShell3, collate_aishell3_examples
from parakeet.utils import display
from parakeet.utils import mp_tools
class Experiment(ExperimentBase):
......@@ -192,9 +193,9 @@ class Experiment(ExperimentBase):
def setup_dataloader(self):
args = self.args
config = self.config
ljspeech_dataset = AiShell3(args.data)
aishell3_dataset = AiShell3(args.data)
valid_set, train_set = dataset.split(ljspeech_dataset,
valid_set, train_set = dataset.split(aishell3_dataset,
config.data.valid_size)
batch_fn = collate_aishell3_examples
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
from pathlib import Path
import numpy as np
import paddle
import soundfile as sf
from matplotlib import pyplot as plt
from parakeet.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor
from parakeet.exps.voice_cloning.tacotron2_ge2e.aishell3 import voc_phones
from parakeet.exps.voice_cloning.tacotron2_ge2e.aishell3 import voc_tones
from parakeet.exps.voice_cloning.tacotron2_ge2e.chinese_g2p import convert_sentence
from parakeet.models.lstm_speaker_encoder import LSTMSpeakerEncoder
from parakeet.models.tacotron2 import Tacotron2
from parakeet.models.waveflow import ConditionalWaveFlow
from parakeet.utils import display
def voice_cloning(args):
# speaker encoder
p = SpeakerVerificationPreprocessor(
sampling_rate=16000,
audio_norm_target_dBFS=-30,
vad_window_length=30,
vad_moving_average_width=8,
vad_max_silence_length=6,
mel_window_length=25,
mel_window_step=10,
n_mels=40,
partial_n_frames=160,
min_pad_coverage=0.75,
partial_overlap_ratio=0.5)
print("Audio Processor Done!")
speaker_encoder = LSTMSpeakerEncoder(
n_mels=40, num_layers=3, hidden_size=256, output_size=256)
speaker_encoder.set_state_dict(paddle.load(args.ge2e_params_path))
speaker_encoder.eval()
print("GE2E Done!")
synthesizer = Tacotron2(
vocab_size=68,
n_tones=10,
d_mels=80,
d_encoder=512,
encoder_conv_layers=3,
encoder_kernel_size=5,
d_prenet=256,
d_attention_rnn=1024,
d_decoder_rnn=1024,
attention_filters=32,
attention_kernel_size=31,
d_attention=128,
d_postnet=512,
postnet_kernel_size=5,
postnet_conv_layers=5,
reduction_factor=1,
p_encoder_dropout=0.5,
p_prenet_dropout=0.5,
p_attention_dropout=0.1,
p_decoder_dropout=0.1,
p_postnet_dropout=0.5,
d_global_condition=256,
use_stop_token=False, )
synthesizer.set_state_dict(paddle.load(args.tacotron2_params_path))
synthesizer.eval()
print("Tacotron2 Done!")
# vocoder
vocoder = ConditionalWaveFlow(
upsample_factors=[16, 16],
n_flows=8,
n_layers=8,
n_group=16,
channels=128,
n_mels=80,
kernel_size=[3, 3])
vocoder.set_state_dict(paddle.load(args.waveflow_params_path))
vocoder.eval()
print("WaveFlow Done!")
output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
input_dir = Path(args.input_dir)
# The AISHELL-3 dataset uses % and $ to mark the boundaries of prosodic words and prosodic phrases, which roughly correspond to shorter and longer pauses, so % and $ can be used in the text to adjust the prosody.
# Note that the valid character set for a sentence contains only Chinese characters plus % and $, so the input sentence may contain only these characters.
sentence = "每当你觉得%想要批评什么人的时候$你切要记着%这个世界上的人%并非都具备你禀有的条件$"
phones, tones = convert_sentence(sentence)
phones = np.array(
[voc_phones.lookup(item) for item in phones], dtype=np.int64)
tones = np.array([voc_tones.lookup(item) for item in tones], dtype=np.int64)
phones = paddle.to_tensor(phones).unsqueeze(0)
tones = paddle.to_tensor(tones).unsqueeze(0)
for name in os.listdir(input_dir):
utt_id = name.split(".")[0]
ref_audio_path = input_dir / name
mel_sequences = p.extract_mel_partials(p.preprocess_wav(ref_audio_path))
print("mel_sequences: ", mel_sequences.shape)
with paddle.no_grad():
embed = speaker_encoder.embed_utterance(
paddle.to_tensor(mel_sequences))
print("embed shape: ", embed.shape)
utterance_embeds = paddle.unsqueeze(embed, 0)
outputs = synthesizer.infer(
phones, tones=tones, global_condition=utterance_embeds)
mel_input = paddle.transpose(outputs["mel_outputs_postnet"], [0, 2, 1])
alignment = outputs["alignments"][0].numpy().T
display.plot_alignment(alignment)
plt.savefig(str(output_dir / (utt_id + ".png")))
with paddle.no_grad():
wav = vocoder.infer(mel_input)
wav = wav.numpy()[0]
sf.write(str(output_dir / (utt_id + ".wav")), wav, samplerate=22050)
def main():
# parse args and config and redirect to train_sp
parser = argparse.ArgumentParser(description="")
parser.add_argument(
"--ge2e_params_path", type=str, help="ge2e params path.")
parser.add_argument(
"--tacotron2_params_path", type=str, help="tacotron2 params path.")
parser.add_argument(
"--waveflow_params_path", type=str, help="waveflow params path.")
parser.add_argument(
"--device", type=str, default="gpu", help="device type to use.")
parser.add_argument(
"--input-dir",
type=str,
help="input dir of *.wav, the sample rate will be resample to 16k.")
parser.add_argument("--output-dir", type=str, help="output dir.")
args = parser.parse_args()
paddle.set_device(args.device)
voice_cloning(args)
if __name__ == "__main__":
main()
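For reference, below is a minimal sketch of how this voice cloning entry point could be invoked once the pretrained checkpoints are available. The script path and every checkpoint/directory path are hypothetical placeholders inferred from the argparse definitions above; they are not files shipped in this PR.

# assumption: the script lives at parakeet/exps/voice_cloning/tacotron2_ge2e/voice_cloning.py
# assumption: the three *_params_path files are locally downloaded pretrained checkpoints
python3 parakeet/exps/voice_cloning/tacotron2_ge2e/voice_cloning.py \
    --ge2e_params_path=./pretrained/ge2e.pdparams \
    --tacotron2_params_path=./pretrained/tacotron2_aishell3.pdparams \
    --waveflow_params_path=./pretrained/waveflow_ljspeech.pdparams \
    --input-dir=./ref_wavs \
    --output-dir=./cloned_wavs \
    --device="gpu"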
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from yacs.config import CfgNode as CN
_C = CN()
......
......@@ -11,14 +11,14 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path
import numpy as np
import pandas
from paddle.io import Dataset
from parakeet.data.batch import batch_spec, batch_wav
from parakeet.data.batch import batch_spec
from parakeet.data.batch import batch_wav
class LJSpeech(Dataset):
......
......@@ -11,20 +11,18 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import argparse
import os
from pathlib import Path
import tqdm
import numpy as np
import librosa
import numpy as np
import pandas as pd
import tqdm
from parakeet.datasets import LJSpeechMetaData
from parakeet.audio import LogMagnitude
from config import get_cfg_defaults
from parakeet.datasets import LJSpeechMetaData
from parakeet.exps.waveflow.config import get_cfg_defaults
class Transform(object):
......
......@@ -11,20 +11,18 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import argparse
import os
from pathlib import Path
import numpy as np
import soundfile as sf
import paddle
import soundfile as sf
from parakeet.exps.waveflow.config import get_cfg_defaults
from parakeet.models.waveflow import ConditionalWaveFlow
from parakeet.utils import layer_tools
from config import get_cfg_defaults
def main(config, args):
paddle.set_device(args.device)
......
......@@ -11,22 +11,24 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import numpy as np
import paddle
from paddle import distributed as dist
from paddle.io import DataLoader, DistributedBatchSampler
from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler
from parakeet.data import dataset
from parakeet.models.waveflow import ConditionalWaveFlow, WaveFlowLoss
from parakeet.utils import mp_tools
from parakeet.exps.waveflow.config import get_cfg_defaults
from parakeet.exps.waveflow.ljspeech import LJSpeech
from parakeet.exps.waveflow.ljspeech import LJSpeechClipCollector
from parakeet.exps.waveflow.ljspeech import LJSpeechCollector
from parakeet.models.waveflow import ConditionalWaveFlow
from parakeet.models.waveflow import WaveFlowLoss
from parakeet.training.cli import default_argument_parser
from parakeet.training.experiment import ExperimentBase
from config import get_cfg_defaults
from ljspeech import LJSpeech, LJSpeechClipCollector, LJSpeechCollector
from parakeet.utils import mp_tools
class Experiment(ExperimentBase):
......
......@@ -11,11 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .zh_normalization import *
from .generate_lexicon import *
from .normalizer import *
from .phonectic import *
from .punctuation import *
from .tone_sandhi import *
from .vocab import *
from .zh_normalization import *
......@@ -11,6 +11,5 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from parakeet.frontend.normalizer.normalizer import *
from parakeet.frontend.normalizer.numbers import *
......@@ -17,9 +17,9 @@ from abc import abstractmethod
from g2p_en import G2p
from g2pM import G2pM
from parakeet.frontend.vocab import Vocab
from parakeet.frontend.normalizer.normalizer import normalize
from parakeet.frontend.punctuation import get_punctuations
from parakeet.frontend.vocab import Vocab
# discard opencc until we find an easy solution to install it on windows
# from opencc import OpenCC
......
......@@ -22,9 +22,9 @@ from g2pM import G2pM
from pypinyin import lazy_pinyin
from pypinyin import Style
from parakeet.frontend.zh_normalization.text_normlization import TextNormalizer
from parakeet.frontend.generate_lexicon import generate_lexicon
from parakeet.frontend.tone_sandhi import ToneSandhi
from parakeet.frontend.zh_normalization.text_normlization import TextNormalizer
class Frontend():
......
......@@ -11,5 +11,4 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from parakeet.frontend.zh_normalization.text_normlization import *
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .fastspeech2 import *
from .tacotron2 import *
from .transformer_tts import *
......
......@@ -11,6 +11,5 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .fastspeech2 import *
from .fastspeech2_updater import *
......@@ -28,10 +28,10 @@ from parakeet.modules.fastspeech2_predictor.variance_predictor import VariancePr
from parakeet.modules.fastspeech2_transformer.embedding import PositionalEncoding
from parakeet.modules.fastspeech2_transformer.embedding import ScaledPositionalEncoding
from parakeet.modules.fastspeech2_transformer.encoder import Encoder as TransformerEncoder
from parakeet.modules.tacotron2.decoder import Postnet
from parakeet.modules.nets_utils import initialize
from parakeet.modules.nets_utils import make_non_pad_mask
from parakeet.modules.nets_utils import make_pad_mask
from parakeet.modules.tacotron2.decoder import Postnet
class FastSpeech2(nn.Layer):
......
......@@ -14,6 +14,7 @@
import logging
from paddle import distributed as dist
from parakeet.models.fastspeech2 import FastSpeech2Loss
from parakeet.training.extensions.evaluator import StandardEvaluator
from parakeet.training.reporter import report
......
......@@ -106,10 +106,10 @@ class LSTMSpeakerEncoder(nn.Layer):
def do_gradient_ops(self):
for p in [self.similarity_weight, self.similarity_bias]:
g = p._grad_ivar()
g[...] = g * 0.01
g = g * 0.01
def inv_argmax(self, i, num):
return np.eye(1, num, i, dtype=np.int)[0]
return np.eye(1, num, i, dtype=int)[0]
def loss(self, embeds):
"""
......
......@@ -11,6 +11,5 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .parallel_wavegan import *
from .parallel_wavegan_updater import *
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from typing import Dict
......@@ -21,11 +20,12 @@ from paddle.io import DataLoader
from paddle.nn import Layer
from paddle.optimizer import Optimizer
from paddle.optimizer.lr import LRScheduler
from timer import timer
from parakeet.training.extensions.evaluator import StandardEvaluator
from parakeet.training.reporter import report
from parakeet.training.updaters.standard_updater import StandardUpdater
from parakeet.training.updaters.standard_updater import UpdaterState
from timer import timer
logging.basicConfig(
format='%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s',
datefmt='[%Y-%m-%d %H:%M:%S]')
......
......@@ -11,6 +11,5 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .speedyspeech import *
from .speedyspeech_updater import *
......@@ -17,6 +17,7 @@ import paddle
from paddle import distributed as dist
from paddle.fluid.layers import huber_loss
from paddle.nn import functional as F
from parakeet.modules.losses import masked_l1_loss
from parakeet.modules.losses import weighted_mean
from parakeet.modules.ssim import ssim
......
......@@ -11,6 +11,5 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .transformer_tts import *
from .transformer_tts_updater import *
......@@ -15,10 +15,11 @@
from typing import Dict
from typing import Sequence
from typing import Tuple
import numpy
import paddle
from paddle import nn
import paddle.nn.functional as F
from paddle import nn
from typeguard import check_argument_types
from parakeet.modules.fastspeech2_transformer.attention import MultiHeadedAttention
......@@ -27,13 +28,13 @@ from parakeet.modules.fastspeech2_transformer.embedding import PositionalEncodin
from parakeet.modules.fastspeech2_transformer.embedding import ScaledPositionalEncoding
from parakeet.modules.fastspeech2_transformer.encoder import Encoder
from parakeet.modules.fastspeech2_transformer.mask import subsequent_mask
from parakeet.modules.nets_utils import initialize
from parakeet.modules.nets_utils import make_non_pad_mask
from parakeet.modules.nets_utils import make_pad_mask
from parakeet.modules.style_encoder import StyleEncoder
from parakeet.modules.tacotron2.decoder import Postnet
from parakeet.modules.tacotron2.decoder import Prenet as DecoderPrenet
from parakeet.modules.tacotron2.encoder import Encoder as EncoderPrenet
from parakeet.modules.nets_utils import initialize
from parakeet.modules.nets_utils import make_non_pad_mask
from parakeet.modules.nets_utils import make_pad_mask
class TransformerTTS(nn.Layer):
......
......@@ -16,6 +16,7 @@ from typing import Sequence
import paddle
from paddle import distributed as dist
from parakeet.models.transformer_tts import GuidedMultiHeadAttentionLoss
from parakeet.models.transformer_tts import TransformerTTSLoss
from parakeet.training.extensions.evaluator import StandardEvaluator
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .attention import *
from .conv import *
from .geometry import *
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# dynamic conv has been removed for now
"""Decoder definition."""
import logging
......
......@@ -12,9 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Decoder self-attention layer definition."""
import paddle
from paddle import nn
from parakeet.modules.layer_norm import LayerNorm
......
......@@ -12,11 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Lightweight Convolution Module."""
import numpy
import paddle
from paddle import nn
import paddle.nn.functional as F
from paddle import nn
from parakeet.modules.glu import GLU
from parakeet.modules.masked_fill import masked_fill
......
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Mask module."""
import paddle
......
......@@ -12,12 +12,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Style encoder of GST-Tacotron."""
from typeguard import check_argument_types
from typing import Sequence
import paddle
from paddle import nn
from typeguard import check_argument_types
from parakeet.modules.fastspeech2_transformer.attention import MultiHeadedAttention as BaseMultiHeadedAttention
......
......@@ -12,9 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tacotron2 decoder related modules."""
import six
import paddle.nn.functional as F
import six
from paddle import nn
......
......@@ -12,10 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tacotron2 encoder related modules."""
import six
import paddle
import six
from paddle import nn
......
......@@ -12,9 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle import nn
from parakeet.modules import attention as attn
from paddle.nn import functional as F
from parakeet.modules import attention as attn
__all__ = [
"PositionwiseFFN",
"TransformerEncoderLayer",
......
......@@ -11,6 +11,5 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .cli import *
from .experiment import *
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
optim_classes = dict(
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import checkpoint
from . import display
from . import layer_tools
......
......@@ -11,8 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import paddle
# A global variable to record the number of calling times for profiler
......
......@@ -15,7 +15,6 @@ import contextlib
import inspect
import io
import os
import re
import subprocess as sp
import sys
from pathlib import Path
......@@ -84,7 +83,7 @@ def _post_install(install_lib_dir):
tools_extrs_dir = HERE / 'tools/extras'
with pushd(tools_extrs_dir):
print(os.getcwd())
check_call(f"./install_autolog.sh")
check_call("./install_autolog.sh")
print("autolog install.")
# ctcdecoder
......
......@@ -4,8 +4,8 @@
```
and it will run.
Execution logic:
1. cd to ../../../ (i.e., the Parakeet directory)
1. cd to ../../../ (i.e., the Deepspeech directory)
2. Install the dependencies required by parakeet
3. Download the dataset from bos and extract it
4. Preprocess the dataset into the format required for training pwg and save it under the Parakeet/dump folder
4. Preprocess the dataset into the format required for training pwg and save it under the Deepspeech/dump folder
5. Run the run_benchmark.sh script with different sets of parameters
......@@ -10,6 +10,9 @@ cd ../../../
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
sudo apt-get install libsndfile1
pip install -e .
pushd examples/csmsc/voc1
source path.sh
popd
fi
# 2 Copy the data and pretrained models required by this model
# Download the baker dataset into the home directory and extract it there
......@@ -22,15 +25,14 @@ fi
# Data preprocessing
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
python examples/GANVocoder/preprocess.py --rootdir=BZNSYP/ --dumpdir=dump --num-cpu=20 --cut-sil=True --dur-file=durations.txt --config=examples/GANVocoder/parallelwave_gan/baker/conf/default.yaml
python utils/compute_statistics.py --metadata=dump/train/raw/metadata.jsonl --field-name="feats"
python examples/GANVocoder/normalize.py --metadata=dump/train/raw/metadata.jsonl --dumpdir=dump/train/norm --stats=dump/train/feats_stats.npy
python examples/GANVocoder/normalize.py --metadata=dump/dev/raw/metadata.jsonl --dumpdir=dump/dev/norm --stats=dump/train/feats_stats.npy
python examples/GANVocoder/normalize.py --metadata=dump/test/raw/metadata.jsonl --dumpdir=dump/test/norm --stats=dump/train/feats_stats.npy
python3 parakeet/exps/gan_vocoder/preprocess.py --rootdir=BZNSYP/ --dumpdir=dump --num-cpu=20 --cut-sil=True --dur-file=durations.txt --config=examples/csmsc/voc1/conf/default.yaml
python3 utils/compute_statistics.py --metadata=dump/train/raw/metadata.jsonl --field-name="feats"
python3 parakeet/exps/gan_vocoder/normalize.py --metadata=dump/train/raw/metadata.jsonl --dumpdir=dump/train/norm --stats=dump/train/feats_stats.npy
python3 parakeet/exps/gan_vocoder/normalize.py --metadata=dump/dev/raw/metadata.jsonl --dumpdir=dump/dev/norm --stats=dump/train/feats_stats.npy
python3 parakeet/exps/gan_vocoder/normalize.py --metadata=dump/test/raw/metadata.jsonl --dumpdir=dump/test/norm --stats=dump/train/feats_stats.npy
fi
# 3 Run all models in batch (if batching is inconvenient, steps 1 and 2 need to go into each individual model)
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
model_mode_list=(pwg)
fp_item_list=(fp32)
# the maximum batch size is 26
......@@ -40,7 +42,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
for bs_item in ${bs_item_list[@]}; do
echo "index is speed, 1gpus, begin, ${model_name}"
run_mode=sp
CUDA_VISIBLE_DEVICES=0 bash tests/benchmark/PWGAN/run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 100 ${model_mode} # (5min)
CUDA_VISIBLE_DEVICES=0 bash tests/benchmark/pwgan/run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 100 ${model_mode} # (5min)
sleep 60
echo "index is speed, 8gpus, run_mode is multi_process, begin, ${model_name}"
run_mode=mp
......
......@@ -24,13 +24,13 @@ function _train(){
--max-iter=${max_iter}
--train-metadata=dump/train/norm/metadata.jsonl \
--dev-metadata=dump/dev/norm/metadata.jsonl \
--config=examples/GANVocoder/parallelwave_gan/baker/conf/default.yaml \
--config=examples/csmsc/voc1/conf/default.yaml \
--output-dir=exp/default \
--run-benchmark=true"
case ${run_mode} in
sp) train_cmd="python3 examples/GANVocoder/parallelwave_gan/train.py --nprocs=1 ${train_cmd}" ;;
mp) train_cmd="python3 examples/GANVocoder/parallelwave_gan/train.py --nprocs=8 ${train_cmd}"
sp) train_cmd="python3 parakeet/exps/gan_vocoder/parallelwave_gan/train.py --nprocs=1 ${train_cmd}" ;;
mp) train_cmd="python3 parakeet/exps/gan_vocoder/parallelwave_gan/train.py --nprocs=8 ${train_cmd}"
log_parse_file="mylog/workerlog.0" ;;
*) echo "choose run_mode(sp or mp)"; exit 1;
esac
......
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
......@@ -13,7 +13,7 @@ null:null
null:null
##
trainer:norm_train
norm_train:../../examples/speedyspeech/train.py --train-metadata=train_data/mini_BZNSYP/train/norm/metadata.jsonl --dev-metadata=train_data/mini_BZNSYP/dev/norm/metadata.jsonl --config=../../examples/speedyspeech/baker/conf/default.yaml --batch_size=32 --max_epoch=20 --num_snapshots=10 --output-dir=exp/default --phones-dict=train_data/mini_BZNSYP/phone_id_map.txt --tones-dict=train_data/mini_BZNSYP/tone_id_map.txt --use-relative-path=True
norm_train:../../../parakeet/exps/speedyspeech/train.py --train-metadata=train_data/mini_BZNSYP/train/norm/metadata.jsonl --dev-metadata=train_data/mini_BZNSYP/dev/norm/metadata.jsonl --config=../../../examples/csmsc/tts2/conf/default.yaml --batch_size=32 --max_epoch=20 --num_snapshots=10 --output-dir=exp/default --phones-dict=train_data/mini_BZNSYP/phone_id_map.txt --tones-dict=train_data/mini_BZNSYP/tone_id_map.txt --use-relative-path=True
null:null
null:null
null:null
......@@ -21,7 +21,7 @@ null:null
null:null
##
===========================eval_params===========================
eval:../../examples/speedyspeech/baker/synthesize_e2e.py --speedyspeech-config=../../examples/speedyspeech/baker/conf/default.yaml --speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_20.pdz --speedyspeech-stat=train_data/mini_BZNSYP/train/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../../examples/speedyspeech/sentences.txt --output-dir=e2e --inference-dir=inference --device="gpu" --phones-dict=train_data/mini_BZNSYP/phone_id_map.txt --tones-dict=train_data/mini_BZNSYP/tone_id_map.txt
eval:../../../parakeet/exps/speedyspeech/synthesize_e2e.py --speedyspeech-config=../../../examples/csmsc/tts2/conf/default.yaml --speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_20.pdz --speedyspeech-stat=train_data/mini_BZNSYP/train/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../../../parakeet/exps/sentences.txt --output-dir=e2e --inference-dir=inference --device="gpu" --phones-dict=train_data/mini_BZNSYP/phone_id_map.txt --tones-dict=train_data/mini_BZNSYP/tone_id_map.txt
null:null
##
===========================infer_params===========================
......@@ -37,7 +37,7 @@ null:null
null:null
null:null
null:null
inference:../../examples/speedyspeech/baker/inference.py --inference-dir=pretrain_models/speedyspeech_pwg_inference_0.5 --text=../../examples/speedyspeech/sentences.txt --output-dir=inference_out --enable-auto-log --phones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/tone_id_map.txt
inference:../../../parakeet/exps/speedyspeech/inference.py --inference-dir=pretrain_models/speedyspeech_pwg_inference_0.5 --text=../../../parakeet/exps/sentences.txt --output-dir=inference_out --enable-auto-log --phones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/tone_id_map.txt
null:null
null:null
null:null
......
......@@ -13,7 +13,7 @@ null:null
null:null
##
trainer:norm_train
norm_train:../../examples/speedyspeech/train.py --train-metadata=train_data/mini_BZNSYP/train/norm/metadata.jsonl --dev-metadata=train_data/mini_BZNSYP/dev/norm/metadata.jsonl --config=../../examples/speedyspeech/baker/conf/default.yaml --batch_size=32 --max_epoch=10 --num_snapshots=10 --output-dir=exp/default --phones-dict=train_data/mini_BZNSYP/phone_id_map.txt --tones-dict=train_data/mini_BZNSYP/tone_id_map.txt --use-relative-path=True
norm_train:../../../parakeet/exps/speedyspeech/train.py --train-metadata=train_data/mini_BZNSYP/train/norm/metadata.jsonl --dev-metadata=train_data/mini_BZNSYP/dev/norm/metadata.jsonl --config=../../../examples/csmsc/tts2/conf/default.yaml --batch_size=32 --max_epoch=10 --num_snapshots=10 --output-dir=exp/default --phones-dict=train_data/mini_BZNSYP/phone_id_map.txt --tones-dict=train_data/mini_BZNSYP/tone_id_map.txt --use-relative-path=True
null:null
null:null
null:null
......@@ -21,7 +21,7 @@ null:null
null:null
##
===========================eval_params===========================
eval:../../examples/speedyspeech/baker/synthesize_e2e.py --speedyspeech-config=../../examples/speedyspeech/baker/conf/default.yaml --speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_30.pdz --speedyspeech-stat=train_data/mini_BZNSYP/train/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../../examples/speedyspeech/sentences.txt --output-dir=e2e --inference-dir=inference --device="gpu" --phones-dict=train_data/mini_BZNSYP/phone_id_map.txt --tones-dict=train_data/mini_BZNSYP/tone_id_map.txt
eval:../../../parakeet/exps/speedyspeech/synthesize_e2e.py --speedyspeech-config=../../../examples/csmsc/tts2/conf/default.yaml --speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_30.pdz --speedyspeech-stat=train_data/mini_BZNSYP/train/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../../../parakeet/exps/sentences.txt --output-dir=e2e --inference-dir=inference --device="gpu" --phones-dict=train_data/mini_BZNSYP/phone_id_map.txt --tones-dict=train_data/mini_BZNSYP/tone_id_map.txt
null:null
##
===========================infer_params===========================
......@@ -37,7 +37,7 @@ null:null
null:null
null:null
null:null
inference:../../examples/speedyspeech/baker/inference.py --inference-dir=pretrain_models/speedyspeech_pwg_inference_0.5 --text=../../examples/speedyspeech/sentences.txt --output-dir=inference_out --enable-auto-log --phones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/tone_id_map.txt
inference:../../../parakeet/exps/speedyspeech/inference.py --inference-dir=pretrain_models/speedyspeech_pwg_inference_0.5 --text=../../../parakeet/exps/sentences.txt --output-dir=inference_out --enable-auto-log --phones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/tone_id_map.txt
--use_gpu:True
null:null
null:null
......
......@@ -13,7 +13,7 @@ null:null
null:null
##
trainer:norm_train
norm_train:../../examples/speedyspeech/train.py --train-metadata=train_data/processed_BZNSYP/train/norm/metadata.jsonl --dev-metadata=train_data/processed_BZNSYP/dev/norm/metadata.jsonl --config=../../examples/speedyspeech/baker/conf/default.yaml --output-dir=exp/whole --phones-dict=train_data/processed_BZNSYP/phone_id_map.txt --tones-dict=train_data/processed_BZNSYP/tone_id_map.txt --use-relative-path=True
norm_train:../../../parakeet/exps/speedyspeech/train.py --train-metadata=train_data/processed_BZNSYP/train/norm/metadata.jsonl --dev-metadata=train_data/processed_BZNSYP/dev/norm/metadata.jsonl --config=../../../examples/csmsc/tts2/conf/default.yaml --output-dir=exp/whole --phones-dict=train_data/processed_BZNSYP/phone_id_map.txt --tones-dict=train_data/processed_BZNSYP/tone_id_map.txt --use-relative-path=True
null:null
null:null
null:null
......@@ -21,7 +21,7 @@ null:null
null:null
##
===========================eval_params===========================
eval:../../examples/speedyspeech/baker/synthesize_e2e.py --speedyspeech-config=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/default.yaml --speedyspeech-checkpoint=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/snapshot_iter_11400.pdz --speedyspeech-stat=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../../examples/speedyspeech/sentences.txt --output-dir=e2e --inference-dir=inference --device="gpu" --phones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt
eval:../../../parakeet/exps/speedyspeech/synthesize_e2e.py --speedyspeech-config=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/default.yaml --speedyspeech-checkpoint=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/snapshot_iter_11400.pdz --speedyspeech-stat=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../../../parakeet/exps/sentences.txt --output-dir=e2e --inference-dir=inference --device="gpu" --phones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt
null:null
##
===========================infer_params===========================
......@@ -37,7 +37,7 @@ null:null
null:null
null:null
null:null
inference:../../examples/speedyspeech/baker/inference.py --inference-dir=pretrain_models/speedyspeech_pwg_inference_0.5 --text=../../examples/speedyspeech/sentences.txt --output-dir=inference_out --enable-auto-log --phones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/tone_id_map.txt
inference:../../../parakeet/exps/speedyspeech/inference.py --inference-dir=pretrain_models/speedyspeech_pwg_inference_0.5 --text=../../../parakeet/exps/sentences.txt --output-dir=inference_out --enable-auto-log --phones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/tone_id_map.txt
null:null
null:null
null:null
......
......@@ -13,7 +13,7 @@ null:null
null:null
##
trainer:norm_train
norm_train:../../examples/speedyspeech/train.py --train-metadata=train_data/processed_BZNSYP/train/norm/metadata.jsonl --dev-metadata=train_data/processed_BZNSYP/dev/norm/metadata.jsonl --config=../../examples/speedyspeech/baker/conf/default.yaml --output-dir=exp/whole --phones-dict=train_data/processed_BZNSYP/phone_id_map.txt --tones-dict=train_data/processed_BZNSYP/tone_id_map.txt --use-relative-path=True
norm_train:../../../parakeet/exps/speedyspeech/train.py --train-metadata=train_data/processed_BZNSYP/train/norm/metadata.jsonl --dev-metadata=train_data/processed_BZNSYP/dev/norm/metadata.jsonl --config=../../../examples/csmsc/tts2/conf/default.yaml --output-dir=exp/whole --phones-dict=train_data/processed_BZNSYP/phone_id_map.txt --tones-dict=train_data/processed_BZNSYP/tone_id_map.txt --use-relative-path=True
null:null
null:null
null:null
......@@ -21,7 +21,7 @@ null:null
null:null
##
===========================eval_params===========================
eval:../../examples/speedyspeech/baker/synthesize_e2e.py --speedyspeech-config=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/default.yaml --speedyspeech-checkpoint=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/snapshot_iter_11400.pdz --speedyspeech-stat=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../../examples/speedyspeech/sentences.txt --output-dir=e2e --inference-dir=inference --device="gpu" --phones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt
eval:../../../parakeet/exps/speedyspeech/synthesize_e2e.py --speedyspeech-config=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/default.yaml --speedyspeech-checkpoint=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/snapshot_iter_11400.pdz --speedyspeech-stat=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../../../parakeet/exps/sentences.txt --output-dir=e2e --inference-dir=inference --device="gpu" --phones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt
null:null
##
===========================infer_params===========================
......@@ -37,7 +37,7 @@ null:null
null:null
null:null
null:null
inference:../../examples/speedyspeech/baker/inference.py --inference-dir=pretrain_models/speedyspeech_pwg_inference_0.5 --text=../../examples/speedyspeech/sentences.txt --output-dir=inference_out --enable-auto-log --phones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/tone_id_map.txt
inference:../../../parakeet/exps/speedyspeech/inference.py --inference-dir=pretrain_models/speedyspeech_pwg_inference_0.5 --text=../../../parakeet/exps/sentences.txt --output-dir=inference_out --enable-auto-log --phones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/tone_id_map.txt
null:null
null:null
null:null
......
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
......@@ -4,7 +4,6 @@
# 2018 Xuankai Chang (Shanghai Jiao Tong University)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
import argparse
import json
import logging
import sys
......